Short-circuit support for the prefix URL filter.

A line in the prefix file that begins with the token SHORTCIRCUIT: is stored
(token removed) in a second trie.  When a URL matches that trie,
PrefixURLFilter.filter returns the magic cookie "_PASS_", and
URLFilters.filter then returns the URL at once without running the
remaining filters.  The token is detected with String.startsWith so that
prefix lines shorter than the token itself are handled as regular prefixes.

*** nutch.bak/src/java/org/apache/nutch/net/URLFilters.java	Thu Oct 12 06:48:42 2006
--- nutch/src/java/org/apache/nutch/net/URLFilters.java	Sat Nov 25 13:22:58 2006
***************
*** 76,85 ****
    /** Run all defined filters. Assume logical AND. */
    public String filter(String urlString) throws URLFilterException {
      for (int i = 0; i < this.filters.length; i++) {
!       if (urlString == null)
          return null;
!       urlString = this.filters[i].filter(urlString);
  
      }
      return urlString;
    }
--- 76,94 ----
    /** Run all defined filters. Assume logical AND. */
    public String filter(String urlString) throws URLFilterException {
+     String tmp;
      for (int i = 0; i < this.filters.length; i++) {
!       tmp = this.filters[i].filter(urlString);
!       if (tmp == null) {
          return null;
!       }
!       else if (tmp.equals("_PASS_")) {
!         // "_PASS_" is a magic cookie that short-circuits the remaining tests.
!         return urlString;
!       }
!       else {
!         urlString = tmp;
!       }
  
      }
      return urlString;
    }
*** nutch.bak/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java	Thu Oct 12 06:48:42 2006
--- nutch/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java	Sat Nov 25 13:23:18 2006
***************
*** 14,20 ****
   * limitations under the License.
   */
  
! // $Id: PrefixURLFilter.java,v 1.2 2005/02/07 19:10:37 cutting Exp $
  
  package org.apache.nutch.urlfilter.prefix;
  
--- 14,20 ----
   * limitations under the License.
   */
  
! //$Id: PrefixURLFilter.java,v 1.2 2005/02/07 19:10:37 cutting Exp $
  
  package org.apache.nutch.urlfilter.prefix;
  
***************
*** 53,60 ****
  
    // read in attribute "file" of this plugin.
    private static String attributeFile = null;
  
!   private TrieStringMatcher trie;
  
    private Configuration conf;
  
--- 53,62 ----
  
    // read in attribute "file" of this plugin.
    private static String attributeFile = null;
+   private static final String shortCircuitPrefix = "SHORTCIRCUIT:";
  
!   private TrieStringMatcher trie; // "regular" trie
!   private TrieStringMatcher scTrie = null; // trie for short-circuited matches
  
    private Configuration conf;
  
***************
*** 63,83 ****
    }
  
    public PrefixURLFilter(String filename) throws IOException {
!     trie = readConfigurationFile(new FileReader(filename));
    }
  
    public String filter(String url) {
!     if (trie.shortestMatch(url) == null)
!       return null;
!     else
!       return url;
    }
  
!   private TrieStringMatcher readConfigurationFile(Reader reader)
      throws IOException {
  
      BufferedReader in=new BufferedReader(reader);
      List urlprefixes = new ArrayList();
      String line;
  
      while((line=in.readLine())!=null) {
--- 65,86 ----
    }
  
    public PrefixURLFilter(String filename) throws IOException {
!     readConfigurationFile(new FileReader(filename));
    }
  
    public String filter(String url) {
!     if (scTrie != null && scTrie.shortestMatch(url) != null) {
!       return "_PASS_"; // short-circuit match
!     }
!     return (trie.shortestMatch(url) == null) ? null : url;
    }
  
!   private void readConfigurationFile(Reader reader)
      throws IOException {
  
      BufferedReader in=new BufferedReader(reader);
      List urlprefixes = new ArrayList();
+     List scurlprefixes = new ArrayList();
      String line;
  
      while((line=in.readLine())!=null) {
***************
*** 89,99 ****
        case ' ' : case '\n' : case '#' : // skip blank & comment lines
          continue;
        default :
          urlprefixes.add(line);
        }
      }
  
!     return new PrefixStringMatcher(urlprefixes);
    }
  
    public static void main(String args[])
--- 92,116 ----
        case ' ' : case '\n' : case '#' : // skip blank & comment lines
          continue;
        default :
+         if (line.startsWith(shortCircuitPrefix)) {
+           // Beginning of line matches SHORTCIRCUIT: token. Make this a short circuit prefix
+           line = line.substring(shortCircuitPrefix.length());
+           if (LOG.isInfoEnabled()) {
+             LOG.info("adding short circuit prefix " + line);
+           }
+           scurlprefixes.add(line);
+         }
+         else {
+           if (LOG.isInfoEnabled()) {
+             LOG.info("adding regular prefix " + line);
+           }
          urlprefixes.add(line);
        }
      }
+     }
  
!     trie = new PrefixStringMatcher(urlprefixes);
!     scTrie = new PrefixStringMatcher(scurlprefixes);
    }
  
    public static void main(String args[])
***************
*** 130,164 ****
      }
      if (attributeFile != null && attributeFile.trim().equals(""))
        attributeFile = null;
!     if (attributeFile != null) {
        if (LOG.isInfoEnabled()) {
          LOG.info("Attribute \"file\" is defined for plugin " + pluginName
            + " as " + attributeFile);
        }
-     } else {
-       // if (LOG.isWarnEnabled()) {
-       // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
-       // plugin "+pluginName);
-       // }
      }
  
      String file = conf.get("urlfilter.prefix.file");
      // attribute "file" takes precedence if defined
      if (attributeFile != null)
        file = attributeFile;
      Reader reader = conf.getConfResourceAsReader(file);
  
      if (reader == null) {
        trie = new PrefixStringMatcher(new String[0]);
      } else {
        try {
!         trie = readConfigurationFile(reader);
        } catch (IOException e) {
          if (LOG.isFatalEnabled()) { LOG.fatal(e.getMessage()); }
          // TODO mb@media-style.com: throw Exception? Because broken api.
          throw new RuntimeException(e.getMessage(), e);
        }
      }
    }
  
    public Configuration getConf() {
--- 147,180 ----
      }
      if (attributeFile != null && attributeFile.trim().equals(""))
        attributeFile = null;
!     if (LOG.isInfoEnabled()) {
+       if (attributeFile != null) {
          LOG.info("Attribute \"file\" is defined for plugin " + pluginName
            + " as " + attributeFile);
        }
      }
  
      String file = conf.get("urlfilter.prefix.file");
      // attribute "file" takes precedence if defined
      if (attributeFile != null)
        file = attributeFile;
+ 
      Reader reader = conf.getConfResourceAsReader(file);
  
      if (reader == null) {
        trie = new PrefixStringMatcher(new String[0]);
      } else {
        try {
!         readConfigurationFile(reader);
        } catch (IOException e) {
          if (LOG.isFatalEnabled()) { LOG.fatal(e.getMessage()); }
          // TODO mb@media-style.com: throw Exception? Because broken api.
          throw new RuntimeException(e.getMessage(), e);
        }
      }
+ 
+ 
    }
  
    public Configuration getConf() {