Index: conf/parse-plugins.xml =================================================================== --- conf/parse-plugins.xml (revision 414852) +++ conf/parse-plugins.xml (working copy) @@ -36,8 +36,7 @@ - - + Index: src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java =================================================================== --- src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (revision 414852) +++ src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (working copy) @@ -154,6 +154,7 @@ } } } + System.out.println("OPTIMIZE: " + query.toString()); if (sortField == null && !reverse) { // no hit limit Index: src/java/org/apache/nutch/searcher/NutchBean.java =================================================================== --- src/java/org/apache/nutch/searcher/NutchBean.java (revision 414852) +++ src/java/org/apache/nutch/searcher/NutchBean.java (working copy) @@ -134,9 +134,11 @@ } LOG.info("opening segments in " + segmentsDir); - FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.conf); - + FetchedSegments segments = + new FetchedSegments(this.fs, segmentsDir.toString(), this.conf); + System.out.println("SEGMENTS " + segments); this.segmentNames = segments.getSegmentNames(); + System.out.println("SEGMENTS " + this.segmentNames); this.searcher = indexSearcher; this.detailer = indexSearcher; Index: src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- src/java/org/apache/nutch/searcher/IndexSearcher.java (revision 414852) +++ src/java/org/apache/nutch/searcher/IndexSearcher.java (working copy) @@ -57,6 +57,7 @@ this.conf = conf; this.fs = FileSystem.get(conf); for (int i = 0; i < indexDirs.length; i++) { + System.out.println("READER " + indexDirs[i]); readers[i] = IndexReader.open(getDirectory(indexDirs[i])); } init(new MultiReader(readers), conf); @@ -92,6 +93,7 @@ throws IOException { org.apache.lucene.search.BooleanQuery 
luceneQuery = this.queryFilters.filter(query); + System.out.println("QUERY: " + luceneQuery.toString()); return translateHits (optimizer.optimize(luceneQuery, luceneSearcher, numHits, sortField, reverse),
Index: src/java/org/apache/nutch/searcher/OpenSearchServlet.java
===================================================================
--- src/java/org/apache/nutch/searcher/OpenSearchServlet.java (revision 414852)
+++ src/java/org/apache/nutch/searcher/OpenSearchServlet.java (working copy)
@@ -262,23 +262,86 @@
   private static void addNode(Document doc, Node parent,
                               String name, String text) {
     Element child = doc.createElement(name);
-    child.appendChild(doc.createTextNode(text));
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
     parent.appendChild(child);
   }
 
   private static void addNode(Document doc, Node parent,
                               String ns, String name, String text) {
     Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
-    child.appendChild(doc.createTextNode(text));
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
     parent.appendChild(child);
   }
 
   private static void addAttribute(Document doc, Element node,
                                    String name, String value) {
     Attr attribute = doc.createAttribute(name);
-    attribute.setValue(value);
+    attribute.setValue(getLegalXml(value));
     node.getAttributes().setNamedItem(attribute);
   }
 
+  /**
+   * Returns the given text with every character that is not a legal XML 1.0
+   * character removed.
+   *
+   * <p>Well-formed surrogate pairs are kept, because they encode code points
+   * U+10000..U+10FFFF, all of which the XML Char production allows. Unpaired
+   * surrogates are removed.
+   *
+   * @param text string to sanitize; may be null
+   * @return null if text is null; text itself if it contains no illegal
+   *         characters; otherwise a new string without them
+   * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">XML 1.0 Char production</a>
+   */
+  private static String getLegalXml(final String text) {
+    if (text == null) {
+      return null;
+    }
+    final int length = text.length();
+    // Allocated lazily on the first illegal character, so clean input is
+    // returned without copying.
+    StringBuffer buffer = null;
+    for (int i = 0; i < length; i++) {
+      final char c = text.charAt(i);
+      final boolean pair = c >= 0xd800 && c <= 0xdbff && i + 1 < length
+          && text.charAt(i + 1) >= 0xdc00 && text.charAt(i + 1) <= 0xdfff;
+      if (pair || isLegalXml(c)) {
+        if (buffer != null) {
+          buffer.append(c);
+          if (pair) {
+            buffer.append(text.charAt(i + 1));
+          }
+        }
+        if (pair) {
+          i++; // the low surrogate was consumed together with the high one
+        }
+      } else if (buffer == null) {
+        // First illegal character: keep everything seen so far, skip this one.
+        buffer = new StringBuffer(length);
+        buffer.append(text.substring(0, i));
+      }
+    }
+    return (buffer != null) ? buffer.toString() : text;
+  }
+
+  /**
+   * Tells whether a single UTF-16 code unit is, on its own, a legal XML 1.0
+   * character. Surrogate code units (0xd800..0xdfff) are reported as illegal
+   * here; getLegalXml accepts them only as part of a well-formed pair.
+   */
+  private static boolean isLegalXml(final char c) {
+    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
+        || (c >= 0xe000 && c <= 0xfffd);
+  }
+
+  public static void main(final String [] args) {
+    // Manual check that getLegalXml strips illegal characters.
+    System.out.println(getLegalXml("hello"));
+    System.out.println(getLegalXml("he\u0000llo"));
+    System.out.println(getLegalXml("\u0000he\u0000llo"));
+    System.out.println(getLegalXml("\u0000he\u0000llo\u0000"));
+    // A surrogate pair must survive and a trailing lone surrogate must be
+    // dropped, so this prints 7.
+    System.out.println(getLegalXml("he\ud835\udd38llo\ud835").length());
+  }
 }
-
Index: src/java/org/apache/nutch/searcher/QueryFilters.java =================================================================== --- src/java/org/apache/nutch/searcher/QueryFilters.java (revision 414852) +++ src/java/org/apache/nutch/searcher/QueryFilters.java (working copy) @@ -103,6 +103,7 @@ // then run each plugin BooleanQuery output = new BooleanQuery(); for (int i = 0; i < this.queryFilters.length; i++) { + System.out.println("FILTER: " + this.queryFilters[i]); output = this.queryFilters[i].filter(input, output); } return output; Index: src/java/org/apache/nutch/parse/ParserFactory.java =================================================================== --- src/java/org/apache/nutch/parse/ParserFactory.java (revision 414852) +++ src/java/org/apache/nutch/parse/ParserFactory.java (working copy) @@ -357,6 +357,9 @@ } private boolean match(Extension extension, String id, String type) { + System.out.println("EXT " + extension.getId() + " " + + extension.getAttribute("contentType")); + return ((id.equals(extension.getId())) && (type.equals(extension.getAttribute("contentType")) || type.equals(DEFAULT_PLUGIN))); @@ -364,7 +367,9 @@ /** Get an extension from its id and supported content-type.
*/ private Extension getExtension(Extension[] list, String id, String type) { + System.out.println("ID " + id + " TYPE " + type); for (int i=0; i