Index: conf/parse-plugins.xml
===================================================================
--- conf/parse-plugins.xml (revision 414852)
+++ conf/parse-plugins.xml (working copy)
@@ -36,8 +36,7 @@
-
-
+
Index: src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
===================================================================
--- src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (revision 414852)
+++ src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (working copy)
@@ -154,6 +154,7 @@
}
}
}
+ System.out.println("OPTIMIZE: " + query.toString());
if (sortField == null && !reverse) {
// no hit limit
Index: src/java/org/apache/nutch/searcher/NutchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/NutchBean.java (revision 414852)
+++ src/java/org/apache/nutch/searcher/NutchBean.java (working copy)
@@ -134,9 +134,11 @@
}
LOG.info("opening segments in " + segmentsDir);
- FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.conf);
-
+ FetchedSegments segments =
+ new FetchedSegments(this.fs, segmentsDir.toString(), this.conf);
+ System.out.println("SEGMENTS " + segments);
this.segmentNames = segments.getSegmentNames();
+ System.out.println("SEGMENTS " + this.segmentNames);
this.searcher = indexSearcher;
this.detailer = indexSearcher;
Index: src/java/org/apache/nutch/searcher/IndexSearcher.java
===================================================================
--- src/java/org/apache/nutch/searcher/IndexSearcher.java (revision 414852)
+++ src/java/org/apache/nutch/searcher/IndexSearcher.java (working copy)
@@ -57,6 +57,7 @@
this.conf = conf;
this.fs = FileSystem.get(conf);
for (int i = 0; i < indexDirs.length; i++) {
+ System.out.println("READER " + indexDirs[i]);
readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
}
init(new MultiReader(readers), conf);
@@ -92,6 +93,7 @@
throws IOException {
org.apache.lucene.search.BooleanQuery luceneQuery =
this.queryFilters.filter(query);
+ System.out.println("QUERY: " + luceneQuery.toString());
return translateHits
(optimizer.optimize(luceneQuery, luceneSearcher, numHits,
sortField, reverse),
Index: src/java/org/apache/nutch/searcher/OpenSearchServlet.java
===================================================================
--- src/java/org/apache/nutch/searcher/OpenSearchServlet.java (revision 414852)
+++ src/java/org/apache/nutch/searcher/OpenSearchServlet.java (working copy)
@@ -262,23 +262,64 @@
+ /**
+  * Creates an element called <code>name</code>, gives it a single text node
+  * holding <code>text</code> (passed through getLegalXml first) and appends
+  * the element to <code>parent</code>.
+  */
private static void addNode(Document doc, Node parent,
String name, String text) {
Element child = doc.createElement(name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
}
+ /**
+  * Namespace-aware variant: creates an element with qualified name
+  * <code>ns:name</code> in the namespace URI that NS_MAP holds for
+  * <code>ns</code>, gives it a single text node holding <code>text</code>
+  * (passed through getLegalXml first) and appends it to <code>parent</code>.
+  */
private static void addNode(Document doc, Node parent,
String ns, String name, String text) {
Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
}
+ /**
+  * Creates an attribute called <code>name</code> on <code>node</code> whose
+  * value is <code>value</code> with characters that are illegal in XML
+  * removed by getLegalXml.
+  */
private static void addAttribute(Document doc, Element node,
String name, String value) {
Attr attribute = doc.createAttribute(name);
- attribute.setValue(value);
+ // One pass is enough: filtering an already-filtered string changes nothing.
+ attribute.setValue(getLegalXml(value));
node.getAttributes().setNamedItem(attribute);
}
+ /**
+  * Ensure string is legal xml.
+  * <p>A high surrogate (0xd800-0xdbff) immediately followed by a low
+  * surrogate (0xdc00-0xdfff) is kept as a unit, because the pair encodes a
+  * character in the legal range #x10000-#x10FFFF. A surrogate that is not
+  * part of such a pair is removed like any other illegal character.</p>
+  * @param text String to verify.
+  * @return Passed text, or a new string with illegal characters removed if
+  * any were found in text; <code>null</code> if text is <code>null</code>.
+  * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">XML 1.0 Char production</a>
+  */
+ private static String getLegalXml(final String text) {
+   if (text == null) {
+     return null;
+   }
+   final int length = text.length();
+   // Stays null until the first illegal character is seen, so that clean
+   // input is returned as-is without copying.
+   StringBuffer buffer = null;
+   for (int i = 0; i < length; i++) {
+     final char c = text.charAt(i);
+     final boolean pair = c >= 0xd800 && c <= 0xdbff && i + 1 < length
+         && text.charAt(i + 1) >= 0xdc00 && text.charAt(i + 1) <= 0xdfff;
+     if (pair) {
+       // Well-formed surrogate pair: keep both code units together.
+       if (buffer != null) {
+         buffer.append(c).append(text.charAt(i + 1));
+       }
+       i++; // the low surrogate has been consumed as part of the pair
+     } else if (isLegalXml(c)) {
+       if (buffer != null) {
+         buffer.append(c);
+       }
+     } else if (buffer == null) {
+       // First bad character: start a buffer seeded with the clean prefix.
+       buffer = new StringBuffer(length);
+       buffer.append(text.substring(0, i));
+     }
+   }
+   return (buffer != null)? buffer.toString(): text;
+ }
+
+ /**
+  * Tells whether a single UTF-16 code unit is a legal XML 1.0 character.
+  * Surrogate code units (0xd800-0xdfff) are reported as illegal. A
+  * <code>char</code> cannot hold a value above 0xffff, so the supplementary
+  * range #x10000-#x10FFFF cannot be tested here.
+  * @param c code unit to test.
+  * @return true if c is tab, line feed, carriage return, or lies in
+  * 0x20-0xd7ff or 0xe000-0xfffd.
+  */
+ private static boolean isLegalXml(final char c) {
+   return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
+       || (c >= 0xe000 && c <= 0xfffd);
+ }
+
+ /**
+  * Manual smoke check: prints what getLegalXml returns for a clean string
+  * and for strings with a NUL character in the middle, at the start and at
+  * the end.
+  * @param args ignored.
+  */
+ public static void main(final String [] args) {
+   final String [] samples = {
+     "hello",
+     "he\u0000llo",
+     "\u0000he\u0000llo",
+     "\u0000he\u0000llo\u0000"
+   };
+   for (int idx = 0; idx < samples.length; idx++) {
+     System.out.println(getLegalXml(samples[idx]));
+   }
+ }
}
-
Index: src/java/org/apache/nutch/searcher/QueryFilters.java
===================================================================
--- src/java/org/apache/nutch/searcher/QueryFilters.java (revision 414852)
+++ src/java/org/apache/nutch/searcher/QueryFilters.java (working copy)
@@ -103,6 +103,7 @@
// then run each plugin
BooleanQuery output = new BooleanQuery();
for (int i = 0; i < this.queryFilters.length; i++) {
+ System.out.println("FILTER: " + this.queryFilters[i]);
output = this.queryFilters[i].filter(input, output);
}
return output;
Index: src/java/org/apache/nutch/parse/ParserFactory.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserFactory.java (revision 414852)
+++ src/java/org/apache/nutch/parse/ParserFactory.java (working copy)
@@ -357,6 +357,9 @@
}
private boolean match(Extension extension, String id, String type) {
+ System.out.println("EXT " + extension.getId() + " " +
+ extension.getAttribute("contentType"));
+
return ((id.equals(extension.getId())) &&
(type.equals(extension.getAttribute("contentType")) ||
type.equals(DEFAULT_PLUGIN)));
@@ -364,7 +367,9 @@
/** Get an extension from its id and supported content-type. */
private Extension getExtension(Extension[] list, String id, String type) {
+ System.out.println("ID " + id + " TYPE " + type);
for (int i=0; i