--- nutch-0.7/src/test/org/apache/nutch/util/TestStringUtil.java 2005-08-17 03:23:42.000000000 -0700
+++ nutch-0.7-patched/src/test/org/apache/nutch/util/TestStringUtil.java 2005-10-14 11:06:49.000000000 -0700
@@ -58,4 +58,10 @@
}
+  /**
+   * Checks that toValidXmlText entity-encodes a less-than sign and drops the
+   * form feed character (0x0C), which is not allowed in XML text.
+   */
+  public void testToValidXmlText() {
+    String tmp = StringUtil.toValidXmlText("1 < 2");
+    // JUnit's argument order is (message, expected, actual); the expected
+    // value is the escaped form, otherwise the check passes without escaping.
+    assertEquals("Failed escape less-than: " + tmp, "1 &lt; 2", tmp);
+    tmp = StringUtil.toValidXmlText("x\u000Cx");
+    assertEquals("Failed to drop illegal 0xC character: " + tmp, "xx", tmp);
+  }
}
--- nutch-0.7/src/java/org/apache/nutch/util/StringUtil.java 2005-08-17 03:23:42.000000000 -0700
+++ nutch-0.7-patched/src/java/org/apache/nutch/util/StringUtil.java 2005-10-14 10:37:18.000000000 -0700
@@ -109,6 +109,146 @@
(String) encodingAliases.get(canonicalName) : canonicalName;
}
+  /**
+   * Makes a string safe for an XML text node without ever reporting an error.
+   * Convenience overload that forwards to
+   * {@link #toValidXmlText(String, boolean)} with the exception flag switched
+   * off, so characters that cannot be serialized into XML are left out of the
+   * returned string instead of being reported.
+   *
+   * @param str The string to be escaped.
+   * @return A string that is safe to use in an XML element or attribute. The
+   * xml 5 'special characters' are entity encoded if present and characters
+   * outside of the legal range for xml documents will have been removed.
+   * @author Dawid Weiss
+   */
+  public static String toValidXmlText(final String str) {
+    final boolean failOnIllegalCharacter = false;
+    return toValidXmlText(str, failOnIllegalCharacter);
+  }
+
+  /**
+   * Escapes a string so that it can be safely put into an XML text node.
+   * The five XML special characters are replaced by their predefined entity
+   * references ({@code &lt;}, {@code &gt;}, {@code &amp;}, {@code &apos;} and
+   * {@code &quot;}). Characters that cannot be serialized into XML at all are
+   * either dropped or reported, depending on
+   * <code>exceptionOnUnescapable</code>.
+   *
+   * <p>Accepted without change are tab, line feed, carriage return, the
+   * ranges 0x20-0xD7FF and 0xE000-0xFFFD, and supplementary characters
+   * (0x10000-0x10FFFF) written as a high surrogate immediately followed by a
+   * low surrogate. Everything else, including an unpaired surrogate, is
+   * treated as illegal.
+   *
+   * @param str The string to be escaped. <code>null</code> is returned as is.
+   * @param exceptionOnUnescapable If true,
+   * IllegalArgumentException is thrown when an unescapable
+   * character is encountered. Otherwise, the offending
+   * characters will be omitted in the output.
+   * @return A string that is safe to use in an XML element or attribute. The
+   * xml 5 'special characters' are entity encoded if present and characters
+   * outside of the legal range for xml documents will have been removed
+   * (if exceptionOnUnescapable is false). The very same instance as
+   * <code>str</code> is returned when nothing had to be changed.
+   * @throws IllegalArgumentException if exceptionOnUnescapable is true and
+   * <code>str</code> contains a character that is not legal in XML.
+   * @author Dawid Weiss
+   */
+  public static String toValidXmlText(final String str,
+      final boolean exceptionOnUnescapable) {
+    if (str == null) {
+      // Nothing to escape; hand null back instead of failing on str.length().
+      return null;
+    }
+
+    final int length = str.length();
+    // Allocated lazily at the first character that needs changing, so an
+    // already clean string is returned without being copied.
+    StringBuffer buffer = null;
+
+    for (int i = 0; i < length; i++) {
+      final char ch = str.charAt(i);
+      // null: copy ch unchanged; "": drop ch; anything else: substitute.
+      String entity = null;
+
+      switch (ch) {
+        case '<':
+          entity = "&lt;";
+          break;
+
+        case '>':
+          entity = "&gt;";
+          break;
+
+        case '&':
+          entity = "&amp;";
+          break;
+
+        case '\'':
+          entity = "&apos;";
+          break;
+
+        case '"':
+          entity = "&quot;";
+          break;
+
+        case 0x09: // the only control characters XML allows
+        case 0x0a:
+        case 0x0d:
+          break;
+
+        default:
+          if (((ch >= 0x20) && (ch <= 0xD7FF))
+              || ((ch >= 0xE000) && (ch <= 0xFFFD))) {
+            break;
+          }
+
+          // A char never exceeds 0xFFFF, so characters above the BMP show up
+          // as a surrogate pair and have to be recognised two chars at a time.
+          if ((ch >= 0xD800) && (ch <= 0xDBFF) && (i + 1 < length)) {
+            final char low = str.charAt(i + 1);
+            if ((low >= 0xDC00) && (low <= 0xDFFF)) {
+              if (buffer != null) {
+                buffer.append(ch).append(low);
+              }
+              i++; // the low surrogate has been consumed as well
+              continue;
+            }
+          }
+
+          if (exceptionOnUnescapable) {
+            throw new IllegalArgumentException(
+                "Character is not within valid XML characters " +
+                "(code: 0x" + Integer.toHexString(ch) +
+                ", position: " + i + ")."
+            );
+          }
+
+          // replace the character with an empty string.
+          entity = "";
+          break;
+      }
+
+      if (entity == null) {
+        if (buffer != null) {
+          buffer.append(ch);
+        }
+      } else {
+        if (buffer == null) {
+          // First change: start the copy with the untouched prefix.
+          buffer = new StringBuffer(length + 20);
+          buffer.append(str.substring(0, i));
+        }
+        buffer.append(entity);
+      }
+    }
+
+    return (buffer != null) ? buffer.toString() : str;
+  }
+
public static void main(String[] args) {
if (args.length != 1)
System.out.println("Usage: StringUtil ");
--- nutch-0.7/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 2005-08-17 03:23:42.000000000 -0700
+++ nutch-0.7-patched/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 2005-10-14 11:04:28.000000000 -0700
@@ -15,6 +15,7 @@
*/
package org.apache.nutch.searcher;
+import org.apache.nutch.util.StringUtil;
import java.io.IOException;
import java.net.URLEncoder;
@@ -252,14 +253,14 @@
private static void addNode(Document doc, Node parent,
String name, String text) {
Element child = doc.createElement(name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(StringUtil.toValidXmlText(text))); // NOTE(review): toValidXmlText's Javadoc says it also entity-encodes; confirm the DOM serializer does not escape this text a second time
parent.appendChild(child);
}
private static void addNode(Document doc, Node parent,
String ns, String name, String text) {
Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(StringUtil.toValidXmlText(text))); // NOTE(review): toValidXmlText's Javadoc says it also entity-encodes; confirm the DOM serializer does not escape this text a second time
parent.appendChild(child);
}
@@ -269,6 +270,5 @@
attribute.setValue(value);
node.getAttributes().setNamedItem(attribute);
}
-
}