--- nutch-0.7/src/test/org/apache/nutch/util/TestStringUtil.java	2005-08-17 03:23:42.000000000 -0700
+++ nutch-0.7-patched/src/test/org/apache/nutch/util/TestStringUtil.java	2005-10-14 11:06:49.000000000 -0700
@@ -58,4 +58,15 @@
   }
 
 
+  public void testToValidXmlText() {
+    String tmp = StringUtil.toValidXmlText("1 < 2");
+    assertEquals("Failed escape less-than: " + tmp, "1 &lt; 2", tmp);
+    tmp = StringUtil.toValidXmlText("x\u000Cx");
+    assertEquals("Failed to drop illegal 0xC character: " + tmp, "xx", tmp);
+    tmp = StringUtil.toValidXmlText("a\uD835\uDD38b");
+    assertEquals("Failed to keep surrogate pair: " + tmp, "a\uD835\uDD38b", tmp);
+    tmp = StringUtil.removeInvalidXmlChars("1 < 2\u000C");
+    assertEquals("Failed to strip without escaping: " + tmp, "1 < 2", tmp);
+    assertNull("Null input must yield null", StringUtil.toValidXmlText(null));
+  }
 }
--- nutch-0.7/src/java/org/apache/nutch/util/StringUtil.java	2005-08-17 03:23:42.000000000 -0700
+++ nutch-0.7-patched/src/java/org/apache/nutch/util/StringUtil.java	2005-10-14 10:37:18.000000000 -0700
@@ -109,6 +109,173 @@
       (String) encodingAliases.get(canonicalName) : canonicalName;
   }
 
+  /**
+   * Escapes a string so that it can be safely put into XML markup that is
+   * written out by hand (for example with a <code>PrintWriter</code>).
+   * The five XML special characters (<code>&lt; &gt; &amp; ' "</code>) are
+   * replaced by their predefined entities, and characters that cannot occur
+   * in an XML 1.0 document at all are dropped. Refer to the
+   * <a href="http://www.w3.org/TR/REC-xml#charsets">XML specification</a>
+   * for the list of legal characters.
+   *
+   * <p>Do not use this method for text handed to a DOM (for example via
+   * <code>Document.createTextNode</code>): a DOM serializer escapes markup
+   * itself, so the result would be escaped twice. Use
+   * {@link #removeInvalidXmlChars(String)} there.</p>
+   *
+   * @param str The string to be escaped, may be <code>null</code>.
+   * @return A string that is safe to use in an XML element or attribute, or
+   * <code>null</code> if <code>str</code> was <code>null</code>. The input
+   * instance itself is returned when nothing had to be changed.
+   * @author Dawid Weiss
+   */
+  public static String toValidXmlText(final String str) {
+    return filterXmlText(str, true, false);
+  }
+
+  /**
+   * Escapes a string so that it can be safely put into XML markup that is
+   * written out by hand. Works like {@link #toValidXmlText(String)}, but
+   * lets the caller choose what happens to characters that cannot be
+   * represented in XML 1.0.
+   *
+   * @param str The string to be escaped, may be <code>null</code>.
+   * @param exceptionOnUnescapable If <code>true</code>, an
+   * <code>IllegalArgumentException</code> is thrown when a character that is
+   * illegal in XML is encountered. Otherwise, the offending characters are
+   * omitted from the output.
+   * @return A string that is safe to use in an XML element or attribute, or
+   * <code>null</code> if <code>str</code> was <code>null</code>. The five
+   * XML special characters are entity encoded, and characters outside of
+   * the legal range for XML documents have been removed (if
+   * <code>exceptionOnUnescapable</code> is <code>false</code>).
+   * @throws IllegalArgumentException if <code>exceptionOnUnescapable</code>
+   * is <code>true</code> and <code>str</code> contains an illegal character.
+   * @author Dawid Weiss
+   */
+  public static String toValidXmlText(final String str,
+                                      final boolean exceptionOnUnescapable) {
+    return filterXmlText(str, true, exceptionOnUnescapable);
+  }
+
+  /**
+   * Removes the characters that cannot occur in an XML 1.0 document and
+   * leaves everything else untouched; in particular markup characters such
+   * as <code>&lt;</code> and <code>&amp;</code> are not entity encoded.
+   * This is the right filter for text that is stored in a DOM tree, whose
+   * serializer takes care of escaping.
+   *
+   * @param str The string to be filtered, may be <code>null</code>.
+   * @return <code>str</code> without illegal XML characters, or
+   * <code>null</code> if <code>str</code> was <code>null</code>.
+   */
+  public static String removeInvalidXmlChars(final String str) {
+    return filterXmlText(str, false, false);
+  }
+
+  /**
+   * Shared worker for the XML text filters above.
+   *
+   * @param str The input, may be <code>null</code>.
+   * @param escapeMarkup Whether the five XML special characters are
+   * replaced by their predefined entities.
+   * @param exceptionOnUnescapable Whether an illegal character raises an
+   * <code>IllegalArgumentException</code> instead of being dropped.
+   * @return The filtered text; <code>str</code> itself if nothing changed.
+   */
+  private static String filterXmlText(final String str,
+                                      final boolean escapeMarkup,
+                                      final boolean exceptionOnUnescapable) {
+    if (str == null) {
+      return null;
+    }
+
+    final int length = str.length();
+    // Allocated lazily on the first modification, so that clean input is
+    // returned as is without any copying.
+    StringBuffer buffer = null;
+
+    for (int i = 0; i < length; i++) {
+      final char ch = str.charAt(i);
+      // null means "copy ch unchanged", "" means "drop ch".
+      String replacement = null;
+
+      if (isValidXmlChar(ch)) {
+        if (escapeMarkup) {
+          replacement = xmlEntityFor(ch);
+        }
+      } else if (isSurrogatePairAt(str, i)) {
+        // A high/low surrogate pair encodes a code point in the range
+        // #x10000-#x10FFFF, which is legal XML: keep both code units.
+        if (buffer != null) {
+          buffer.append(ch).append(str.charAt(i + 1));
+        }
+        i++;
+        continue;
+      } else if (exceptionOnUnescapable) {
+        throw new IllegalArgumentException(
+          "Character is not within valid XML characters " +
+          "(code: 0x" + Integer.toHexString(ch) +
+          ", position: " + i + ").");
+      } else {
+        replacement = "";
+      }
+
+      if (replacement != null) {
+        if (buffer == null) {
+          buffer = new StringBuffer(length + 20);
+          buffer.append(str.substring(0, i));
+        }
+        buffer.append(replacement);
+      } else if (buffer != null) {
+        buffer.append(ch);
+      }
+    }
+
+    return (buffer != null) ? buffer.toString() : str;
+  }
+
+  /**
+   * Tells whether a single UTF-16 code unit is a legal XML 1.0 character on
+   * its own. Surrogates are not; they are only legal as a high/low pair.
+   */
+  private static boolean isValidXmlChar(final char ch) {
+    return ch == 0x09 || ch == 0x0A || ch == 0x0D
+      || (ch >= 0x20 && ch <= 0xD7FF)
+      || (ch >= 0xE000 && ch <= 0xFFFD);
+  }
+
+  /** Tells whether a high surrogate at index i is followed by a low one. */
+  private static boolean isSurrogatePairAt(final String str, final int i) {
+    if (i + 1 >= str.length()) {
+      return false;
+    }
+    final char high = str.charAt(i);
+    final char low = str.charAt(i + 1);
+    return high >= 0xD800 && high <= 0xDBFF && low >= 0xDC00 && low <= 0xDFFF;
+  }
+
+  /**
+   * Returns the predefined XML entity for one of the five XML special
+   * characters, or <code>null</code> for any other character.
+   */
+  private static String xmlEntityFor(final char ch) {
+    switch (ch) {
+      case '<':
+        return "&lt;";
+      case '>':
+        return "&gt;";
+      case '&':
+        return "&amp;";
+      case '\'':
+        return "&apos;";
+      case '"':
+        return "&quot;";
+      default:
+        return null;
+    }
+  }
+
   public static void main(String[] args) {
     if (args.length != 1)
       System.out.println("Usage: StringUtil ");
--- nutch-0.7/src/java/org/apache/nutch/searcher/OpenSearchServlet.java	2005-08-17 03:23:42.000000000 -0700
+++ nutch-0.7-patched/src/java/org/apache/nutch/searcher/OpenSearchServlet.java	2005-10-14 11:04:28.000000000 -0700
@@ -15,6 +15,7 @@
  */
 
 package org.apache.nutch.searcher;
+import org.apache.nutch.util.StringUtil;
 
 import java.io.IOException;
 import java.net.URLEncoder;
@@ -252,14 +253,14 @@
   private static void addNode(Document doc, Node parent,
                               String name, String text) {
     Element child = doc.createElement(name);
-    child.appendChild(doc.createTextNode(text));
+    child.appendChild(doc.createTextNode(StringUtil.removeInvalidXmlChars(text)));
     parent.appendChild(child);
   }
 
   private static void addNode(Document doc, Node parent,
                               String ns, String name, String text) {
     Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
-    child.appendChild(doc.createTextNode(text));
+    child.appendChild(doc.createTextNode(StringUtil.removeInvalidXmlChars(text)));
     parent.appendChild(child);
   }
 
@@ -269,6 +270,5 @@
     attribute.setValue(value);
     node.getAttributes().setNamedItem(attribute);
   }
-
 
 }