Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 1029293) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.LinkedList; +import java.util.Locale; import java.util.Set; import org.apache.lucene.analysis.Token; @@ -224,7 +225,7 @@ protected static final void addAllLowerCase(CharArraySet target, Collection col) { for (Object obj : col) { String string = (String) obj; - target.add(string.toLowerCase()); + target.add(string.toLowerCase(Locale.ENGLISH)); } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd (revision 1029293) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd (working copy) @@ -1,68 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java (revision 1029293) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java (working copy) @@ -91,7 +91,7 @@ * @throws HyphenationException In case of an exception while parsing */ public void parse(String filename) throws HyphenationException { - parse(new File(filename)); + parse(new InputSource(filename)); } /** @@ -266,7 
+266,15 @@ // @Override public InputSource resolveEntity(String publicId, String systemId) { - return HyphenationDTDGenerator.generateDTD(); + // supply the internal hyphenation.dtd if possible + if ( + (systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) || + ("hyphenation-info".equals(publicId)) + ) { + // System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm()); + return new InputSource(this.getClass().getResource("hyphenation.dtd").toExternalForm()); + } + return null; } // @@ -373,36 +381,7 @@ } - // - // ErrorHandler methods - // - /** - * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException) - */ - @Override - public void warning(SAXParseException ex) { - errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage(); - } - - /** - * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException) - */ - @Override - public void error(SAXParseException ex) { - errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage(); - } - - /** - * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException) - */ - @Override - public void fatalError(SAXParseException ex) throws SAXException { - errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage(); - throw ex; - } - - /** * Returns a string of the location. 
*/ private String getLocationString(SAXParseException ex) { @@ -446,79 +425,3 @@ } } } - -class HyphenationDTDGenerator { - public static final String DTD_STRING= - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"; - - public static InputSource generateDTD() { - return new InputSource(new StringReader(DTD_STRING)); - } -} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (revision 1029293) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (working copy) @@ -19,7 +19,6 @@ import java.io.File; import java.io.FileInputStream; -import java.io.InputStreamReader; import java.io.Reader; import java.util.Set; @@ -267,7 +266,7 @@ */ public static HyphenationTree getHyphenationTree(String hyphenationFilename) throws Exception { - return getHyphenationTree(new File(hyphenationFilename)); + return getHyphenationTree(new InputSource(hyphenationFilename)); } /** @@ -279,8 +278,7 @@ */ public static HyphenationTree getHyphenationTree(File hyphenationFile) throws Exception { - return getHyphenationTree(new InputStreamReader(new FileInputStream( - hyphenationFile), "ISO-8859-1")); + return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm())); } /** @@ -289,13 +287,32 @@ * @param hyphenationReader the reader of the XML grammar to load from * @return An object representing the hyphenation patterns * @throws Exception + * @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created. 
+ * Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input + * stream, if you like. */ + @Deprecated public static HyphenationTree getHyphenationTree(Reader hyphenationReader) throws Exception { + final InputSource is = new InputSource(hyphenationReader); + // we need this to load the DTD in very old parsers (like the one in JDK 1.4). + // The DTD itself is provided via EntityResolver, so it should always load, but + // some parsers still want to have a base URL (Crimson). + is.setSystemId("urn:java:" + HyphenationTree.class.getName()); + return getHyphenationTree(is); + } + + /** + * Create a hyphenator tree + * + * @param hyphenationSource the InputSource pointing to the XML grammar + * @return An object representing the hyphenation patterns + * @throws Exception + */ + public static HyphenationTree getHyphenationTree(InputSource hyphenationSource) + throws Exception { HyphenationTree tree = new HyphenationTree(); - - tree.loadPatterns(new InputSource(hyphenationReader)); - + tree.loadPatterns(hyphenationSource); return tree; } Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd =================================================================== --- modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd (revision 0) +++ modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd (working copy) @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. 
--> - Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 1028922) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy) @@ -17,9 +17,9 @@ * limitations under the License. */ -import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; +import org.xml.sax.InputSource; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Tokenizer; @@ -31,10 +31,9 @@ public void testHyphenationCompoundWordsDA() throws Exception { String[] dict = { "læse", "hest" }; - Reader reader = getHyphenationReader(); - + InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter - .getHyphenationTree(reader); + .getHyphenationTree(is); HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader( @@ -50,10 +49,10 @@ public void testHyphenationCompoundWordsDELongestMatch() throws Exception { String[] dict = { "basketball", "basket", "ball", "kurv" }; - Reader reader = getHyphenationReader(); + InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter - .getHyphenationTree(reader); + .getHyphenationTree(is); // the word basket will not be added due to the longest match option HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, @@ -73,9 +72,9 @@ * This can be controlled with the min/max subword size. 
*/ public void testHyphenationOnly() throws Exception { - Reader reader = getHyphenationReader(); + InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter - .getHyphenationTree(reader); + .getHyphenationTree(is); HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( TEST_VERSION_CURRENT, @@ -185,7 +184,4 @@ assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString()); } - private Reader getHyphenationReader() throws Exception { - return new InputStreamReader(getClass().getResourceAsStream("da_UTF8.xml"), "UTF-8"); - } } Index: solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java (revision 1028922) +++ solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java (working copy) @@ -17,10 +17,6 @@ package org.apache.solr.analysis; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; - import org.apache.commons.io.IOUtils; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; @@ -33,6 +29,8 @@ import org.apache.solr.util.plugin.ResourceLoaderAware; import java.util.Map; +import java.io.InputStream; +import org.xml.sax.InputSource; /** * Factory for {@link HyphenationCompoundWordTokenFilter} @@ -57,7 +55,7 @@ private HyphenationTree hyphenator; private String dictFile; private String hypFile; - private String encoding = "UTF-8"; // default to UTF-8 encoding + private String encoding; private int minWordSize; private int minSubwordSize; private int maxSubwordSize; @@ -82,18 +80,21 @@ } public void inform(ResourceLoader loader) { - Reader reader = null; + InputStream stream = null; try { if (dictFile != null) // the dictionary can be 
empty. dictionary = getWordSet(loader, dictFile, false); - - InputStream hyph = loader.openResource(hypFile); - reader = new InputStreamReader(hyph, encoding); - hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(reader); - } catch (Exception e) { // TODO: getHyphenationTree really shouldnt throw "Exception" + // TODO: Broken, because we cannot resolve real system id + // ResourceLoader should also supply method like ClassLoader to get resource URL + stream = loader.openResource(hypFile); + final InputSource is = new InputSource(stream); + is.setEncoding(encoding); // if it's null let xml parser decide + is.setSystemId(hypFile); + hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); + } catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception" throw new RuntimeException(e); } finally { - IOUtils.closeQuietly(reader); + IOUtils.closeQuietly(stream); } }