Index: solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java (revision 1336462) +++ solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.core.TypeTokenFilter; import org.apache.lucene.analysis.util.InitializationException; import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.util.IOUtils; import org.apache.solr.common.util.StrUtils; import org.apache.lucene.analysis.util.ResourceLoaderAware; import org.apache.lucene.analysis.util.TokenFilterFactory; @@ -54,7 +55,7 @@ if (files.size() > 0) { stopTypes = new HashSet(); for (String file : files) { - List typesLines = loader.getLines(file.trim()); + List typesLines = getWordList(loader, file.trim(), IOUtils.CHARSET_UTF_8); stopTypes.addAll(typesLines); } } Index: solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java (revision 1336462) +++ solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java (working copy) @@ -1,4 +1,6 @@ -/** +package org.apache.solr.analysis; + +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -15,8 +17,6 @@ * limitations under the License. */ -package org.apache.solr.analysis; - import java.io.File; import java.io.IOException; import java.util.ArrayList; @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.charfilter.MappingCharFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.util.*; +import org.apache.lucene.util.IOUtils; import org.apache.solr.common.util.StrUtils; /** @@ -58,13 +59,13 @@ try{ File mappingFile = new File( mapping ); if( mappingFile.exists() ){ - wlist = loader.getLines( mapping ); + wlist = getWordList(loader, mapping, IOUtils.CHARSET_UTF_8); } else{ List files = StrUtils.splitFileNames( mapping ); wlist = new ArrayList(); for( String file : files ){ - List lines = loader.getLines( file.trim() ); + List lines = getWordList(loader, file.trim(), IOUtils.CHARSET_UTF_8); wlist.addAll( lines ); } } Index: solr/core/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (revision 1336462) +++ solr/core/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (working copy) @@ -1,4 +1,6 @@ -/** +package org.apache.solr.analysis; + +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -15,13 +17,12 @@ * limitations under the License. */ -package org.apache.solr.analysis; - import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; import org.apache.lucene.analysis.util.*; +import org.apache.lucene.util.IOUtils; import org.apache.solr.common.util.StrUtils; import java.util.ArrayList; @@ -70,7 +71,7 @@ List files = StrUtils.splitFileNames( types ); List wlist = new ArrayList(); for( String file : files ){ - List lines = loader.getLines( file.trim() ); + List lines = getWordList(loader, file.trim(), IOUtils.CHARSET_UTF_8); wlist.addAll( lines ); } typeTable = parseTypes(wlist); Index: solr/core/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java (revision 1336462) +++ solr/core/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java (working copy) @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; import org.apache.lucene.analysis.util.*; +import org.apache.lucene.util.IOUtils; import org.apache.solr.common.util.StrUtils; /** @@ -51,7 +52,7 @@ dictionary = new CharArrayMap(luceneMatchVersion, files.size() * 10, ignoreCase); for (String file : files) { - List list = loader.getLines(file.trim()); + List list = getWordList(loader, file.trim(), IOUtils.CHARSET_UTF_8); for (String line : list) { String[] mapping = line.split("\t", 2); dictionary.put(mapping[0], mapping[1]); Index: solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java (revision 1336462) +++ solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.*; +import org.apache.lucene.util.IOUtils; import org.apache.solr.common.util.StrUtils; import java.io.File; @@ -71,12 +72,12 @@ try { File synonymFile = new File(synonyms); if (synonymFile.exists()) { - wlist = loader.getLines(synonyms); + wlist = getWordList(loader, synonyms, IOUtils.CHARSET_UTF_8); } else { List files = StrUtils.splitFileNames(synonyms); wlist = new ArrayList(); for (String file : files) { - List lines = loader.getLines(file.trim()); + List lines = getWordList(loader, file.trim(), IOUtils.CHARSET_UTF_8); wlist.addAll(lines); } } Index: solr/core/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java =================================================================== --- solr/core/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java (revision 1336462) +++ solr/core/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java (working copy) @@ -17,10 +17,16 @@ package org.apache.solr.spelling; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; import java.util.List; +import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.index.*; +import org.apache.lucene.util.IOUtils; +import org.apache.solr.common.SolrException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -101,7 +107,19 @@ // .setCodec(core.getCodec()) ); - List lines = core.getResourceLoader().getLines(sourceLocation, characterEncoding); + List lines = null; + InputStream inputStream = null; + try { + // nocommit - This is a lot of boilerplate, need to put it somewhere. + inputStream = core.getResourceLoader().openResource(sourceLocation); + Reader reader = IOUtils.getDecodingReader(inputStream, Charset.forName(characterEncoding)); + lines = WordlistLoader.getWordList(reader, "#"); + } catch (IOException ioe) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "IOException thrown while reading source location", ioe); + } finally { + IOUtils.closeWhileHandlingException(inputStream); + } for (String s : lines) { Document d = new Document(); Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java (revision 1336462) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java (working copy) @@ -20,6 +20,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; +import java.util.List; import org.apache.lucene.util.LuceneTestCase; @@ -76,4 +77,14 @@ assertTrue(wordset.contains("six")); assertTrue(wordset.contains("seven")); } + + public void testGetWordList() throws IOException { + String testString = "\uFEFFBOMsareevil"; + List wordList = WordlistLoader.getWordList(new StringReader(testString), "#"); + assertEquals("BOMsareevil", wordList.get(0)); + + testString = "# Commented Line\nUncommented line"; + wordList = WordlistLoader.getWordList(new StringReader(testString), "#"); + assertEquals("Uncommented line", wordList.get(0)); + } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java (revision 1336462) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java (working copy) @@ -17,16 +17,11 @@ * limitations under the License. */ -import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; +import java.io.*; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -107,17 +102,16 @@ assureMatchVersion(); List files = splitFileNames(wordFiles); CharArraySet words = null; - if (files.size() > 0) { - // default stopwords list has 35 or so words, but maybe don't make it that - // big to start - words = new CharArraySet(luceneMatchVersion, - files.size() * 10, ignoreCase); - for (String file : files) { - List wlist = loader.getLines(file.trim()); - words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist, - ignoreCase)); - } + + if (files.isEmpty()) { + return words; } + + // default stopwords list has 35 or so words, but maybe don't make it that big to start + words = new CharArraySet(luceneMatchVersion, files.size() * 10, ignoreCase); + for (String file : files) { + words.addAll(getWordList(loader, file.trim(), IOUtils.CHARSET_UTF_8)); + } return words; } @@ -128,29 +122,41 @@ assureMatchVersion(); List files = splitFileNames(wordFiles); CharArraySet words = null; - if (files.size() > 0) { - // default stopwords list has 35 or so words, but maybe don't make it that - // big to start - words = new CharArraySet(luceneMatchVersion, - files.size() * 10, ignoreCase); - for (String file : files) { - InputStream stream = null; - Reader reader = null; - try { - stream = loader.openResource(file.trim()); - CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - reader = new InputStreamReader(stream, decoder); - WordlistLoader.getSnowballWordSet(reader, words); - } finally { - IOUtils.closeWhileHandlingException(reader, stream); - } + + if (files.isEmpty()) { + return words; + } + + // default stopwords list has 35 or so words, but maybe don't make it that big to start + words = new CharArraySet(luceneMatchVersion, files.size() * 10, ignoreCase); + for (String file : files) { + InputStream stream = null; + Reader reader = null; + try { + stream = loader.openResource(file.trim()); + reader = IOUtils.getDecodingReader(stream, IOUtils.CHARSET_UTF_8); + WordlistLoader.getSnowballWordSet(reader, words); + } finally { + IOUtils.closeWhileHandlingException(reader, stream); } } return words; } + // nocommit - This is useful boilerplate, need to put it somewhere more accessible + protected List getWordList(ResourceLoader loader, String resource, Charset charset) throws IOException { + InputStream inputStream = null; + try { + inputStream = loader.openResource(resource); + if (inputStream == null) { + return Collections.emptyList(); + } + return WordlistLoader.getWordList(IOUtils.getDecodingReader(inputStream, charset), "#"); + } finally { + IOUtils.close(inputStream); + } + } + /** * Splits file names separated by comma character. * File names can contain comma characters escaped by backslash '\' @@ -159,8 +165,9 @@ * @return a list of file names with the escaping backslashed removed */ protected List splitFileNames(String fileNames) { - if (fileNames == null) + if (fileNames == null) { return Collections.emptyList(); + } List result = new ArrayList(); for (String file : fileNames.split("(? getWordList(Reader reader, String comment) throws IOException { + List list = new ArrayList(); + BufferedReader bufferedReader = null; + try { + bufferedReader = getBufferedReader(reader); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (list.isEmpty() && line.length() > 0 && line.charAt(0) == '\uFEFF') { + line = line.substring(1); + } + if (!line.startsWith(comment)) { + list.add(line.trim()); + } + } + } finally { + IOUtils.close(bufferedReader); + } + return list; + } private static BufferedReader getBufferedReader(Reader reader) { return (reader instanceof BufferedReader) ? (BufferedReader) reader