diff --git a/lucene/src/java/org/apache/lucene/util/IOUtils.java b/lucene/src/java/org/apache/lucene/util/IOUtils.java index 73d9dc6..37bbedd 100644 --- a/lucene/src/java/org/apache/lucene/util/IOUtils.java +++ b/lucene/src/java/org/apache/lucene/util/IOUtils.java @@ -17,15 +17,24 @@ package org.apache.lucene.util; * limitations under the License. */ +import java.io.BufferedReader; import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.lang.reflect.Method; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; /** This class emulates the new Java 7 "Try-With-Resources" statement. * Remove once Lucene is on Java 7. * @lucene.internal */ public final class IOUtils { - + public static final String CHARSET_UTF8 = "UTF-8"; private IOUtils() {} // no instance /** @@ -220,5 +229,48 @@ public final class IOUtils { } } } + + // nocommit add javadoc + public static Reader getReader(InputStream stream, String charSet) { + final CharsetDecoder charSetDecoder = Charset.forName(charSet).newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + return new BufferedReader(new InputStreamReader(stream, charSetDecoder)); + } + + // nocommit add javadoc + public static Reader getReader(File file, String charSet) throws IOException { + FileInputStream stream = null; + boolean success = false; + try { + stream = new FileInputStream(file); + final Reader reader = getReader(stream, charSet); + success = true; + return reader; + + } finally { + if (!success) { + IOUtils.close(stream); + } + } + } + // nocommit add javadoc + public static Reader getReader(Class clazz, String resource, String charSet) throws IOException { + InputStream stream = null; + boolean success = false; + try { + stream = clazz + .getResourceAsStream(resource); + final Reader reader = getReader(stream, charSet); + success = true; + return reader; + + } finally { + if (!success) { + IOUtils.close(stream); + } + } + } + } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 2ba5315..76f611a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -64,9 +65,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class, - DEFAULT_STOPWORD_FILE, "#"), false)); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getReader(BrazilianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git 
a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java index 75fb8c4..f83f3a7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java @@ -57,8 +57,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { "they", "this", "to", "was", "will", "with" ); final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, - stopWords.size(), false); - stopSet.addAll(stopWords); + stopWords, false); ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } @@ -82,7 +81,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { * @param matchVersion See above * @param stopwordsFile File to load stop words from */ public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwordsFile)); + this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. @@ -90,7 +89,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { * @param matchVersion See above * @param stopwords Reader to load stop words from */ public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java index 0df03a1..893d84e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.*; @@ -70,9 +71,8 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class, - DEFAULT_STOPWORD_FILE, "#"), false)); + DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getReader(CzechAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java index 65505dc..190aa79 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import 
org.tartarus.snowball.ext.DanishStemmer; @@ -62,8 +63,8 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 2c69900..d37d276 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.German2Stemmer; @@ -100,8 +101,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase { private static final Set DEFAULT_SET; static { try { - DEFAULT_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java index 025415d..72745e4 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.SpanishStemmer; @@ -62,8 +63,8 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java index 85a0e59..5a4707a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import 
org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.FinnishStemmer; @@ -62,8 +63,8 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java index 087f6a1..63c936e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.IOException; @@ -118,8 +119,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { static final Set DEFAULT_STOP_SET; static { try { - DEFAULT_STOP_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java index 60dc7c3..18d2053 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java @@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -60,12 +62,12 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getReader(GalicianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the 
// distribution (JAR) - throw new RuntimeException("Unable to load default stopword set"); + throw new RuntimeException("Unable to load default stopword set", ex); } } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java index be3a879..ac016e2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.HungarianStemmer; @@ -62,8 +63,8 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java index 22790bb..499ec75 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java @@ -35,6 +35,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.ItalianStemmer; @@ -79,8 +80,8 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java index d0ff1e1..88bb781 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java @@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import 
org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -60,8 +62,8 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getReader(LatvianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java index 3931fa1..58a8b0e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -83,8 +84,8 @@ public final class DutchAnalyzer extends Analyzer { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java index ecb66f6..30adfdc 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.NorwegianStemmer; @@ -62,8 +63,8 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java index 3d28933..fd390fc 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java @@ 
-33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.PortugueseStemmer; @@ -62,8 +63,8 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java index 6ddf665..c0be88e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -84,12 +85,12 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase static { try { - DEFAULT_STOP_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) - throw new RuntimeException("Unable to load default stopword set"); + throw new RuntimeException("Unable to load default stopword set", ex); } } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java index 9c9821d..dc3f0a6 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -85,7 +86,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords File to read stop words from */ public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. 
@@ -94,7 +95,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords Reader to read stop words from */ public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index cf0011d..96b7e8c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -86,7 +87,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords File to read stop words from */ public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. @@ -95,7 +96,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords Reader to read stop words from */ public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java index 7dd1702..3e20534 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.SwedishStemmer; @@ -62,8 +63,8 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java index c99dc54..3b6c89e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java +++ 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java @@ -17,10 +17,15 @@ package org.apache.lucene.analysis.util; +import java.io.BufferedReader; +import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -93,11 +98,35 @@ public abstract class StopwordAnalyzerBase extends Analyzer { protected static CharArraySet loadStopwordSet(final boolean ignoreCase, final Class aClass, final String resource, final String comment) throws IOException { - final Set wordSet = WordlistLoader.getWordSet(aClass, resource, - comment); - final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase); - set.addAll(wordSet); - return set; + Reader reader = null; + try { + reader = IOUtils.getReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF8); + return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase)); + } finally { + IOUtils.close(reader); + } + + } + + //nocommit javadoc + protected static CharArraySet loadStopwordSet(File stopwords, + Version matchVersion) throws IOException { + Reader reader = null; + try { + reader = IOUtils.getReader(stopwords, IOUtils.CHARSET_UTF8); + return WordlistLoader.getWordSet(reader, matchVersion); + } finally { + IOUtils.close(reader); + } + } + + //nocommit javadoc + protected static CharArraySet loadStopwordSet(Reader stopwords, + Version matchVersion) throws IOException { + try { + return WordlistLoader.getWordSet(stopwords, matchVersion); + } finally { + IOUtils.close(stopwords); + } } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java index 78aa03d..b106b29 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java @@ -18,165 +18,90 @@ package org.apache.lucene.analysis.util; */ import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; import java.io.IOException; -import java.io.InputStreamReader; import java.io.Reader; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; + +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.Version; /** * Loader for text files that represent a list of stopwords. + * + * @see IOUtils to obtain {@link Reader} instances */ public class WordlistLoader { - - /** - * Loads a text file associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds every line as an entry - * to a {@link Set} (omitting leading and trailing whitespace). Every line of - * the file should contain only one word. The words need to be in lower-case if - * you make use of an Analyzer which uses LowerCaseFilter (like - * StandardAnalyzer). 
- * - * @param aClass - * a class that is associated with the given stopwordResource - * @param stopwordResource - * name of the resource file associated with the given class - * @return a {@link Set} with the file's words - */ - public static Set getWordSet(Class aClass, String stopwordResource) - throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); - try { - return getWordSet(reader); - } finally { - reader.close(); - } - } - /** - * Loads a text file associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds every line as an entry - * to a {@link Set} (omitting leading and trailing whitespace). Every line of - * the file should contain only one word. The words need to be in lower-case if - * you make use of an Analyzer which uses LowerCaseFilter (like - * StandardAnalyzer). - * - * @param aClass - * a class that is associated with the given stopwordResource - * @param stopwordResource - * name of the resource file associated with the given class - * @param comment - * the comment string to ignore - * @return a {@link Set} with the file's words - */ - public static Set getWordSet(Class aClass, - String stopwordResource, String comment) throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); - try { - return getWordSet(reader, comment); - } finally { - reader.close(); - } - } + private static final int INITITAL_CAPACITY = 16; /** - * Loads a text file and adds every line as an entry to a HashSet (omitting - * leading and trailing whitespace). Every line of the file should contain only + * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * - * @param wordfile File containing the wordlist - * @return A HashSet with the file's words + * @param reader Reader containing the wordlist + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words */ - public static HashSet getWordSet(File wordfile) throws IOException { - FileReader reader = null; + public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException { + BufferedReader br = null; try { - reader = new FileReader(wordfile); - return getWordSet(reader); + br = getBufferedReader(reader); + String word = null; + while ((word = br.readLine()) != null) { + result.add(word.trim()); + } } finally { - if (reader != null) - reader.close(); + IOUtils.close(br); } + return result; } - + /** - * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting - * leading and trailing whitespace). Every line of the file should contain only + * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). 
* - * @param wordfile File containing the wordlist - * @param comment The comment string to ignore - * @return A HashSet with the file's words + * @param reader Reader containing the wordlist + * @param matchVersion the Lucene {@link Version} + * @return A {@link CharArraySet} with the reader's words */ - public static HashSet getWordSet(File wordfile, String comment) throws IOException { - FileReader reader = null; - try { - reader = new FileReader(wordfile); - return getWordSet(reader, comment); - } - finally { - if (reader != null) - reader.close(); - } + public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException { + return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); } - /** - * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting + * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist - * @return A HashSet with the reader's words + * @param comment The string representing a comment. + * @param matchVersion the Lucene {@link Version} + * @return A CharArraySet with the reader's words */ - public static HashSet getWordSet(Reader reader) throws IOException { - final HashSet result = new HashSet(); - BufferedReader br = null; - try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } - String word = null; - while ((word = br.readLine()) != null) { - result.add(word.trim()); - } - } - finally { - if (br != null) - br.close(); - } - return result; + public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException { + return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); } /** - * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting + * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist * @param comment The string representing a comment. 
- * @return A HashSet with the reader's words + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words */ - public static HashSet getWordSet(Reader reader, String comment) throws IOException { - final HashSet result = new HashSet(); + public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException { BufferedReader br = null; try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } + br = getBufferedReader(reader); String word = null; while ((word = br.readLine()) != null) { if (word.startsWith(comment) == false){ @@ -185,34 +110,11 @@ public class WordlistLoader { } } finally { - if (br != null) - br.close(); + IOUtils.close(br); } return result; } - /** - * Loads a text file in Snowball format associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds all words as entries to - * a {@link Set}. The words need to be in lower-case if you make use of an - * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). - * - * @param aClass a class that is associated with the given stopwordResource - * @param stopwordResource name of the resource file associated with the given - * class - * @return a {@link Set} with the file's words - * @see #getSnowballWordSet(Reader) - */ - public static Set getSnowballWordSet(Class aClass, - String stopwordResource) throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); - try { - return getSnowballWordSet(reader); - } finally { - reader.close(); - } - } /** * Reads stopwords from a stopword list in Snowball format. @@ -226,18 +128,14 @@ public class WordlistLoader { *

* * @param reader Reader containing a Snowball stopword list - * @return A Set with the reader's words + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words */ - public static Set getSnowballWordSet(Reader reader) + public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result) throws IOException { - final Set result = new HashSet(); BufferedReader br = null; try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } + br = getBufferedReader(reader); String line = null; while ((line = br.readLine()) != null) { int comment = line.indexOf('|'); @@ -247,10 +145,30 @@ public class WordlistLoader { if (words[i].length() > 0) result.add(words[i]); } } finally { - if (br != null) br.close(); + IOUtils.close(br); } return result; } + + /** + * Reads stopwords from a stopword list in Snowball format. + *

+ * <p> + * The snowball format is the following: + * <ul> + * <li>Lines may contain multiple words separated by whitespace. + * <li>The comment character is the vertical line (|). + * <li>Lines may contain trailing comments. + * </ul> + * </p>
+ * + * @param reader Reader containing a Snowball stopword list + * @param result the {@link CharArraySet} to fill with the readers words + * @param matchVersion the Lucene {@link Version} + * @return A {@link CharArraySet} with the reader's words + */ + public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException { + return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); + } /** @@ -261,24 +179,24 @@ public class WordlistLoader { * @return stem dictionary that overrules the stemming algorithm * @throws IOException */ - public static HashMap getStemDict(File wordstemfile) throws IOException { - if (wordstemfile == null) - throw new NullPointerException("wordstemfile may not be null"); - final HashMap result = new HashMap(); + public static CharArrayMap getStemDict(Reader reader, CharArrayMap result) throws IOException { BufferedReader br = null; - try { - br = new BufferedReader(new FileReader(wordstemfile)); + br = getBufferedReader(reader); String line; while ((line = br.readLine()) != null) { String[] wordstem = line.split("\t", 2); result.put(wordstem[0], wordstem[1]); } } finally { - if(br != null) - br.close(); + IOUtils.close(br); } return result; } - + + private static BufferedReader getBufferedReader(Reader reader) { + return (reader instanceof BufferedReader) ? (BufferedReader) reader + : new BufferedReader(reader); + } + } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java index 8983ead..9cb0757 100755 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java @@ -46,7 +46,7 @@ public class TestCharArraySet extends LuceneTestCase { public void testNonZeroOffset() { String[] words={"Hello","World","this","is","a","test"}; char[] findme="xthisy".toCharArray(); - CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true); + CharArraySet set= new CharArraySet(TEST_VERSION_CURRENT, 10, true); set.addAll(Arrays.asList(words)); assertTrue(set.contains(findme, 1, 4)); assertTrue(set.contains(new String(findme,1,4))); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java index 74356c4..a9634f6 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java @@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util; import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.util.LuceneTestCase; @@ -31,22 +29,22 @@ public class TestWordlistLoader extends LuceneTestCase { public void testWordlistLoading() throws IOException { String s = "ONE\n two \nthree"; - HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s)); + CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT); checkSet(wordSet1); - HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s))); + CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)), TEST_VERSION_CURRENT); checkSet(wordSet2); } public void 
testComments() throws Exception { String s = "ONE\n two \nthree\n#comment"; - HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#"); + CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#", TEST_VERSION_CURRENT); checkSet(wordSet1); assertFalse(wordSet1.contains("#comment")); assertFalse(wordSet1.contains("comment")); } - private void checkSet(HashSet wordset) { + private void checkSet(CharArraySet wordset) { assertEquals(3, wordset.size()); assertTrue(wordset.contains("ONE")); // case is not modified assertTrue(wordset.contains("two")); // surrounding whitespace is removed @@ -68,7 +66,7 @@ public class TestWordlistLoader extends LuceneTestCase { " two \n" + // stopword with leading/trailing space " three four five \n" + // multiple stopwords "six seven | comment\n"; //multiple stopwords + comment - Set wordset = WordlistLoader.getSnowballWordSet(new StringReader(s)); + CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT); assertEquals(7, wordset.size()); assertTrue(wordset.contains("ONE")); assertTrue(wordset.contains("two")); diff --git a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java index f078b6a..8de8b34 100644 --- a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java +++ b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java @@ -26,6 +26,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -66,7 +67,7 @@ public final class SmartChineseAnalyzer extends Analyzer { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -75,7 +76,7 @@ public final class SmartChineseAnalyzer extends Analyzer { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -87,13 +88,14 @@ public final class SmartChineseAnalyzer extends Analyzer { } } - static Set loadDefaultStopWordSet() throws IOException { + static CharArraySet loadDefaultStopWordSet() throws IOException { InputStream stream = SmartChineseAnalyzer.class .getResourceAsStream(DEFAULT_STOPWORD_FILE); try { InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); // make sure it is unmodifiable as we expose it in the outer class - return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT)); + return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader, + STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT)); } finally { stream.close(); } diff --git a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java index 8dc589a..4266e11 100644 --- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java +++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.stempel.StempelFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.egothor.stemmer.Trie; @@ -68,8 +69,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getReader(PolishAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR)
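
For reference, a minimal usage sketch of the reworked API introduced by this patch (not part of the diff itself): it combines the new IOUtils.getReader helper with the Version-aware WordlistLoader.getWordSet overload. The file name "stopwords.txt", the '#' comment prefix and the class name StopwordLoadingExample are hypothetical; only the IOUtils and WordlistLoader signatures added above are assumed.

import java.io.File;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

// Illustrative sketch only -- assumes the patched IOUtils/WordlistLoader from this diff.
public class StopwordLoadingExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical stopword file: one word per line, lines starting with '#' are comments.
    File stopwordFile = new File("stopwords.txt");
    Reader reader = null;
    try {
      // getReader decodes the file as UTF-8 and REPORTs malformed input instead of replacing it.
      reader = IOUtils.getReader(stopwordFile, IOUtils.CHARSET_UTF8);
      // Trims every line, skips '#' comment lines and fills a CharArraySet directly.
      CharArraySet stopwords = WordlistLoader.getWordSet(reader, "#", Version.LUCENE_CURRENT);
      System.out.println("loaded " + stopwords.size() + " stopwords");
    } finally {
      // getWordSet already closes the BufferedReader it wraps around the reader;
      // closing again here mirrors the defensive pattern used in StopwordAnalyzerBase.
      IOUtils.close(reader);
    }
  }
}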