diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index d0caa19..9d7d965 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -99,6 +99,11 @@ Changes in backwards compatibility policy * LUCENE-3558: Moved NRTManager & NRTManagerReopenThread into lucene core o.a.l.search. (Simon Willnauer) + + * LUCENE-2564: WordListLoader is now flaged as @lucene.internal. All methods in + WordListLoader now return CharArraySet/Map and expect Reader instances for + efficiency. Utilities to open Readers from Files, InputStreams or Java + resources were added to IOUtils. (Simon Willnauer, Robert Muir) New Features diff --git a/lucene/src/java/org/apache/lucene/util/IOUtils.java b/lucene/src/java/org/apache/lucene/util/IOUtils.java index 73d9dc6..8508c18 100644 --- a/lucene/src/java/org/apache/lucene/util/IOUtils.java +++ b/lucene/src/java/org/apache/lucene/util/IOUtils.java @@ -17,15 +17,35 @@ package org.apache.lucene.util; * limitations under the License. */ +import java.io.BufferedReader; import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.lang.reflect.Method; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; /** This class emulates the new Java 7 "Try-With-Resources" statement. * Remove once Lucene is on Java 7. 
* @lucene.internal */ public final class IOUtils { - + + /** + * UTF-8 charset string + * @see Charset#forName(String) + */ + public static final String UTF_8 = "UTF-8"; + + /** + * UTF-8 {@link Charset} instance to prevent repeated + * {@link Charset#forName(String)} lookups + */ + public static final Charset CHARSET_UTF_8 = Charset.forName("UTF-8"); private IOUtils() {} // no instance /** @@ -220,5 +240,84 @@ public final class IOUtils { } } } + + /** + * Wrapping the given {@link InputStream} in a reader using a {@link CharsetDecoder}. + * Unlike Java's defaults this reader will throw an exception if your it detects + * the read charset doesn't match the expected {@link Charset}. + *
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * to detect character set problems. However, its not recommended to use as a common purpose + * reader. + * + * @param stream the stream to wrap in a reader + * @param charSet the expected charset + * @return a wrapping reader + */ + public static Reader getDecodingReader(InputStream stream, Charset charSet) { + final CharsetDecoder charSetDecoder = charSet.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + return new BufferedReader(new InputStreamReader(stream, charSetDecoder)); + } + + /** + * Opens a Reader for the given {@link File} using a {@link CharsetDecoder}. + * Unlike Java's defaults this reader will throw an exception if your it detects + * the read charset doesn't match the expected {@link Charset}. + *
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * to detect character set problems. However, its not recommended to use as a common purpose + * reader. + * @param file the file to open a reader on + * @param charSet the expected charset + * @return a reader to read the given file + */ + public static Reader getDecodingReader(File file, Charset charSet) throws IOException { + FileInputStream stream = null; + boolean success = false; + try { + stream = new FileInputStream(file); + final Reader reader = getDecodingReader(stream, charSet); + success = true; + return reader; + + } finally { + if (!success) { + IOUtils.close(stream); + } + } + } + + /** + * Opens a Reader for the given resource using a {@link CharsetDecoder}. + * Unlike Java's defaults this reader will throw an exception if your it detects + * the read charset doesn't match the expected {@link Charset}. + *
+ * Decoding readers are useful for loading configuration files, stopword lists or synonym files
+ * so that character set problems are detected. However, they are not recommended for use as
+ * general-purpose readers.
+ * @param clazz the class used to locate the resource
+ * @param resource the resource name to load
+ * @param charSet the expected charset
+ * @return a reader to read the given resource
+ *
+ */
+ public static Reader getDecodingReader(Class<?> clazz, String resource, Charset charSet) throws IOException {
+   final InputStream stream = clazz.getResourceAsStream(resource);
+   if (stream == null) { // getResourceAsStream reports a missing resource as null, not by throwing
+     throw new IOException("Resource not found: " + resource);
+   }
+   Reader reader = null;
+   try {
+     reader = getDecodingReader(stream, charSet);
+     return reader;
+   } finally {
+     if (reader == null) { // wrapping failed, so no reader owns the stream yet: release it here
+       IOUtils.close(stream);
+     }
+   }
+ }
+
}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index 2ba5315..23ed34b 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -64,9 +65,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet(
- Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class,
- DEFAULT_STOPWORD_FILE, "#"), false));
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
index 75fb8c4..f83f3a7 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
@@ -57,8 +57,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
"they", "this", "to", "was", "will", "with"
);
final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
- stopWords.size(), false);
- stopSet.addAll(stopWords);
+ stopWords, false);
ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
}
@@ -82,7 +81,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion See above
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
+ this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -90,7 +89,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion See above
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 0df03a1..ba845ff 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.*;
@@ -70,9 +71,8 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
- Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class,
- DEFAULT_STOPWORD_FILE, "#"), false));
+ DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
index 65505dc..c94676a 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.DanishStemmer;
@@ -62,8 +63,8 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 2c69900..9abde8c 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.German2Stemmer;
@@ -100,8 +101,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
private static final Set> DEFAULT_SET;
static {
try {
- DEFAULT_SET =
- WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
index 025415d..7be2b70 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SpanishStemmer;
@@ -62,8 +63,8 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
index 85a0e59..caf5927 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.FinnishStemmer;
@@ -62,8 +63,8 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 087f6a1..8d0c4a1 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.IOException;
@@ -118,8 +119,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
static final Set> DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET =
- WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
index 60dc7c3..7ce43f1 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -60,12 +61,12 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
- throw new RuntimeException("Unable to load default stopword set");
+ throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
index be3a879..a927009 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.HungarianStemmer;
@@ -62,8 +63,8 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
index 22790bb..4e90116 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
@@ -35,6 +35,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.ItalianStemmer;
@@ -79,8 +80,8 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
index d0ff1e1..370e706 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
@@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -60,8 +62,8 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index 3931fa1..312242f 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@@ -83,8 +84,8 @@ public final class DutchAnalyzer extends Analyzer {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
index ecb66f6..00403f1 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.NorwegianStemmer;
@@ -62,8 +63,8 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
index 3d28933..853f423 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.PortugueseStemmer;
@@ -62,8 +63,8 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
index 6ddf665..247bdf6 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -84,12 +85,12 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
static {
try {
- DEFAULT_STOP_SET =
- WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
- throw new RuntimeException("Unable to load default stopword set");
+ throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
index 9c9821d..dc3f0a6 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@@ -85,7 +86,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
* above}
* @param stopwords File to read stop words from */
public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -94,7 +95,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
* above}
* @param stopwords Reader to read stop words from */
public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index cf0011d..96b7e8c 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@@ -86,7 +87,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
* above}
* @param stopwords File to read stop words from */
public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -95,7 +96,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
* above}
* @param stopwords Reader to read stop words from */
public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
index 7dd1702..b1f9442 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SwedishStemmer;
@@ -62,8 +63,8 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
index c99dc54..ba85a49 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
@@ -17,10 +17,13 @@
package org.apache.lucene.analysis.util;
+import java.io.File;
import java.io.IOException;
+import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -93,11 +96,59 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
final Class extends Analyzer> aClass, final String resource,
final String comment) throws IOException {
- final Set
+ * The snowball format is the following: + *