Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/StopawareAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/StopawareAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/StopawareAnalyzer.java (revision 0)
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
+
+/**
+ * Abstract base class for analyzers using stopwords in their token stream.
+ *
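+ * <p>
+ * A subclass sketch (names are hypothetical, not part of this patch): pass the
+ * stopword set to the super constructor and feed the protected
+ * {@link #stopwords} field to a StopFilter inside createComponents:
+ * <pre>
+ *   public final class MyStopAnalyzer extends StopawareAnalyzer {
+ *     public MyStopAnalyzer(Version version, Set&lt;?&gt; stopwords) {
+ *       super(version, stopwords);
+ *     }
+ *     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ *       Tokenizer source = new LowerCaseTokenizer(reader);
+ *       return new TokenStreamComponents(source, new StopFilter(
+ *           StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
+ *           source, stopwords));
+ *     }
+ *   }
+ * </pre>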
+ */
+public abstract class StopawareAnalyzer extends AbstractAnalyzer {
+
+ /**
+ * An immutable stopword set
+ */
+ protected final CharArraySet stopwords;
+
+ protected final Version matchVersion;
+
+ /**
+ * Returns the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ *
+ * @return the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ */
+ public Set<?> getStopwordSet() {
+ return stopwords;
+ }
+
+ /**
+ * Creates a new instance initialized with the given stopword set
+ *
+ * @param version
+ * the Lucene version for cross version compatibility
+ * @param stopwords
+ * the analyzer's stopword set
+ */
+ protected StopawareAnalyzer(final Version version, final Set<?> stopwords) {
+ /*
+ * no need to call
+ * setOverridesTokenStreamMethod(StopawareAnalyzer.class); here, both
+ * tokenStream methods are final in this class.
+ */
+ matchVersion = version;
+ // analyzers should use char array set for stopwords!
+ this.stopwords = CharArraySet
+ .unmodifiableSet(stopwords == null ? CharArraySet.EMPTY_SET
+ : stopwords instanceof CharArraySet ? (CharArraySet) stopwords
+ : new CharArraySet(stopwords, true));
+ }
+
+ /**
+ * Creates a new Analyzer with an empty stopword set
+ *
+ * @param version
+ * the Lucene version for cross version compatibility
+ */
+ protected StopawareAnalyzer(final Version version) {
+ this(version, null);
+ }
+
+ /**
+ * Creates a CharArraySet from a file resource associated with a class. (See
+ * {@link Class#getResourceAsStream(String)}).
+ *
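+ * <p>
+ * Typical usage from a static initializer, mirroring the DefaultSetHolder
+ * pattern used by the analyzers in this patch (MyAnalyzer and the resource
+ * name are placeholders; the checked IOException must be handled):
+ * <pre>
+ *   static final CharArraySet DEFAULT_STOP_SET;
+ *   static {
+ *     try {
+ *       DEFAULT_STOP_SET = loadStopwordSet(true, MyAnalyzer.class, "stopwords.txt", "#");
+ *     } catch (IOException ex) {
+ *       throw new RuntimeException("Unable to load default stopword set");
+ *     }
+ *   }
+ * </pre>
+ *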
+ * @param ignoreCase
+ * true if the set should ignore the case of the
+ * stopwords, otherwise false
+ * @param aClass
+ * a class that is associated with the given resource
+ * @param resource
+ * name of the resource file associated with the given class
+ * @param comment
+ * comment string to ignore in the stopword file
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
+ final Class<? extends AbstractAnalyzer> aClass, final String resource,
+ final String comment) throws IOException {
+ final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
+ comment);
+ final CharArraySet set = new CharArraySet(wordSet.size(), ignoreCase);
+ set.addAll(wordSet);
+ return set;
+ }
+
+}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (revision 834008)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (working copy)
@@ -19,18 +19,16 @@
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Collections;
-import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AbstractAnalyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopawareAnalyzer;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.Version;
@@ -52,7 +50,7 @@
*
*
*/
-public final class ArabicAnalyzer extends Analyzer {
+public final class ArabicAnalyzer extends StopawareAnalyzer {
/**
* File containing default Arabic stopwords.
@@ -63,10 +61,6 @@
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stoptable;
- /**
* The comment character in the stopwords file. All lines prefixed with this will be ignored
*/
public static final String STOPWORDS_COMMENT = "#";
@@ -75,7 +69,7 @@
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
- public static Set getDefaultStopSet(){
+ public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@@ -84,11 +78,11 @@
* accesses the static final set the first time.;
*/
private static class DefaultSetHolder {
- static final Set DEFAULT_STOP_SET;
+ static final Set<?> DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
+ DEFAULT_STOP_SET = loadStopwordSet(true, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
@@ -96,106 +90,56 @@
}
}
- static Set loadDefaultStopWordSet() throws IOException {
- InputStream stream = ArabicAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- stream.close();
- }
- }
+
}
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public ArabicAnalyzer(Version matchVersion) {
- this.matchVersion = matchVersion;
- stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
+ super(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
+
+ /**
+ * Builds an analyzer with the given stop words.
+ */
+ public ArabicAnalyzer( Version matchVersion, Set<?> stopwords) {
+ super(matchVersion, stopwords);
+ }
/**
* Builds an analyzer with the given stop words.
+ * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
*/
public ArabicAnalyzer( Version matchVersion, String... stopwords ) {
- stoptable = StopFilter.makeStopSet( stopwords );
- this.matchVersion = matchVersion;
+ super(matchVersion, StopFilter.makeStopSet( stopwords ));
}
/**
* Builds an analyzer with the given stop words.
+ * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
*/
public ArabicAnalyzer( Version matchVersion, Hashtable<?,?> stopwords ) {
- stoptable = new HashSet(stopwords.keySet());
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
+ * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
*/
public ArabicAnalyzer( Version matchVersion, File stopwords ) throws IOException {
- stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
- this.matchVersion = matchVersion;
+ super(matchVersion, WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT));
}
-
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
- * and {@link ArabicStemFilter}.
- */
+
@Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ArabicLetterTokenizer( reader );
- result = new LowerCaseFilter(result);
- // the order here is important: the stopword list is not normalized!
- result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
- result, stoptable );
- result = new ArabicNormalizationFilter( result );
- result = new ArabicStemFilter( result );
-
- return result;
+ protected TokenStreamComponents createComponents(String fieldName, Reader aReader) {
+ Tokenizer source = new ArabicLetterTokenizer(aReader);
+ TokenFilter result = new LowerCaseFilter(source);
+ // the order here is important: the stopword list is not normalized!
+ result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
+ result, this.stopwords);
+ result = new ArabicNormalizationFilter(result);
+ result = new ArabicStemFilter(result);
+ return new TokenStreamComponents(source, result);
}
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
- * and {@link ArabicStemFilter}.
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(streams.source);
- // the order here is important: the stopword list is not normalized!
- streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
- streams.result, stoptable);
- streams.result = new ArabicNormalizationFilter(streams.result);
- streams.result = new ArabicStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
- }
}
-
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (revision 834008)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (working copy)
@@ -20,15 +20,18 @@
import java.io.File;
import java.io.IOException;
import java.io.Reader;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopawareAnalyzer;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
@@ -47,11 +50,14 @@
*
NOTE: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.
*/
-public final class BrazilianAnalyzer extends Analyzer {
+public final class BrazilianAnalyzer extends StopawareAnalyzer {
/**
* List of typical Brazilian Portuguese stopwords.
+ * @deprecated use {@link #getDefaultStopSet()} instead
*/
+ // TODO make this private instead of removing
+ // it once the deprecations are ready to go away
public final static String[] BRAZILIAN_STOP_WORDS = {
"a","ainda","alem","ambas","ambos","antes",
"ao","aonde","aos","apos","aquele","aqueles",
@@ -73,120 +79,113 @@
"suas","tal","tambem","teu","teus","toda","todas","todo",
"todos","tua","tuas","tudo","um","uma","umas","uns"};
+ private static final Set<?> DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(
+ new CharArraySet(Arrays.asList(BRAZILIAN_STOP_WORDS), true));
/**
- * Contains the stopwords used with the {@link StopFilter}.
- */
- private Set stoptable = Collections.emptySet();
-
+ * Returns an unmodifiable instance of the default stop-words set.
+ * @return an unmodifiable instance of the default stop-words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DEFAULT_STOP_SET;
+ }
/**
* Contains words that should be indexed but not stemmed.
*/
- private Set excltable = Collections.emptySet();
- private final Version matchVersion;
+ // TODO make this final once the deprecations are removed
+ private Set<?> stemExclusion = Collections.emptySet();
/**
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
*/
public BrazilianAnalyzer(Version matchVersion) {
- stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
- this.matchVersion = matchVersion;
+ super(matchVersion, DEFAULT_STOP_SET);
}
+
+ /**
+ * Builds an analyzer with the given stop words.
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords){
+ this(matchVersion, stopwords, Collections.emptySet());
+ }
+
+ /**
+ * Builds an analyzer with the given stop words and stemming exclusion words
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a stemming exclusion set
+ */
+ public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
+ super(matchVersion, stopwords);
+ this.stemExclusion = CharArraySet.unmodifiableSet(new CharArraySet(stemExclusionSet, true));
+ }
+
/**
* Builds an analyzer with the given stop words.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
*/
- public BrazilianAnalyzer( Version matchVersion, String... stopwords ) {
- stoptable = StopFilter.makeStopSet( stopwords );
- this.matchVersion = matchVersion;
+ public BrazilianAnalyzer( Version matchVersion, String... stopwords ) {
+ this(matchVersion, StopFilter.makeStopSet( stopwords ));
}
/**
* Builds an analyzer with the given stop words.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
*/
- public BrazilianAnalyzer( Version matchVersion, Map stopwords ) {
- stoptable = new HashSet(stopwords.keySet());
- this.matchVersion = matchVersion;
+ public BrazilianAnalyzer( Version matchVersion, Map<?,?> stopwords ) {
+ this(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
*/
- public BrazilianAnalyzer( Version matchVersion, File stopwords ) throws IOException {
- stoptable = WordlistLoader.getWordSet( stopwords );
- this.matchVersion = matchVersion;
+ public BrazilianAnalyzer( Version matchVersion, File stopwords ) throws IOException {
+ this(matchVersion, WordlistLoader.getWordSet( stopwords ));
}
/**
* Builds an exclusionlist from an array of Strings.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable( String... exclusionlist ) {
- excltable = StopFilter.makeStopSet( exclusionlist );
+ stemExclusion = Collections.unmodifiableSet(
+ StopFilter.makeStopSet( exclusionlist ));
setPreviousTokenStream(null); // force a new stemmer to be created
}
+
/**
* Builds an exclusionlist from a {@link Map}.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
- public void setStemExclusionTable( Map exclusionlist ) {
- excltable = new HashSet(exclusionlist.keySet());
+ public void setStemExclusionTable( Map<?,?> exclusionlist ) {
+ stemExclusion = Collections.unmodifiableSet(
+ new HashSet<Object>(exclusionlist.keySet()));
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 834008)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (working copy)
 */
-public final class PersianAnalyzer extends Analyzer {
+public final class PersianAnalyzer extends StopawareAnalyzer {
/**
* File containing default Persian stopwords.
@@ -58,11 +57,6 @@
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set stoptable;
-
- /**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
*/
@@ -72,7 +66,7 @@
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
- public static Set getDefaultStopSet(){
+ public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@@ -81,11 +75,11 @@
* accesses the static final set the first time.;
*/
private static class DefaultSetHolder {
- static final Set DEFAULT_STOP_SET;
+ static final CharArraySet DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
+ DEFAULT_STOP_SET = loadStopwordSet(true, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
@@ -93,69 +87,55 @@
}
}
- static Set loadDefaultStopWordSet() throws IOException {
- InputStream stream = PersianAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- stream.close();
- }
- }
}
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
*/
public PersianAnalyzer(Version matchVersion) {
- stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
- this.matchVersion = matchVersion;
+ super(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
+
+ /**
+ * Builds an analyzer with the given stop words.
+ */
+ public PersianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ super(matchVersion, stopwords);
+ }
/**
* Builds an analyzer with the given stop words.
+ * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
*/
public PersianAnalyzer(Version matchVersion, String[] stopwords) {
- stoptable = StopFilter.makeStopSet(stopwords);
- this.matchVersion = matchVersion;
+ super(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words.
+ * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
*/
- public PersianAnalyzer(Version matchVersion, Hashtable stopwords) {
- stoptable = new HashSet(stopwords.keySet());
- this.matchVersion = matchVersion;
+ public PersianAnalyzer(Version matchVersion, Hashtable<?,?> stopwords) {
+ super(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words. Lines can be commented out
* using {@link #STOPWORDS_COMMENT}
+ * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
*/
public PersianAnalyzer(Version matchVersion, File stopwords) throws IOException {
- stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
- this.matchVersion = matchVersion;
+ super(matchVersion, WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT));
}
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
- * {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
- * filtered with {@link LowerCaseFilter},
- * {@link ArabicNormalizationFilter},
- * {@link PersianNormalizationFilter} and Persian Stop words
- */
+
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ArabicLetterTokenizer(reader);
- result = new LowerCaseFilter(result);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader aReader) {
+ Tokenizer tokenizer = new ArabicLetterTokenizer(aReader);
+ TokenFilter result = new LowerCaseFilter(tokenizer);
+
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
@@ -163,46 +143,10 @@
* the order here is important: the stopword list is normalized with the
* above!
*/
- result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
- result, stoptable);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- }
+ result = new StopFilter(StopFilter
+ .getEnablePositionIncrementsVersionDefault(matchVersion), result,
+ this.stopwords);
+ return new TokenStreamComponents(tokenizer, result);
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
- * filtered with {@link LowerCaseFilter},
- * {@link ArabicNormalizationFilter},
- * {@link PersianNormalizationFilter} and Persian Stop words
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(streams.source);
- streams.result = new ArabicNormalizationFilter(streams.result);
- /* additional persian-specific normalization */
- streams.result = new PersianNormalizationFilter(streams.result);
- /*
- * the order here is important: the stopword list is normalized with the
- * above!
- */
- streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
- streams.result, stoptable);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
}
}
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (revision 832889)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (working copy)
@@ -17,10 +17,10 @@
* limitations under the License.
*/
-import java.io.StringReader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
@@ -78,7 +78,9 @@
* Test that custom stopwords work, and are not case-sensitive.
*/
public void testCustomStopwords() throws Exception {
- ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
+ Set<String> set = new HashSet<String>();
+ Collections.addAll(set, "the", "and", "a");
+ ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" });
}
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (revision 832889)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (working copy)
@@ -17,10 +17,12 @@
* limitations under the License.
*/
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
@@ -132,9 +134,12 @@
}
public void testStemExclusionTable() throws Exception {
- BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
- a.setStemExclusionTable(new String[] { "quintessência" });
- checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
+ BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT, Collections.emptySet(),
+ new HashSet<String>(Arrays.asList(new String[] { "quintessência" })));
+ checkOneTerm(a, "bobalhões", "bobalho");
+ checkOneTerm(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
+ a.setStemExclusionTable(new String[] { "bobalhões" });
+ checkOneTerm(a, "bobalhões", "bobalhões"); // excluded words will be completely unchanged.
}
/*
@@ -142,10 +147,12 @@
* when using reusable token streams.
*/
public void testExclusionTableReuse() throws Exception {
- BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
- checkReuse(a, "quintessência", "quintessente");
- a.setStemExclusionTable(new String[] { "quintessência" });
+ BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT, Collections.emptySet(),
+ new HashSet<String>(Arrays.asList(new String[] { "quintessência" })));
+ checkReuse(a, "bobalhões", "bobalho");
checkReuse(a, "quintessência", "quintessência");
+ a.setStemExclusionTable(new String[] { "bobalhões" });
+ checkOneTerm(a, "bobalhões", "bobalhões"); // excluded words will be completely unchanged.
}
private void check(final String input, final String expected) throws Exception {
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (revision 832889)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (working copy)
@@ -25,9 +25,8 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -76,9 +75,13 @@
super(matchVersion);
}
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new WhitespaceTokenizer(reader);
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader aReader) {
+ Tokenizer tokenizer = new WhitespaceTokenizer(aReader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
}
+
}
public void testLUCENE1678BWComp() throws Exception {
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (revision 832889)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (working copy)
@@ -17,11 +17,8 @@
* limitations under the License.
*/
-import java.io.StringReader;
-
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
Index: src/java/org/apache/lucene/analysis/AbstractAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/AbstractAnalyzer.java (revision 0)
+++ src/java/org/apache/lucene/analysis/AbstractAnalyzer.java (revision 0)
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A convenience subclass of Analyzer to unify the creation of
+ * {@link TokenStream} instances. For historical reasons {@link Analyzer}
+ * provides different ways to create {@link TokenStream} instances in its
+ * subclasses. Derivative analyzers commonly implement
+ * {@link #reusableTokenStream(String, Reader)} as well as
+ * {@link #tokenStream(String, Reader)}, which immediately causes code
+ * duplication.
+ * <p>
+ * When overriding {@link #reusableTokenStream(String, Reader)}, additional
+ * logic to reset an already instantiated {@link TokenStream} is required and
+ * often reinvented in {@link Analyzer} subclasses. This class unifies the
+ * creation of {@link TokenStream} instances and automatically reuses token
+ * stream instances if they can be reset.
+ * <p>
+ * To prevent any possible issues with the new TokenStream API this class does
+ * not allow subclasses to override {@link #reusableTokenStream(String, Reader)}
+ * and {@link #tokenStream(String, Reader)}. Instead it requires the abstract
+ * method {@link #createComponents(String, Reader)} to be implemented. The
+ * returned {@link TokenStreamComponents} wrapper encapsulates all information
+ * needed to generalize the abstract {@link Analyzer} methods.
+ *
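+ * <p>
+ * A minimal implementation sketch (WhitespaceTokenizer and LowerCaseFilter
+ * merely serve as examples here):
+ * <pre>
+ *   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ *     Tokenizer source = new WhitespaceTokenizer(reader);
+ *     return new TokenStreamComponents(source, new LowerCaseFilter(source));
+ *   }
+ * </pre>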
+ *
+ */
+public abstract class AbstractAnalyzer extends Analyzer {
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance for this analyzer.
+ *
+ * @param fieldName
+ * the name of the fields content passed to the
+ * {@link TokenStreamComponents} sink as a reader
+ * @param aReader
+ * the reader passed to the {@link Tokenizer} constructor
+ * @return the {@link TokenStreamComponents} for this analyzer.
+ */
+ protected abstract TokenStreamComponents createComponents(String fieldName,
+ Reader aReader);
+
+ /**
+ * This method uses {@link #createComponents(String, Reader)} to obtain an
+ * instance of {@link TokenStreamComponents} and returns the sink of the
+ * components and stores the components internally. Subsequent calls to this
+ * method will reuse the previously stored components if and only if the
+ * {@link TokenStreamComponents#reset(Reader)} method returned
+ * true. Otherwise a new instance of
+ * {@link TokenStreamComponents} is created.
+ */
+ @Override
+ public final TokenStream reusableTokenStream(final String fieldName,
+ final Reader reader) throws IOException {
+ TokenStreamComponents streamChain = (TokenStreamComponents) getPreviousTokenStream();
+ if (streamChain == null || !streamChain.reset(reader)) {
+ streamChain = createComponents(fieldName, reader);
+ setPreviousTokenStream(streamChain);
+ }
+ return streamChain.getTokenStream();
+ }
+
+ @Override
+ public final TokenStream tokenStream(final String fieldName,
+ final Reader reader) {
+ return createComponents(fieldName, reader).getTokenStream();
+ }
+
+ /**
+ * This class encapsulates the outer components of a token stream. It provides
+ * access to the source ({@link Tokenizer}) and the outer end (sink), the
+ * {@link TokenStream} returned by
+ * {@link Analyzer#tokenStream(String, Reader)} and
+ * {@link Analyzer#reusableTokenStream(String, Reader)}.
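+ * <p>
+ * For example, a tokenizer plus filter chain is wrapped as follows (a sketch;
+ * stopwords stands in for any stopword set):
+ * <pre>
+ *   Tokenizer source = new LowerCaseTokenizer(reader);
+ *   TokenStream sink = new StopFilter(true, source, stopwords);
+ *   TokenStreamComponents components = new TokenStreamComponents(source, sink);
+ * </pre>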
+ */
+ public static class TokenStreamComponents {
+ final Tokenizer source;
+ final TokenStream sink;
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance.
+ *
+ * @param source
+ * the analyzer's tokenizer
+ * @param result
+ * the analyzer's resulting token stream
+ */
+ public TokenStreamComponents(final Tokenizer source,
+ final TokenStream result) {
+ this.source = source;
+ this.sink = result;
+ }
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance.
+ *
+ * @param source
+ * the analyzer's tokenizer
+ */
+ public TokenStreamComponents(final Tokenizer source) {
+ this.source = source;
+ this.sink = source;
+ }
+
+ /**
+ * Resets the encapsulated components with the given reader. This method by
+ * default returns true indicating that the components have
+ * been reset successfully. Subclasses of {@link AbstractAnalyzer} might use
+ * their own {@link TokenStreamComponents} returning false if
+ * the components can not be reset.
+ *
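+ * <p>
+ * A sketch of a non-reusable variant (hypothetical):
+ * <pre>
+ *   protected boolean reset(final Reader reader) {
+ *     return false; // forces reusableTokenStream to create fresh components
+ *   }
+ * </pre>
+ *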
+ * @param reader
+ * a reader to reset the source component
+ * @return true if the components were reset, otherwise
+ * false
+ * @throws IOException
+ * if the components reset method throws an {@link IOException}
+ */
+ protected boolean reset(final Reader reader) throws IOException {
+ source.reset(reader);
+ if(sink != source)
+ sink.reset(); // only reset if the sink reference is different from source
+ return true;
+ }
+
+ /**
+ * Returns the sink {@link TokenStream}
+ *
+ * @return the sink {@link TokenStream}
+ */
+ protected TokenStream getTokenStream() {
+ return sink;
+ }
+
+ }
+
+}
Index: src/java/org/apache/lucene/analysis/CharArraySet.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharArraySet.java (revision 834007)
+++ src/java/org/apache/lucene/analysis/CharArraySet.java (working copy)
@@ -43,6 +43,9 @@
*/
public class CharArraySet extends AbstractSet<Object> {
+
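+ /** An empty, unmodifiable {@link CharArraySet}, useful for analyzers without stopwords. */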
+ public static final CharArraySet EMPTY_SET = CharArraySet
+ .unmodifiableSet(new CharArraySet(Collections.emptySet(), false));
private final static int INIT_SIZE = 8;
private char[][] entries;
private int count;
Index: src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (revision 834007)
+++ src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (working copy)
@@ -18,25 +18,15 @@
*/
import java.io.Reader;
-import java.io.IOException;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter} */
-public final class SimpleAnalyzer extends Analyzer {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new LowerCaseTokenizer(reader);
- }
+public final class SimpleAnalyzer extends AbstractAnalyzer {
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
- if (tokenizer == null) {
- tokenizer = new LowerCaseTokenizer(reader);
- setPreviousTokenStream(tokenizer);
- } else
- tokenizer.reset(reader);
- return tokenizer;
+ protected TokenStreamComponents createComponents(final String fieldName,
+ final Reader reader) {
+ return new TokenStreamComponents(new LowerCaseTokenizer(reader));
}
}
Index: src/java/org/apache/lucene/analysis/StopAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/StopAnalyzer.java (revision 834007)
+++ src/java/org/apache/lucene/analysis/StopAnalyzer.java (working copy)
@@ -36,7 +36,7 @@
*
*/
-public final class StopAnalyzer extends Analyzer {
+public final class StopAnalyzer extends AbstractAnalyzer {
private final Set<?> stopWords;
private final boolean enablePositionIncrements;
@@ -92,28 +92,12 @@
this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
}
- /** Filters LowerCaseTokenizer with StopFilter. */
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
+ protected TokenStreamComponents createComponents(final String fieldName,
+ final Reader reader) {
+ final Tokenizer source = new LowerCaseTokenizer(reader);
+ return new TokenStreamComponents(source, new StopFilter(
+ enablePositionIncrements, source, stopWords));
}
-
- /** Filters LowerCaseTokenizer with StopFilter. */
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new LowerCaseTokenizer(reader);
- streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
- setPreviousTokenStream(streams);
- } else
- streams.source.reset(reader);
- return streams.result;
- }
}
Index: src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (revision 834007)
+++ src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (working copy)
@@ -18,24 +18,14 @@
*/
import java.io.Reader;
-import java.io.IOException;
/** An Analyzer that uses {@link WhitespaceTokenizer}. */
-public final class WhitespaceAnalyzer extends Analyzer {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new WhitespaceTokenizer(reader);
- }
+public final class WhitespaceAnalyzer extends AbstractAnalyzer {
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
- if (tokenizer == null) {
- tokenizer = new WhitespaceTokenizer(reader);
- setPreviousTokenStream(tokenizer);
- } else
- tokenizer.reset(reader);
- return tokenizer;
+ protected TokenStreamComponents createComponents(final String fieldName,
+ final Reader reader) {
+ return new TokenStreamComponents(new WhitespaceTokenizer(reader));
}
}
Index: src/java/org/apache/lucene/analysis/WordlistLoader.java
===================================================================
--- src/java/org/apache/lucene/analysis/WordlistLoader.java (revision 834007)
+++ src/java/org/apache/lucene/analysis/WordlistLoader.java (working copy)
@@ -21,16 +21,70 @@
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Set;
/**
* Loader for text files that represent a list of stopwords.
*/
public class WordlistLoader {
-
+
/**
+ * Loads a text file associated with a given class (See
+ * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
+ * to a {@link Set} (omitting leading and trailing whitespace). Every line of
+ * the file should contain only one word. The words need to be in lower-case if
+ * you make use of an Analyzer which uses LowerCaseFilter (like
+ * StandardAnalyzer).
+ *
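+ * <p>
+ * For example (MyAnalyzer is a placeholder; the resource is expected next to
+ * the class on the classpath):
+ * <pre>
+ *   Set&lt;String&gt; words = WordlistLoader.getWordSet(MyAnalyzer.class, "stopwords.txt");
+ * </pre>
+ *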
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param stopwordResource
+ * name of the resource file associated with the given class
+ * @return a {@link Set} with the file's words
+ */
+ public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
+ throws IOException {
+ final Reader reader = new BufferedReader(new InputStreamReader(aClass
+ .getResourceAsStream(stopwordResource), "UTF-8"));
+ try {
+ return getWordSet(reader);
+ } finally {
+ reader.close();
+ }
+ }
+
+ /**
+ * Loads a text file associated with a given class (See
+ * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
+ * to a {@link Set} (omitting leading and trailing whitespace). Every line of
+ * the file should contain only one word. The words need to be in lower-case if
+ * you make use of an Analyzer which uses LowerCaseFilter (like
+ * StandardAnalyzer).
+ *
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param stopwordResource
+ * name of the resource file associated with the given class
+ * @param comment
+ * the comment string to ignore
+ * @return a {@link Set} with the file's words
+ */
+ public static Set<String> getWordSet(Class<?> aClass,
+ String stopwordResource, String comment) throws IOException {
+ final Reader reader = new BufferedReader(new InputStreamReader(aClass
+ .getResourceAsStream(stopwordResource), "UTF-8"));
+ try {
+ return getWordSet(reader, comment);
+ } finally {
+ reader.close();
+ }
+ }
+
+ /**
* Loads a text file and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the file should contain only
* one word. The words need to be in lowercase if you make use of an
@@ -40,17 +94,15 @@
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile) throws IOException {
- HashSet result = new HashSet();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
- result = getWordSet(reader);
+ return getWordSet(reader);
}
finally {
if (reader != null)
reader.close();
}
- return result;
}
/**
@@ -64,17 +116,15 @@
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
- HashSet result = new HashSet();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
- result = getWordSet(reader, comment);
+ return getWordSet(reader, comment);
}
finally {
if (reader != null)
reader.close();
}
- return result;
}
@@ -88,7 +138,7 @@
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader) throws IOException {
- HashSet<String> result = new HashSet<String>();
+ final HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
@@ -119,7 +169,7 @@
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
- HashSet<String> result = new HashSet<String>();
+ final HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
@@ -154,21 +204,18 @@
public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
if (wordstemfile == null)
throw new NullPointerException("wordstemfile may not be null");
- HashMap<String, String> result = new HashMap<String, String>();
+ final HashMap<String, String> result = new HashMap<String, String>();
BufferedReader br = null;
- FileReader fr = null;
+
try {
- fr = new FileReader(wordstemfile);
- br = new BufferedReader(fr);
+ br = new BufferedReader(new FileReader(wordstemfile));
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
- if (fr != null)
- fr.close();
- if (br != null)
+ if(br != null)
br.close();
}
return result;
Index: src/test/org/apache/lucene/index/TestWordlistLoader.java
===================================================================
--- src/test/org/apache/lucene/index/TestWordlistLoader.java (revision 834007)
+++ src/test/org/apache/lucene/index/TestWordlistLoader.java (working copy)
@@ -18,34 +18,56 @@
*/
import java.io.BufferedReader;
+import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
+import java.util.Set;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.WordlistLoader;
public class TestWordlistLoader extends LuceneTestCase {
+
+ public void testWordlistLoadingByResource() throws IOException {
+
+ Set<String> wordSet1 = WordlistLoader.getWordSet(
+ TestWordlistLoader.class, "wordliststopwords.txt", "#");
+ checkSet(wordSet1);
+ Set<String> wordSet2 = WordlistLoader.getWordSet(
+ TestWordlistLoader.class, "wordliststopwords_nocomment.txt");
+ checkSet(wordSet2);
+ }
+
+ public void testWordlistFileLoading() throws IOException {
+
+ Set<String> wordSet1 = WordlistLoader.getWordSet(new File(
+ this.getClass().getResource("wordliststopwords_nocomment.txt")
+ .getFile()));
+ checkSet(wordSet1);
+ }
public void testWordlistLoading() throws IOException {
String s = "ONE\n two \nthree";
- HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
+ HashSet<String> wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
checkSet(wordSet1);
- HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
+ HashSet<String> wordSet2 = WordlistLoader.getWordSet(
+ new BufferedReader(new StringReader(s)));
checkSet(wordSet2);
}
public void testComments() throws Exception {
String s = "ONE\n two \nthree\n#comment";
- HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
+ HashSet<String> wordSet1 = WordlistLoader.getWordSet(
+ new StringReader(s), "#");
checkSet(wordSet1);
assertFalse(wordSet1.contains("#comment"));
assertFalse(wordSet1.contains("comment"));
}
- private void checkSet(HashSet wordset) {
+ private void checkSet(Set<String> wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified
assertTrue(wordset.contains("two")); // surrounding whitespace is removed
Index: src/test/org/apache/lucene/index/wordliststopwords.txt
===================================================================
--- src/test/org/apache/lucene/index/wordliststopwords.txt (revision 0)
+++ src/test/org/apache/lucene/index/wordliststopwords.txt (revision 0)
@@ -0,0 +1,5 @@
+#comment
+ONE
+two
+#comment
+three
Index: src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt
===================================================================
--- src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt (revision 0)
+++ src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt (revision 0)
@@ -0,0 +1,3 @@
+ONE
+two
+three