Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/StopawareAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/StopawareAnalyzer.java	(revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/StopawareAnalyzer.java	(revision 0)
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
+
+/**
+ * Abstract base class for analyzers that use stopwords in their token stream.
+ */
+public abstract class StopawareAnalyzer extends AbstractAnalyzer {
+
+  /**
+   * An immutable stopword set
+   */
+  protected final CharArraySet stopwords;
+
+  protected final Version matchVersion;
+
+  /**
+   * Returns the analyzer's stopword set, or an empty set if the analyzer has
+   * no stopwords.
+   *
+   * @return the analyzer's stopword set, or an empty set if the analyzer has
+   *         no stopwords
+   */
+  public Set getStopwordSet() {
+    return stopwords;
+  }
+
+  /**
+   * Creates a new instance initialized with the given stopword set.
+   *
+   * @param version
+   *          the Lucene version for cross version compatibility
+   * @param stopwords
+   *          the analyzer's stopword set
+   */
+  protected StopawareAnalyzer(final Version version, final Set stopwords) {
+    /*
+     * no need to call setOverridesTokenStreamMethod(StopawareAnalyzer.class)
+     * here, both tokenStream methods are final in this class.
+     */
+    matchVersion = version;
+    // analyzers should use a char array set for stopwords!
+    this.stopwords = CharArraySet
+        .unmodifiableSet(stopwords == null ? CharArraySet.EMPTY_SET
+            : stopwords instanceof CharArraySet ? (CharArraySet) stopwords
+                : new CharArraySet(stopwords, true));
+  }
+
+  /**
+   * Creates a new Analyzer with an empty stopword set.
+   *
+   * @param version
+   *          the Lucene version for cross version compatibility
+   */
+  protected StopawareAnalyzer(final Version version) {
+    this(version, null);
+  }
+
+  /**
+   * Creates a CharArraySet from a file resource associated with a class. (See
+   * {@link Class#getResourceAsStream(String)}).
+ * + * @param ignoreCase + * true if the set should ignore the case of the + * stopwords, otherwise false + * @param aClass + * a class that is associated with the given stopwordResource + * @param resource + * name of the resource file associated with the given class + * @param comment + * comment string to ignore in the stopword file + * @return a CharArraySet containing the distinct stopwords from the given + * file + * @throws IOException + * if loading the stopwords throws an {@link IOException} + */ + protected static CharArraySet loadStopwordSet(final boolean ignoreCase, + final Class aClass, final String resource, + final String comment) throws IOException { + final Set wordSet = WordlistLoader.getWordSet(aClass, resource, + comment); + final CharArraySet set = new CharArraySet(wordSet.size(), ignoreCase); + set.addAll(wordSet); + return set; + } + +} Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (revision 834008) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (working copy) @@ -19,18 +19,16 @@ import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.Reader; -import java.util.Collections; -import java.util.HashSet; import java.util.Hashtable; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AbstractAnalyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.StopawareAnalyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.util.Version; @@ -52,7 +50,7 @@ * * */ -public final class ArabicAnalyzer extends Analyzer { +public final class ArabicAnalyzer extends StopawareAnalyzer { /** * File containing default Arabic stopwords. @@ -63,10 +61,6 @@ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; /** - * Contains the stopwords used with the StopFilter. - */ - private final Set stoptable; - /** * The comment character in the stopwords file. All lines prefixed with this will be ignored */ public static final String STOPWORDS_COMMENT = "#"; @@ -75,7 +69,7 @@ * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. 
*/ - public static Set getDefaultStopSet(){ + public static Set getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -84,11 +78,11 @@ * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final Set DEFAULT_STOP_SET; static { try { - DEFAULT_STOP_SET = loadDefaultStopWordSet(); + DEFAULT_STOP_SET = loadStopwordSet(true, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) @@ -96,106 +90,56 @@ } } - static Set loadDefaultStopWordSet() throws IOException { - InputStream stream = ArabicAnalyzer.class - .getResourceAsStream(DEFAULT_STOPWORD_FILE); - try { - InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); - // make sure it is unmodifiable as we expose it in the outer class - return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, - STOPWORDS_COMMENT)); - } finally { - stream.close(); - } - } + } - private final Version matchVersion; - /** * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */ public ArabicAnalyzer(Version matchVersion) { - this.matchVersion = matchVersion; - stoptable = DefaultSetHolder.DEFAULT_STOP_SET; + super(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); } + + /** + * Builds an analyzer with the given stop words. + */ + public ArabicAnalyzer( Version matchVersion, Set stopwords) { + super(matchVersion, stopwords); + } /** * Builds an analyzer with the given stop words. + * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead */ public ArabicAnalyzer( Version matchVersion, String... stopwords ) { - stoptable = StopFilter.makeStopSet( stopwords ); - this.matchVersion = matchVersion; + super(matchVersion, StopFilter.makeStopSet( stopwords )); } /** * Builds an analyzer with the given stop words. + * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead */ public ArabicAnalyzer( Version matchVersion, Hashtable stopwords ) { - stoptable = new HashSet(stopwords.keySet()); - this.matchVersion = matchVersion; + super(matchVersion, stopwords.keySet()); } /** * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT} + * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead */ public ArabicAnalyzer( Version matchVersion, File stopwords ) throws IOException { - stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT); - this.matchVersion = matchVersion; + super(matchVersion, WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT)); } - - - /** - * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. - * - * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with - * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter} - * and {@link ArabicStemFilter}. - */ + @Override - public final TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new ArabicLetterTokenizer( reader ); - result = new LowerCaseFilter(result); - // the order here is important: the stopword list is not normalized! 
- result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - result, stoptable ); - result = new ArabicNormalizationFilter( result ); - result = new ArabicStemFilter( result ); - - return result; + protected TokenStreamComponents createComponents(String fieldName, Reader aReader) { + Tokenizer source = new ArabicLetterTokenizer(aReader); + TokenFilter result = new LowerCaseFilter(source); + // the order here is important: the stopword list is not normalized! + result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), + result, this.stopwords); + result = new ArabicNormalizationFilter(result); + result = new ArabicStemFilter(result); + return new TokenStreamComponents(source, result); } - - private class SavedStreams { - Tokenizer source; - TokenStream result; - }; - - /** - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text - * in the provided {@link Reader}. - * - * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with - * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter} - * and {@link ArabicStemFilter}. - */ - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new ArabicLetterTokenizer(reader); - streams.result = new LowerCaseFilter(streams.source); - // the order here is important: the stopword list is not normalized! - streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - streams.result, stoptable); - streams.result = new ArabicNormalizationFilter(streams.result); - streams.result = new ArabicStemFilter(streams.result); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - } - return streams.result; - } } - Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (revision 834008) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (working copy) @@ -20,15 +20,18 @@ import java.io.File; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.Collections; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.StopawareAnalyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc @@ -47,11 +50,14 @@ *
 * NOTE: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.
*/ -public final class BrazilianAnalyzer extends Analyzer { +public final class BrazilianAnalyzer extends StopawareAnalyzer { /** * List of typical Brazilian Portuguese stopwords. + * @deprecated use {@link #getDefaultStopSet()} instead */ + // TODO make this private instead of removing + // it once deprecation are ready to go away public final static String[] BRAZILIAN_STOP_WORDS = { "a","ainda","alem","ambas","ambos","antes", "ao","aonde","aos","apos","aquele","aqueles", @@ -73,120 +79,113 @@ "suas","tal","tambem","teu","teus","toda","todas","todo", "todos","tua","tuas","tudo","um","uma","umas","uns"}; + private static final Set DEFAULT_STOP_SET = CharArraySet.unmodifiableSet( + new CharArraySet(Arrays.asList(BRAZILIAN_STOP_WORDS), true)); /** - * Contains the stopwords used with the {@link StopFilter}. - */ - private Set stoptable = Collections.emptySet(); - + * Returns an unmodifiable instance of the default stop-words set. + * @return an unmodifiable instance of the default stop-words set. + */ + public static Set getDefaultStopSet(){ + return DEFAULT_STOP_SET; + } /** * Contains words that should be indexed but not stemmed. */ - private Set excltable = Collections.emptySet(); - private final Version matchVersion; + // TODO make this final once deprecation are removed + private Set stemmExlusion = Collections.emptySet(); /** * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}). */ public BrazilianAnalyzer(Version matchVersion) { - stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS ); - this.matchVersion = matchVersion; + super(matchVersion, DEFAULT_STOP_SET); } + + /** + * Builds an analyzer with the given stop words. + * @param matchVersion lucene compatibility version + * @param stopwords a stopword set + */ + public BrazilianAnalyzer(Version matchVersion, Set stopwords){ + this(matchVersion, stopwords, Collections.emptySet()); + } + + /** + * Builds an analyzer with the given stop words and stemming exclusion words + * @param matchVersion lucene compatibility version + * @param stopwords a stopword set + * @param stemExclutionSet a stemming exclusion set + */ + public BrazilianAnalyzer(Version matchVersion, Set stopwords, Set stemExclutionSet){ + super(matchVersion, stopwords); + this.stemmExlusion = CharArraySet.unmodifiableSet(new CharArraySet(stemExclutionSet, true)); + } + /** * Builds an analyzer with the given stop words. + * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead */ - public BrazilianAnalyzer( Version matchVersion, String... stopwords ) { - stoptable = StopFilter.makeStopSet( stopwords ); - this.matchVersion = matchVersion; + public BrazilianAnalyzer( Version matchVersion, String... stopwords ) { + this(matchVersion, StopFilter.makeStopSet( stopwords )); } /** * Builds an analyzer with the given stop words. + * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead */ - public BrazilianAnalyzer( Version matchVersion, Map stopwords ) { - stoptable = new HashSet(stopwords.keySet()); - this.matchVersion = matchVersion; + public BrazilianAnalyzer( Version matchVersion, Map stopwords ) { + this(matchVersion, stopwords.keySet()); } /** * Builds an analyzer with the given stop words. 
*/ - public BrazilianAnalyzer( Version matchVersion, File stopwords ) throws IOException { - stoptable = WordlistLoader.getWordSet( stopwords ); - this.matchVersion = matchVersion; + public BrazilianAnalyzer( Version matchVersion, File stopwords ) throws IOException { + this(matchVersion, WordlistLoader.getWordSet( stopwords )); } /** * Builds an exclusionlist from an array of Strings. + * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead */ public void setStemExclusionTable( String... exclusionlist ) { - excltable = StopFilter.makeStopSet( exclusionlist ); + stemmExlusion = Collections.unmodifiableSet( + StopFilter.makeStopSet( exclusionlist )); setPreviousTokenStream(null); // force a new stemmer to be created } + /** * Builds an exclusionlist from a {@link Map}. + * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead */ - public void setStemExclusionTable( Map exclusionlist ) { - excltable = new HashSet(exclusionlist.keySet()); + public void setStemExclusionTable( Map exclusionlist ) { + stemmExlusion = Collections.unmodifiableSet( + new HashSet(exclusionlist.keySet())); setPreviousTokenStream(null); // force a new stemmer to be created } /** * Builds an exclusionlist from the words contained in the given file. + * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead */ public void setStemExclusionTable( File exclusionlist ) throws IOException { - excltable = WordlistLoader.getWordSet( exclusionlist ); + stemmExlusion = Collections.unmodifiableSet( + WordlistLoader.getWordSet( exclusionlist )); setPreviousTokenStream(null); // force a new stemmer to be created } - /** - * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. - * - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with - * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and - * {@link BrazilianStemFilter}. - */ - @Override - public final TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new StandardTokenizer( matchVersion, reader ); - result = new LowerCaseFilter( result ); - result = new StandardFilter( result ); - result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - result, stoptable ); - result = new BrazilianStemFilter( result, excltable ); - return result; - } - - private class SavedStreams { - Tokenizer source; - TokenStream result; - }; - - /** - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text - * in the provided {@link Reader}. - * - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with - * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and - * {@link BrazilianStemFilter}. 
- */ - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new StandardTokenizer(matchVersion, reader); - streams.result = new LowerCaseFilter(streams.source); - streams.result = new StandardFilter(streams.result); - streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - streams.result, stoptable); - streams.result = new BrazilianStemFilter(streams.result, excltable); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - } - return streams.result; - } -} + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader aReader) { + final Tokenizer source = new StandardTokenizer(matchVersion, aReader); + TokenFilter result = new LowerCaseFilter(source); + result = new StandardFilter(result); + result = new StopFilter(StopFilter + .getEnablePositionIncrementsVersionDefault(matchVersion), result, + stopwords); + result = new BrazilianStemFilter(result, stemmExlusion); + return new TokenStreamComponents(source, result); + } +} Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java (revision 834008) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java (working copy) @@ -18,12 +18,13 @@ */ import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.StopawareAnalyzer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util.Version; -import java.io.IOException; import java.io.Reader; import java.util.Set; @@ -33,12 +34,12 @@ * filters with {@link StopFilter} * */ -public class CJKAnalyzer extends Analyzer { - //~ Static fields/initializers --------------------------------------------- +public class CJKAnalyzer extends StopawareAnalyzer { /** * An array containing some common English words that are not usually * useful for searching and some double-byte interpunctions. + * @deprecated Use {@link #getDefaultStopSet()} instead */ public final static String[] STOP_WORDS = { "a", "and", "are", "as", "at", "be", @@ -50,78 +51,57 @@ "to", "was", "will", "with", "", "www" }; - - //~ Instance fields -------------------------------------------------------- - + /** - * stop word list + * Returns an unmodifiable instance of the default stop-words set. + * @return an unmodifiable instance of the default stop-words set. */ - private final Set stopTable; - private final Version matchVersion; + public static Set getDefaultStopSet(){ + return DEFAULT_STOP_SET; + } + + private static final Set DEFAULT_STOP_SET; + + static { + Set englishStopWordsSet = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + CharArraySet set = new CharArraySet(englishStopWordsSet, true); + set.add("www"); + set.add("t"); + DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(set); + } - //~ Constructors ----------------------------------------------------------- - /** * Builds an analyzer which removes words in {@link #STOP_WORDS}. 
   */
  public CJKAnalyzer(Version matchVersion) {
-    stopTable = StopFilter.makeStopSet(STOP_WORDS);
-    this.matchVersion = matchVersion;
+    super(matchVersion, DEFAULT_STOP_SET);
  }
+
+  /**
+   * Builds an analyzer which removes words in the given set.
+   */
+  public CJKAnalyzer(Version matchVersion, Set stopwords) {
+    super(matchVersion, stopwords);
+  }
 
   /**
    * Builds an analyzer which removes words in the provided array.
    *
    * @param stopWords stop word array
+   * @deprecated Use {@link #CJKAnalyzer(Version, Set)}
    */
   public CJKAnalyzer(Version matchVersion, String... stopWords) {
-    stopTable = StopFilter.makeStopSet(stopWords);
-    this.matchVersion = matchVersion;
+    super(matchVersion, StopFilter.makeStopSet(stopWords));
   }
 
-  //~ Methods ----------------------------------------------------------------
-
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @param fieldName lucene field name
-   * @param reader input {@link Reader}
-   * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
-   *         {@link StopFilter}
-   */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                          new CJKTokenizer(reader), stopTable);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader aReader) {
+    final Tokenizer source = new CJKTokenizer(aReader);
+    return new TokenStreamComponents(source, new StopFilter(
+        StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
+        source, stopwords));
   }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
-   * in the provided {@link Reader}.
-   *
-   * @param fieldName lucene field name
-   * @param reader Input {@link Reader}
-   * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
-   *         {@link StopFilter}
-   */
-  @Override
-  public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    /* tokenStream() is final, no back compat issue */
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new CJKTokenizer(reader);
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.source, stopTable);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
-  }
 }
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java	(revision 834008)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java	(working copy)
@@ -17,10 +17,9 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.AbstractAnalyzer;
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
@@ -29,49 +28,12 @@
  *
  */
-public class ChineseAnalyzer extends Analyzer {
+public class ChineseAnalyzer extends AbstractAnalyzer {
 
-  public ChineseAnalyzer() {
-  }
-
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- * - * @return A {@link TokenStream} built from a {@link ChineseTokenizer} - * filtered with {@link ChineseFilter}. - */ @Override - public final TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new ChineseTokenizer(reader); - result = new ChineseFilter(result); - return result; + protected TokenStreamComponents createComponents(String fieldName, + Reader aReader) { + final Tokenizer source = new ChineseTokenizer(aReader); + return new TokenStreamComponents(source, new ChineseFilter(source)); } - - private class SavedStreams { - Tokenizer source; - TokenStream result; - }; - - /** - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the - * provided {@link Reader}. - * - * @return A {@link TokenStream} built from a {@link ChineseTokenizer} - * filtered with {@link ChineseFilter}. - */ - @Override - public final TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - /* tokenStream() is final, no back compat issue */ - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new ChineseTokenizer(reader); - streams.result = new ChineseFilter(streams.source); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - } - return streams.result; - } } \ No newline at end of file Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (revision 834008) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (working copy) @@ -21,14 +21,19 @@ import java.io.File; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AbstractAnalyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.StopawareAnalyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.standard.StandardFilter; @@ -49,11 +54,13 @@ *
 * NOTE: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.
*/ -public class GermanAnalyzer extends Analyzer { +public class GermanAnalyzer extends StopawareAnalyzer { /** * List of typical german stopwords. + * @deprecated use {@link #getDefaultStopSet()} instead */ + //TODO this should be private final static once the deprecation is removed public final static String[] GERMAN_STOP_WORDS = { "einer", "eine", "eines", "einem", "einen", "der", "die", "das", "dass", "daß", @@ -68,58 +75,79 @@ "mein", "sein", "kein", "durch", "wegen", "wird" }; + + + private static final Set DEFAULT_STOP_WORDS = CharArraySet.unmodifiableSet( + new CharArraySet(Arrays.asList(GERMAN_STOP_WORDS), true)); /** - * Contains the stopwords used with the {@link StopFilter}. + * Returns an unmodifiable instance of the default stop-words set. + * @return an unmodifiable instance of the default stop-words set. */ - private Set stopSet = new HashSet(); - + public static Set getDefaultStopSet(){ + return DEFAULT_STOP_WORDS; + } /** * Contains words that should be indexed but not stemmed. */ - private Set exclusionSet = new HashSet(); + // TODO Make this final once the setters are removed + private Set exclusionSet = Collections.emptySet(); - private final Version matchVersion; /** * Builds an analyzer with the default stop words: * {@link #GERMAN_STOP_WORDS}. */ public GermanAnalyzer(Version matchVersion) { - stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS); - setOverridesTokenStreamMethod(GermanAnalyzer.class); - this.matchVersion = matchVersion; + super(matchVersion, StopFilter.makeStopSet(GERMAN_STOP_WORDS)); } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(Version matchVersion, Set stopwords){ + super(matchVersion, stopwords); + } + + /** + * Builds an analyzer with the given stop words and stemming exclusion words + * @param matchVersion lucene compatibility version + * @param stopwords a stopword set + * @param stemExclutionSet a stemming exclusion set + */ + public GermanAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet){ + super(matchVersion, stopwords); + this.exclusionSet = stemExclusionSet; + } /** * Builds an analyzer with the given stop words. + * @deprecated Use {@link #GermanAnalyzer(Version, Set)} instead */ public GermanAnalyzer(Version matchVersion, String... stopwords) { - stopSet = StopFilter.makeStopSet(stopwords); - setOverridesTokenStreamMethod(GermanAnalyzer.class); - this.matchVersion = matchVersion; + super(matchVersion, StopFilter.makeStopSet(stopwords)); + } /** * Builds an analyzer with the given stop words. + * @deprecated Use {@link #GermanAnalyzer(Version, Set)} instead */ - public GermanAnalyzer(Version matchVersion, Map stopwords) { - stopSet = new HashSet(stopwords.keySet()); - setOverridesTokenStreamMethod(GermanAnalyzer.class); - this.matchVersion = matchVersion; + public GermanAnalyzer(Version matchVersion, Map stopwords) { + super(matchVersion, stopwords.keySet()); } /** * Builds an analyzer with the given stop words. + * @deprecated Use {@link #GermanAnalyzer(Version, Set)} instead */ public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException { - stopSet = WordlistLoader.getWordSet(stopwords); - setOverridesTokenStreamMethod(GermanAnalyzer.class); - this.matchVersion = matchVersion; + super(matchVersion, WordlistLoader.getWordSet(stopwords)); } /** * Builds an exclusionlist from an array of Strings. 
+ * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead */ public void setStemExclusionTable(String[] exclusionlist) { exclusionSet = StopFilter.makeStopSet(exclusionlist); @@ -127,74 +155,32 @@ } /** - * Builds an exclusionlist from a {@link Map} + * Builds an exclusion list from a {@link Map} + * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead */ - public void setStemExclusionTable(Map exclusionlist) { - exclusionSet = new HashSet(exclusionlist.keySet()); + public void setStemExclusionTable(Map exclusionlist) { + exclusionSet = new HashSet(exclusionlist.keySet()); setPreviousTokenStream(null); // force a new stemmer to be created } /** - * Builds an exclusionlist from the words contained in the given file. + * Builds an exclusion list from the words contained in the given file. + * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead */ public void setStemExclusionTable(File exclusionlist) throws IOException { exclusionSet = WordlistLoader.getWordSet(exclusionlist); setPreviousTokenStream(null); // force a new stemmer to be created } - - /** - * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. - * - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with - * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and - * {@link GermanStemFilter} - */ - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new StandardTokenizer(matchVersion, reader); - result = new StandardFilter(result); - result = new LowerCaseFilter(result); - result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - result, stopSet); - result = new GermanStemFilter(result, exclusionSet); - return result; - } - private class SavedStreams { - Tokenizer source; - TokenStream result; - }; - - /** - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text - * in the provided {@link Reader}. 
- * - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with - * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and - * {@link GermanStemFilter} - */ @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - if (overridesTokenStreamMethod) { - // LUCENE-1678: force fallback to tokenStream() if we - // have been subclassed and that subclass overrides - // tokenStream but not reusableTokenStream - return tokenStream(fieldName, reader); - } + protected TokenStreamComponents createComponents(String fieldName, + Reader aReader) { + Tokenizer source = new StandardTokenizer(matchVersion, aReader); + TokenFilter sink = new StandardFilter(source); + sink = new LowerCaseFilter(sink); + sink = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), + sink, stopwords); + return new TokenStreamComponents(source, new GermanStemFilter(sink, exclusionSet)); - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new StandardTokenizer(matchVersion, reader); - streams.result = new StandardFilter(streams.source); - streams.result = new LowerCaseFilter(streams.result); - streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - streams.result, stopSet); - streams.result = new GermanStemFilter(streams.result, exclusionSet); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - } - return streams.result; } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 834008) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (working copy) @@ -19,18 +19,17 @@ import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.Reader; -import java.util.Collections; -import java.util.HashSet; import java.util.Hashtable; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AbstractAnalyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.StopawareAnalyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; @@ -45,7 +44,7 @@ * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords. *
*/ -public final class PersianAnalyzer extends Analyzer { +public final class PersianAnalyzer extends StopawareAnalyzer { /** * File containing default Persian stopwords. @@ -58,11 +57,6 @@ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; /** - * Contains the stopwords used with the StopFilter. - */ - private final Set stoptable; - - /** * The comment character in the stopwords file. All lines prefixed with this * will be ignored */ @@ -72,7 +66,7 @@ * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. */ - public static Set getDefaultStopSet(){ + public static Set getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -81,11 +75,11 @@ * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { - DEFAULT_STOP_SET = loadDefaultStopWordSet(); + DEFAULT_STOP_SET = loadStopwordSet(true, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) @@ -93,69 +87,55 @@ } } - static Set loadDefaultStopWordSet() throws IOException { - InputStream stream = PersianAnalyzer.class - .getResourceAsStream(DEFAULT_STOPWORD_FILE); - try { - InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); - // make sure it is unmodifiable as we expose it in the outer class - return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, - STOPWORDS_COMMENT)); - } finally { - stream.close(); - } - } } - private final Version matchVersion; - /** * Builds an analyzer with the default stop words: * {@link #DEFAULT_STOPWORD_FILE}. */ public PersianAnalyzer(Version matchVersion) { - stoptable = DefaultSetHolder.DEFAULT_STOP_SET; - this.matchVersion = matchVersion; + super(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); } + + /** + * Builds an analyzer with the given stop words. + */ + public PersianAnalyzer(Version matchVersion, Set stopwords) { + super(matchVersion, stopwords); + } /** * Builds an analyzer with the given stop words. + * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead */ public PersianAnalyzer(Version matchVersion, String[] stopwords) { - stoptable = StopFilter.makeStopSet(stopwords); - this.matchVersion = matchVersion; + super(matchVersion, StopFilter.makeStopSet(stopwords)); } /** * Builds an analyzer with the given stop words. + * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead */ - public PersianAnalyzer(Version matchVersion, Hashtable stopwords) { - stoptable = new HashSet(stopwords.keySet()); - this.matchVersion = matchVersion; + public PersianAnalyzer(Version matchVersion, Hashtable stopwords) { + super(matchVersion, stopwords.keySet()); } /** * Builds an analyzer with the given stop words. Lines can be commented out * using {@link #STOPWORDS_COMMENT} + * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead */ public PersianAnalyzer(Version matchVersion, File stopwords) throws IOException { - stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT); - this.matchVersion = matchVersion; + super(matchVersion, WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT)); } - /** - * Creates a {@link TokenStream} which tokenizes all the text in the provided - * {@link Reader}. 
- * - * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer} - * filtered with {@link LowerCaseFilter}, - * {@link ArabicNormalizationFilter}, - * {@link PersianNormalizationFilter} and Persian Stop words - */ + @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new ArabicLetterTokenizer(reader); - result = new LowerCaseFilter(result); + protected TokenStreamComponents createComponents(String fieldName, + Reader aReader) { + Tokenizer tokenizer = new ArabicLetterTokenizer(aReader); + TokenFilter result = new LowerCaseFilter(tokenizer); + result = new ArabicNormalizationFilter(result); /* additional persian-specific normalization */ result = new PersianNormalizationFilter(result); @@ -163,46 +143,10 @@ * the order here is important: the stopword list is normalized with the * above! */ - result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - result, stoptable); - return result; - } - - private class SavedStreams { - Tokenizer source; - TokenStream result; - } + result = new StopFilter(StopFilter + .getEnablePositionIncrementsVersionDefault(matchVersion), result, + this.stopwords); + return new TokenStreamComponents(tokenizer, result); - /** - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text - * in the provided {@link Reader}. - * - * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer} - * filtered with {@link LowerCaseFilter}, - * {@link ArabicNormalizationFilter}, - * {@link PersianNormalizationFilter} and Persian Stop words - */ - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new ArabicLetterTokenizer(reader); - streams.result = new LowerCaseFilter(streams.source); - streams.result = new ArabicNormalizationFilter(streams.result); - /* additional persian-specific normalization */ - streams.result = new PersianNormalizationFilter(streams.result); - /* - * the order here is important: the stopword list is normalized with the - * above! - */ - streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), - streams.result, stoptable); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - } - return streams.result; } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (revision 832889) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (working copy) @@ -17,10 +17,10 @@ * limitations under the License. */ -import java.io.StringReader; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.util.Version; @@ -78,7 +78,9 @@ * Test that custom stopwords work, and are not case-sensitive. 
*/ public void testCustomStopwords() throws Exception { - ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" }); + Set set = new HashSet(); + Collections.addAll(set, "the", "and", "a"); + ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set); assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick", "brown", "fox" }); } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (revision 832889) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (working copy) @@ -17,10 +17,12 @@ * limitations under the License. */ +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; + import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.Version; /** @@ -132,9 +134,12 @@ } public void testStemExclusionTable() throws Exception { - BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT); - a.setStemExclusionTable(new String[] { "quintessência" }); - checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged. + BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT, Collections.emptySet(), + new HashSet(Arrays.asList(new String[] { "quintessência" }))); + checkOneTerm(a, "bobalhões", "bobalho"); + checkOneTerm(a, "quintessência", "quintessência"); // excluded words will be completely unchanged. + a.setStemExclusionTable(new String[] { "bobalhões" }); + checkOneTerm(a, "bobalhões", "bobalhões"); // excluded words will be completely unchanged. } /* @@ -142,10 +147,12 @@ * when using reusable token streams. */ public void testExclusionTableReuse() throws Exception { - BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT); - checkReuse(a, "quintessência", "quintessente"); - a.setStemExclusionTable(new String[] { "quintessência" }); + BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT, Collections.emptySet(), + new HashSet(Arrays.asList(new String[] { "quintessência" }))); + checkReuse(a, "bobalhões", "bobalho"); checkReuse(a, "quintessência", "quintessência"); + a.setStemExclusionTable(new String[] { "bobalhões" }); + checkOneTerm(a, "bobalhões", "bobalhões"); // excluded words will be completely unchanged. 
   }
 
   private void check(final String input, final String expected) throws Exception {
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java	(revision 832889)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java	(working copy)
@@ -25,9 +25,8 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -76,9 +75,13 @@
       super(matchVersion);
     }
 
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new WhitespaceTokenizer(reader);
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader aReader) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(aReader);
+      return new TokenStreamComponents(tokenizer, tokenizer);
     }
+
   }
 
   public void testLUCENE1678BWComp() throws Exception {
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java	(revision 832889)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java	(working copy)
@@ -17,11 +17,8 @@
  * limitations under the License.
  */
 
-import java.io.StringReader;
-
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
 
 /**
Index: src/java/org/apache/lucene/analysis/AbstractAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/AbstractAnalyzer.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/AbstractAnalyzer.java	(revision 0)
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A convenience subclass of Analyzer that unifies the creation of
+ * {@link TokenStream} instances. For historical reasons {@link Analyzer}
+ * provides different ways to create {@link TokenStream} instances in its
+ * subclasses. Derivative analyzers commonly implement
+ * {@link #reusableTokenStream(String, Reader)} as well as
+ * {@link #tokenStream(String, Reader)}, which immediately causes code
+ * duplication.
+ *
+ * When overriding {@link #reusableTokenStream(String, Reader)}, additional
+ * logic to reset an already instantiated {@link TokenStream} is required and
+ * often reinvented in {@link Analyzer} subclasses. This class unifies the
+ * creation of {@link TokenStream} instances and automatically reuses token
+ * stream instances if they can be reset.
+ *
+ * To prevent any possible issues with the new TokenStream API, this class does
+ * not allow subclasses to override {@link #reusableTokenStream(String, Reader)}
+ * and {@link #tokenStream(String, Reader)}. Instead it requires the abstract
+ * method {@link #createComponents(String, Reader)} to be implemented. The
+ * returned {@link TokenStreamComponents} wrapper encapsulates all information
+ * needed to generalize the abstract {@link Analyzer} methods.
+ *
+ */
+public abstract class AbstractAnalyzer extends Analyzer {
+
+  /**
+   * Creates a new {@link TokenStreamComponents} instance for this analyzer.
+   *
+   * @param fieldName
+   *          the name of the field's content passed to the
+   *          {@link TokenStreamComponents} sink as a reader
+   * @param aReader
+   *          the reader passed to the {@link Tokenizer} constructor
+   * @return the {@link TokenStreamComponents} for this analyzer.
+   */
+  protected abstract TokenStreamComponents createComponents(String fieldName,
+      Reader aReader);
+
+  /**
+   * This method uses {@link #createComponents(String, Reader)} to obtain an
+   * instance of {@link TokenStreamComponents}, stores the components
+   * internally, and returns their sink. Subsequent calls to this method will
+   * reuse the previously stored components if and only if the
+   * {@link TokenStreamComponents#reset(Reader)} method returned true.
+   * Otherwise a new instance of {@link TokenStreamComponents} is created.
+   */
+  @Override
+  public final TokenStream reusableTokenStream(final String fieldName,
+      final Reader reader) throws IOException {
+    TokenStreamComponents streamChain = (TokenStreamComponents) getPreviousTokenStream();
+    if (streamChain == null || !streamChain.reset(reader)) {
+      streamChain = createComponents(fieldName, reader);
+      setPreviousTokenStream(streamChain);
+    }
+    return streamChain.getTokenStream();
+  }
+
+  @Override
+  public final TokenStream tokenStream(final String fieldName,
+      final Reader reader) {
+    return createComponents(fieldName, reader).getTokenStream();
+  }
+
+  /**
+   * This class encapsulates the outer components of a token stream. It provides
+   * access to the source ({@link Tokenizer}) and the outer end (sink), an
+   * instance of {@link TokenFilter} which also serves as the
+   * {@link TokenStream} returned by
+   * {@link Analyzer#tokenStream(String, Reader)} and
+   * {@link Analyzer#reusableTokenStream(String, Reader)}.
+   */
+  public static class TokenStreamComponents {
+    final Tokenizer source;
+    final TokenStream sink;
+
+    /**
+     * Creates a new {@link TokenStreamComponents} instance.
+     *
+     * @param source
+     *          the analyzer's tokenizer
+     * @param result
+     *          the analyzer's resulting token stream
+     */
+    public TokenStreamComponents(final Tokenizer source,
+        final TokenStream result) {
+      this.source = source;
+      this.sink = result;
+    }
+
+    /**
+     * Creates a new {@link TokenStreamComponents} instance.
+     *
+     * @param source
+     *          the analyzer's tokenizer
+     */
+    public TokenStreamComponents(final Tokenizer source) {
+      this.source = source;
+      this.sink = source;
+    }
+
+    /**
+     * Resets the encapsulated components with the given reader. This method by
+     * default returns true indicating that the components have been reset
+     * successfully. Subclasses of {@link AbstractAnalyzer} might use their own
+     * {@link TokenStreamComponents} returning false if the components cannot
+     * be reset.
+     *
+     * @param reader
+     *          a reader to reset the source component
+     * @return true if the components were reset, otherwise false
+     * @throws IOException
+     *           if the component's reset method throws an {@link IOException}
+     */
+    protected boolean reset(final Reader reader) throws IOException {
+      source.reset(reader);
+      if (sink != source)
+        sink.reset(); // only reset if the sink reference is different from source
+      return true;
+    }
+
+    /**
+     * Returns the sink {@link TokenStream}.
+     *
+     * @return the sink {@link TokenStream}
+     */
+    protected TokenStream getTokenStream() {
+      return sink;
+    }
+
+  }
+
+}
Index: src/java/org/apache/lucene/analysis/CharArraySet.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharArraySet.java	(revision 834007)
+++ src/java/org/apache/lucene/analysis/CharArraySet.java	(working copy)
@@ -43,6 +43,9 @@
  */
 public class CharArraySet extends AbstractSet {
+
+  public static final CharArraySet EMPTY_SET = CharArraySet
+      .unmodifiableSet(new CharArraySet(Collections.emptySet(), false));
   private final static int INIT_SIZE = 8;
   private char[][] entries;
   private int count;
Index: src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/SimpleAnalyzer.java	(revision 834007)
+++ src/java/org/apache/lucene/analysis/SimpleAnalyzer.java	(working copy)
@@ -18,25 +18,15 @@
  */
 
 import java.io.Reader;
-import java.io.IOException;
 
 /** An {@link Analyzer} that filters {@link LetterTokenizer}
  *  with {@link LowerCaseFilter} */
-public final class SimpleAnalyzer extends Analyzer {
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new LowerCaseTokenizer(reader);
-  }
+public final class SimpleAnalyzer extends AbstractAnalyzer {
 
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
-    if (tokenizer == null) {
-      tokenizer = new LowerCaseTokenizer(reader);
-      setPreviousTokenStream(tokenizer);
-    } else
-      tokenizer.reset(reader);
-    return tokenizer;
+  protected TokenStreamComponents createComponents(final String fieldName,
+      final Reader reader) {
+    return new TokenStreamComponents(new LowerCaseTokenizer(reader));
   }
 }
Index: src/java/org/apache/lucene/analysis/StopAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/StopAnalyzer.java	(revision 834007)
+++ src/java/org/apache/lucene/analysis/StopAnalyzer.java	(working copy)
@@ -36,7 +36,7 @@
  */
 
-public final class StopAnalyzer extends Analyzer {
+public final class StopAnalyzer extends AbstractAnalyzer {
   private final Set stopWords;
   private final boolean enablePositionIncrements;
 
@@ -92,28 +92,12 @@
     this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
   }
 
-  /** Filters LowerCaseTokenizer with StopFilter. */
   @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
+  protected TokenStreamComponents createComponents(final String fieldName,
+      final Reader reader) {
+    final Tokenizer source = new LowerCaseTokenizer(reader);
+    return new TokenStreamComponents(source, new StopFilter(
+        enablePositionIncrements, source, stopWords));
   }
-
-  /** Filters LowerCaseTokenizer with StopFilter.
*/ - private class SavedStreams { - Tokenizer source; - TokenStream result; - }; - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new LowerCaseTokenizer(reader); - streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords); - setPreviousTokenStream(streams); - } else - streams.source.reset(reader); - return streams.result; - } } Index: src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java =================================================================== --- src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (revision 834007) +++ src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (working copy) @@ -18,24 +18,14 @@ */ import java.io.Reader; -import java.io.IOException; /** An Analyzer that uses {@link WhitespaceTokenizer}. */ -public final class WhitespaceAnalyzer extends Analyzer { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new WhitespaceTokenizer(reader); - } +public final class WhitespaceAnalyzer extends AbstractAnalyzer { @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); - if (tokenizer == null) { - tokenizer = new WhitespaceTokenizer(reader); - setPreviousTokenStream(tokenizer); - } else - tokenizer.reset(reader); - return tokenizer; + protected TokenStreamComponents createComponents(final String fieldName, + final Reader reader) { + return new TokenStreamComponents(new WhitespaceTokenizer(reader)); } } Index: src/java/org/apache/lucene/analysis/WordlistLoader.java =================================================================== --- src/java/org/apache/lucene/analysis/WordlistLoader.java (revision 834007) +++ src/java/org/apache/lucene/analysis/WordlistLoader.java (working copy) @@ -21,16 +21,70 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.Reader; import java.util.HashMap; import java.util.HashSet; +import java.util.Set; /** * Loader for text files that represent a list of stopwords. */ public class WordlistLoader { - + /** + * Loads a text file associated with a given class (See + * {@link Class#getResourceAsStream(String)}) and adds every line as an entry + * to a {@link Set} (omitting leading and trailing whitespace). Every line of + * the file should contain only one word. The words need to be in lower-case if + * you make use of an Analyzer which uses LowerCaseFilter (like + * StandardAnalyzer). + * + * @param aClass + * a class that is associated with the given stopwordResource + * @param stopwordResource + * name of the resource file associated with the given class + * @return a {@link Set} with the file's words + */ + public static Set getWordSet(Class aClass, String stopwordResource) + throws IOException { + final Reader reader = new BufferedReader(new InputStreamReader(aClass + .getResourceAsStream(stopwordResource), "UTF-8")); + try { + return getWordSet(reader); + } finally { + reader.close(); + } + } + + /** + * Loads a text file associated with a given class (See + * {@link Class#getResourceAsStream(String)}) and adds every line as an entry + * to a {@link Set} (omitting leading and trailing whitespace). Every line of + * the file should contain only one word. 
Index: src/java/org/apache/lucene/analysis/WordlistLoader.java
===================================================================
--- src/java/org/apache/lucene/analysis/WordlistLoader.java (revision 834007)
+++ src/java/org/apache/lucene/analysis/WordlistLoader.java (working copy)
@@ -21,16 +21,70 @@
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Set;
 
 /**
  * Loader for text files that represent a list of stopwords.
  */
 public class WordlistLoader {
-
+  /**
+   * Loads a text file associated with a given class (see
+   * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
+   * to a {@link Set} (omitting leading and trailing whitespace). Every line of
+   * the file should contain only one word. The words need to be in lowercase
+   * if you make use of an Analyzer which uses LowerCaseFilter (like
+   * StandardAnalyzer).
+   *
+   * @param aClass
+   *          a class that is associated with the given stopwordResource
+   * @param stopwordResource
+   *          name of the resource file associated with the given class
+   * @return a {@link Set} with the file's words
+   * @throws IOException
+   *           if the resource could not be read
+   */
+  public static Set getWordSet(Class aClass, String stopwordResource)
+      throws IOException {
+    final Reader reader = new BufferedReader(new InputStreamReader(aClass
+        .getResourceAsStream(stopwordResource), "UTF-8"));
+    try {
+      return getWordSet(reader);
+    } finally {
+      reader.close();
+    }
+  }
+
+  /**
+   * Loads a text file associated with a given class (see
+   * {@link Class#getResourceAsStream(String)}) and adds every non-comment line
+   * as an entry to a {@link Set} (omitting leading and trailing whitespace).
+   * Every line of the file should contain only one word. The words need to be
+   * in lowercase if you make use of an Analyzer which uses LowerCaseFilter
+   * (like StandardAnalyzer).
+   *
+   * @param aClass
+   *          a class that is associated with the given stopwordResource
+   * @param stopwordResource
+   *          name of the resource file associated with the given class
+   * @param comment
+   *          the comment string to ignore
+   * @return a {@link Set} with the file's words
+   * @throws IOException
+   *           if the resource could not be read
+   */
+  public static Set getWordSet(Class aClass,
+      String stopwordResource, String comment) throws IOException {
+    final Reader reader = new BufferedReader(new InputStreamReader(aClass
+        .getResourceAsStream(stopwordResource), "UTF-8"));
+    try {
+      return getWordSet(reader, comment);
+    } finally {
+      reader.close();
+    }
+  }
+
   /**
    * Loads a text file and adds every line as an entry to a HashSet (omitting
    * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you make use of an
@@ -40,17 +94,15 @@
    * @return A HashSet with the file's words
    */
   public static HashSet getWordSet(File wordfile) throws IOException {
-    HashSet result = new HashSet();
     FileReader reader = null;
     try {
       reader = new FileReader(wordfile);
-      result = getWordSet(reader);
+      return getWordSet(reader);
     }
     finally {
       if (reader != null)
         reader.close();
     }
-    return result;
   }
 
@@ -64,17 +116,15 @@
    * @return A HashSet with the file's words
    */
   public static HashSet getWordSet(File wordfile, String comment) throws IOException {
-    HashSet result = new HashSet();
     FileReader reader = null;
     try {
       reader = new FileReader(wordfile);
-      result = getWordSet(reader, comment);
+      return getWordSet(reader, comment);
     }
     finally {
       if (reader != null)
         reader.close();
     }
-    return result;
   }
 
@@ -88,7 +138,7 @@
    * @return A HashSet with the reader's words
    */
   public static HashSet getWordSet(Reader reader) throws IOException {
-    HashSet result = new HashSet();
+    final HashSet result = new HashSet();
     BufferedReader br = null;
     try {
       if (reader instanceof BufferedReader) {
@@ -119,7 +169,7 @@
    * @return A HashSet with the reader's words
    */
   public static HashSet getWordSet(Reader reader, String comment) throws IOException {
-    HashSet result = new HashSet();
+    final HashSet result = new HashSet();
     BufferedReader br = null;
     try {
       if (reader instanceof BufferedReader) {
@@ -154,21 +204,18 @@
   public static HashMap getStemDict(File wordstemfile) throws IOException {
     if (wordstemfile == null)
       throw new NullPointerException("wordstemfile may not be null");
-    HashMap result = new HashMap();
+    final HashMap result = new HashMap();
     BufferedReader br = null;
-    FileReader fr = null;
+
     try {
-      fr = new FileReader(wordstemfile);
-      br = new BufferedReader(fr);
+      br = new BufferedReader(new FileReader(wordstemfile));
       String line;
       while ((line = br.readLine()) != null) {
         String[] wordstem = line.split("\t", 2);
         result.put(wordstem[0], wordstem[1]);
       }
     } finally {
-      if (fr != null)
-        fr.close();
-      if (br != null)
+      if (br != null)
         br.close();
     }
     return result;
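[Not part of the patch: a typical call site for the new resource-based overloads, using only the getWordSet signatures added above. StopwordResourceExample and mystopwords.txt are hypothetical names.]

    import java.io.IOException;
    import java.util.Set;

    import org.apache.lucene.analysis.WordlistLoader;

    public class StopwordResourceExample {
      public static void main(String[] args) throws IOException {
        // loads mystopwords.txt from the same package as this class,
        // skipping lines that start with "#"
        Set stopwords = WordlistLoader.getWordSet(
            StopwordResourceExample.class, "mystopwords.txt", "#");
        System.out.println(stopwords.size() + " distinct stopwords loaded");
      }
    }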
Index: src/test/org/apache/lucene/index/TestWordlistLoader.java
===================================================================
--- src/test/org/apache/lucene/index/TestWordlistLoader.java (revision 834007)
+++ src/test/org/apache/lucene/index/TestWordlistLoader.java (working copy)
@@ -18,34 +18,56 @@
  */
 
 import java.io.BufferedReader;
+import java.io.File;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.analysis.WordlistLoader;
 
 public class TestWordlistLoader extends LuceneTestCase {
+
+  public void testWordlistLoadingByResource() throws IOException {
+    Set wordSet1 = WordlistLoader.getWordSet(
+        TestWordlistLoader.class, "wordliststopwords.txt", "#");
+    checkSet(wordSet1);
+    Set wordSet2 = WordlistLoader.getWordSet(
+        TestWordlistLoader.class, "wordliststopwords_nocomment.txt");
+    checkSet(wordSet2);
+  }
+
+  public void testWordlistFileLoading() throws IOException {
+    Set wordSet1 = WordlistLoader.getWordSet(new File(
+        this.getClass().getResource("wordliststopwords_nocomment.txt")
+            .getFile()));
+    checkSet(wordSet1);
+  }
 
   public void testWordlistLoading() throws IOException {
     String s = "ONE\n two \nthree";
-    HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
+    HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
     checkSet(wordSet1);
-    HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
+    HashSet wordSet2 = WordlistLoader.getWordSet(
+        new BufferedReader(new StringReader(s)));
     checkSet(wordSet2);
   }
 
   public void testComments() throws Exception {
     String s = "ONE\n two \nthree\n#comment";
-    HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
+    HashSet wordSet1 = WordlistLoader.getWordSet(
+        new StringReader(s), "#");
     checkSet(wordSet1);
     assertFalse(wordSet1.contains("#comment"));
     assertFalse(wordSet1.contains("comment"));
   }
 
-  private void checkSet(HashSet wordset) {
+  private void checkSet(Set wordset) {
     assertEquals(3, wordset.size());
     assertTrue(wordset.contains("ONE")); // case is not modified
     assertTrue(wordset.contains("two")); // surrounding whitespace is removed
Index: src/test/org/apache/lucene/index/wordliststopwords.txt
===================================================================
--- src/test/org/apache/lucene/index/wordliststopwords.txt (revision 0)
+++ src/test/org/apache/lucene/index/wordliststopwords.txt (revision 0)
@@ -0,0 +1,5 @@
+#comment
+ONE
+two
+#comment
+three
Index: src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt
===================================================================
--- src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt (revision 0)
+++ src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt (revision 0)
@@ -0,0 +1,3 @@
+ONE
+two
+three
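[Not part of the patch: an end-to-end sketch of the reuse behavior the refactoring enables. StopAnalyzer(Version, Set), reusableTokenStream, and the TermAttribute consumer idiom are existing 2.9/3.0-era APIs; the demo class itself is hypothetical, and whether the second call actually reuses the cached components depends on the AbstractAnalyzer implementation not shown in this excerpt.]

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class StopAnalyzerReuseDemo {
      public static void main(String[] args) throws IOException {
        Set stopWords = new HashSet();
        stopWords.add("the");
        StopAnalyzer analyzer =
            new StopAnalyzer(Version.LUCENE_CURRENT, stopWords);
        // the second call on the same thread should hit the cached components
        for (String text : new String[] { "the quick fox", "the lazy dog" }) {
          TokenStream ts = analyzer.reusableTokenStream("field",
              new StringReader(text));
          TermAttribute term =
              (TermAttribute) ts.addAttribute(TermAttribute.class);
          while (ts.incrementToken()) {
            System.out.println(term.term()); // stopword "the" is filtered out
          }
        }
      }
    }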