Index: modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java	(revision 1127326)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java	(revision )
@@ -41,132 +41,131 @@
  * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
  * this term to take 2 seconds.
  *
- *
- * Use the various "addStopWords" methods in this class to automate the identification and addition of
- * stop words found in an already existing index.
- *
  */
 public final class QueryAutoStopWordAnalyzer extends Analyzer {
-  Analyzer delegate;
-  HashMap<String, HashSet<String>> stopWordsPerField = new HashMap<String, HashSet<String>>();
+
+  private final Analyzer delegate;
+  private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
   //The default maximum percentage (40%) of index documents which
   //can contain a term, after which the term is considered to be a stop word.
   public static final float defaultMaxDocFreqPercent = 0.4f;
   private final Version matchVersion;
 
   /**
-   * Initializes this analyzer with the Analyzer object that actually produces the tokens
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency percentage greater than
+   * {@link #defaultMaxDocFreqPercent}
    *
-   * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @throws IOException Can be thrown while reading from the IndexReader
    */
-  public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
-    this.delegate = delegate;
-    this.matchVersion = matchVersion;
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader) throws IOException {
+    this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
   }
 
   /**
-   * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency greater than the given
+   * maxDocFreq
    *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @return The number of stop words identified.
-   * @throws IOException
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
+   * @throws IOException Can be thrown while reading from the IndexReader
    */
-  public int addStopWords(IndexReader reader) throws IOException {
-    return addStopWords(reader, defaultMaxDocFreqPercent);
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      int maxDocFreq) throws IOException {
+    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
   }
 
   /**
-   * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency percentage greater than
+   * the given maxPercentDocs
    *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @param maxDocFreq The maximum number of index documents which can contain a term, after which
-   *                   the term is considered to be a stop word
-   * @return The number of stop words identified.
-   * @throws IOException
-   */
-  public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
-    int numStopWords = 0;
-    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
-    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      numStopWords += addStopWords(reader, fieldName, maxDocFreq);
-    }
-    return numStopWords;
-  }
-
-  /**
-   * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
-   *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
-   *                       contain a term, after which the word is considered to be a stop word.
-   * @return The number of stop words identified.
-   * @throws IOException
+   *                       contain a term, after which the word is considered to be a stop word
+   * @throws IOException Can be thrown while reading from the IndexReader
    */
-  public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
-    int numStopWords = 0;
-    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
-    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      float maxPercentDocs) throws IOException {
+    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
-    }
+  }
-    return numStopWords;
-  }
 
   /**
-   * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+   * given selection of fields from terms with a document frequency percentage
+   * greater than the given maxPercentDocs
    *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @param fieldName The field for which stopwords will be added
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param fields Selection of fields to calculate stopwords for
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
-   *                       contain a term, after which the word is considered to be a stop word.
-   * @return The number of stop words identified.
-   * @throws IOException
+   *                       contain a term, after which the word is considered to be a stop word
+   * @throws IOException Can be thrown while reading from the IndexReader
    */
-  public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
-    return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      Collection<String> fields,
+      float maxPercentDocs) throws IOException {
+    this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
   }
 
   /**
-   * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+   * given selection of fields from terms with a document frequency greater than
+   * the given maxDocFreq
    *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @param fieldName The field for which stopwords will be added
-   * @param maxDocFreq The maximum number of index documents which
-   *                   can contain a term, after which the term is considered to be a stop word.
-   * @return The number of stop words identified.
-   * @throws IOException
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param fields Selection of fields to calculate stopwords for
+   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
+   * @throws IOException Can be thrown while reading from the IndexReader
   */
-  public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
-    HashSet<String> stopWords = new HashSet<String>();
-    final Terms terms = MultiFields.getTerms(reader, fieldName);
-    final CharsRef spare = new CharsRef();
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      Collection<String> fields,
+      int maxDocFreq) throws IOException {
+    this.matchVersion = matchVersion;
+    this.delegate = delegate;
+
+    for (String field : fields) {
+      Set<String> stopWords = new HashSet<String>();
+      Terms terms = MultiFields.getTerms(indexReader, field);
+      CharsRef spare = new CharsRef();
-    if (terms != null) {
+      if (terms != null) {
-      final TermsEnum te = terms.iterator();
+        TermsEnum te = terms.iterator();
-      BytesRef text;
-      while ((text = te.next()) != null) {
-        if (te.docFreq() > maxDocFreq) {
-          stopWords.add(text.utf8ToChars(spare).toString());
-        }
-      }
-    }
+        BytesRef text;
+        while ((text = te.next()) != null) {
+          if (te.docFreq() > maxDocFreq) {
+            stopWords.add(text.utf8ToChars(spare).toString());
+          }
+        }
+      }
-    stopWordsPerField.put(fieldName, stopWords);
-
-    /* if the stopwords for a field are changed,
-     * then saved streams for that field are erased.
-     */
-    @SuppressWarnings("unchecked")
-    Map<String, SavedStreams> streamMap = (Map<String, SavedStreams>) getPreviousTokenStream();
-    if (streamMap != null)
-      streamMap.remove(fieldName);
-
-    return stopWords.size();
+      stopWordsPerField.put(field, stopWords);
-  }
+    }
+  }
 
   @Override
   public TokenStream tokenStream(String fieldName, Reader reader) {
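
[Reviewer note on the hunk above: the patch replaces the mutable addStopWords(...) methods with five chained constructors, so the analyzer is fully populated, and effectively immutable, once constructed; the last constructor is the one that actually walks each field's TermsEnum and records terms whose docFreq exceeds the cutoff. A minimal usage sketch against the post-patch constructors; matchVersion, appAnalyzer, reader, and the field name are hypothetical stand-ins, not part of the patch:

    // All indexed fields, default 40% document-frequency cutoff:
    Analyzer a1 = new QueryAutoStopWordAnalyzer(matchVersion, appAnalyzer, reader);

    // All indexed fields; terms in more than half of all docs become stopwords:
    Analyzer a2 = new QueryAutoStopWordAnalyzer(matchVersion, appAnalyzer, reader, 0.5f);

    // Only selected fields, with an absolute document-frequency cutoff of 100:
    Analyzer a3 = new QueryAutoStopWordAnalyzer(
        matchVersion, appAnalyzer, reader, Arrays.asList("description"), 100);
]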
@@ -176,7 +175,7 @@
     } catch (IOException e) {
       result = delegate.tokenStream(fieldName, reader);
     }
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
     if (stopWords != null) {
       result = new StopFilter(matchVersion, result, stopWords);
     }
@@ -193,12 +192,11 @@
      */
     TokenStream withStopFilter;
   }
-  
+
+  @SuppressWarnings("unchecked")
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
     /* map of SavedStreams for each field */
-    @SuppressWarnings("unchecked")
     Map<String, SavedStreams> streamMap = (Map<String, SavedStreams>) getPreviousTokenStream();
     if (streamMap == null) {
       streamMap = new HashMap<String, SavedStreams>();
@@ -213,33 +211,34 @@
       streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
 
       /* if there are any stopwords for the field, save the stopfilter */
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      if (stopWords != null)
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+      if (stopWords != null) {
         streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
-      else
+      } else {
         streams.withStopFilter = streams.wrapped;
-      
+      }
     } else {
       /*
-      * an entry for this field exists, verify the wrapped stream has not
-      * changed. if it has not, reuse it, otherwise wrap the new stream.
-      */
+       * an entry for this field exists, verify the wrapped stream has not
+       * changed. if it has not, reuse it, otherwise wrap the new stream.
+       */
       TokenStream result = delegate.reusableTokenStream(fieldName, reader);
       if (result == streams.wrapped) {
         /* the wrapped analyzer reused the stream */
       } else {
         /*
-        * the wrapped analyzer did not. if there are any stopwords for the
-        * field, create a new StopFilter around the new stream
-        */
+         * the wrapped analyzer did not. if there are any stopwords for the
+         * field, create a new StopFilter around the new stream
+         */
         streams.wrapped = result;
-        HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-        if (stopWords != null)
+        Set<String> stopWords = stopWordsPerField.get(fieldName);
+        if (stopWords != null) {
          streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
-        else
+        } else {
          streams.withStopFilter = streams.wrapped;
-      }
-    }
+        }
+      }
+    }
 
     return streams.withStopFilter;
   }
@@ -252,15 +251,9 @@
    * @return the stop words identified for a field
    */
   public String[] getStopWords(String fieldName) {
-    String[] result;
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-    if (stopWords != null) {
-      result = stopWords.toArray(new String[stopWords.size()]);
-    } else {
-      result = new String[0];
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
+    return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
-    }
+  }
-    return result;
-  }
 
   /**
    * Provides information on which stop words have been identified for all fields
@@ -268,12 +261,10 @@
    * @return the stop words (as terms)
    */
   public Term[] getStopWords() {
-    ArrayList<Term> allStopWords = new ArrayList<Term>();
-    for (Iterator<String> iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      for (Iterator<String> iterator = stopWords.iterator(); iterator.hasNext();) {
-        String text = iterator.next();
+    List<Term> allStopWords = new ArrayList<Term>();
+    for (String fieldName : stopWordsPerField.keySet()) {
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+      for (String text : stopWords) {
         allStopWords.add(new Term(fieldName, text));
       }
     }
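
[Reviewer note on the hunks above: because the stopword sets are now fixed at construction time, the old invalidation step in addStopWords (removing a field's SavedStreams entry from getPreviousTokenStream()) disappears, and reusableTokenStream can cache each field's StopFilter for the analyzer's whole lifetime. A rough sketch of the caller-visible behaviour, reusing the hypothetical matchVersion/appAnalyzer/reader from the previous note:

    QueryAutoStopWordAnalyzer qa =
        new QueryAutoStopWordAnalyzer(matchVersion, appAnalyzer, reader, 0.5f);

    // Per-field stopwords are queryable immediately after construction...
    String[] frequentTerms = qa.getStopWords("repetitiveField");

    // ...and repeated calls for the same field reuse the cached StopFilter
    // as long as the delegate analyzer reuses its own stream.
    TokenStream ts = qa.reusableTokenStream("repetitiveField", new StringReader("boring"));
]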
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java	(revision 1169607)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java	(revision )
@@ -16,23 +16,19 @@
  * limitations under the License.
  */
 
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.store.RAMDirectory;
 
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+
 public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
   String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
   String repetitiveFieldValues[] = {"boring", "boring", "vaguelyboring"};
@@ -58,7 +54,6 @@
     }
     writer.close();
     reader = IndexReader.open(dir, true);
-    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer);
   }
 
   @Override
@@ -67,9 +62,9 @@
     super.tearDown();
   }
 
-  public void testUninitializedAnalyzer() throws Exception {
-    // Note: no calls to "addStopWord"
-    // query = "variedField:quick repetitiveField:boring";
+  public void testNoStopwords() throws Exception {
+    // Note: an empty list of fields passed in
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Collections.EMPTY_LIST, 1);
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("variedField", new StringReader("quick"));
     assertTokenStreamContents(protectedTokenStream, new String[]{"quick"});
 
@@ -77,21 +72,14 @@
     assertTokenStreamContents(protectedTokenStream, new String[]{"boring"});
   }
 
-  /*
-   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader)'
-   */
-  public void testDefaultAddStopWordsIndexReader() throws Exception {
-    protectedAnalyzer.addStopWords(reader);
+  public void testDefaultStopwordsAllFields() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader);
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
-    assertTokenStreamContents(protectedTokenStream, new String[0]);
+    // Default stop word filtering will remove boring
+    assertTokenStreamContents(protectedTokenStream, new String[0]);
   }
 
-  /*
-   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, int)'
-   */
-  public void testAddStopWordsIndexReaderInt() throws Exception {
-    protectedAnalyzer.addStopWords(reader, 1f / 2f);
+  public void testStopwordsAllFieldsMaxPercentDocs() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 2f);
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
     // A filter on terms in > one half of docs remove boring
@@ -101,39 +89,36 @@
     // A filter on terms in > half of docs should not remove vaguelyBoring
     assertTokenStreamContents(protectedTokenStream, new String[]{"vaguelyboring"});
 
-    protectedAnalyzer.addStopWords(reader, 1f / 4f);
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 4f);
     protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("vaguelyboring"));
     // A filter on terms in > quarter of docs should remove vaguelyBoring
     assertTokenStreamContents(protectedTokenStream, new String[0]);
   }
 
-  public void testAddStopWordsIndexReaderStringFloat() throws Exception {
-    protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f);
+  public void testStopwordsPerFieldMaxPercentDocs() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField"), 1f / 2f);
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
     // A filter on one Field should not affect queries on another
     assertTokenStreamContents(protectedTokenStream, new String[]{"boring"});
 
-    protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f);
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField", "repetitiveField"), 1f / 2f);
     protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
     // A filter on the right Field should affect queries on it
     assertTokenStreamContents(protectedTokenStream, new String[0]);
   }
 
-  public void testAddStopWordsIndexReaderStringInt() throws Exception {
-    int numStopWords = protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+  public void testStopwordsPerFieldMaxDocFreq() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
+    int numStopWords = protectedAnalyzer.getStopWords("repetitiveField").length;
     assertTrue("Should have identified stop words", numStopWords > 0);
-    Term[] t = protectedAnalyzer.getStopWords();
-    assertEquals("num terms should = num stopwords returned", t.length, numStopWords);
-
-    int numNewStopWords = protectedAnalyzer.addStopWords(reader, "variedField", 10);
-    assertTrue("Should have identified more stop words", numNewStopWords > 0);
-    t = protectedAnalyzer.getStopWords();
-    assertEquals("num terms should = num stopwords returned", t.length, numStopWords + numNewStopWords);
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField", "variedField"), 10);
+    int numNewStopWords = protectedAnalyzer.getStopWords("repetitiveField").length + protectedAnalyzer.getStopWords("variedField").length;
+    assertTrue("Should have identified more stop words", numNewStopWords > numStopWords);
   }
 
   public void testNoFieldNamePollution() throws Exception {
-    protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
     // Check filter set up OK
@@ -145,8 +130,9 @@
   }
 
   public void testTokenStream() throws Exception {
-    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
-    a.addStopWords(reader, 10);
+    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(
+        TEST_VERSION_CURRENT,
+        new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), reader, 10);
     TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
     assertTokenStreamContents(ts, new String[] { "this" });
   }
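
[Reviewer note: the tests above exercise the analyzer directly, but the class exists to be plugged into query parsing so that very-high-frequency terms never become slow TermQueries (the 2-second case from the class javadoc). A hedged wiring sketch; the QueryParser package and constructor follow the 3.x-era signature and may differ on other branches, and matchVersion/appAnalyzer/reader are the same hypothetical stand-ins as in the earlier notes:

    import org.apache.lucene.queryParser.QueryParser;

    QueryAutoStopWordAnalyzer qa =
        new QueryAutoStopWordAnalyzer(matchVersion, appAnalyzer, reader);
    QueryParser parser = new QueryParser(matchVersion, "repetitiveField", qa);
    // Terms above the document-frequency cutoff are dropped during analysis,
    // so they never reach TermQuery.
    Query q = parser.parse("boring fox");
]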