Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
===================================================================
--- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java	(revision 1124242)
+++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java	(revision )
@@ -19,6 +19,8 @@
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -62,7 +64,6 @@
     }
     writer.close();
     reader = IndexReader.open(dir, true);
-    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer);
   }
 
   @Override
@@ -81,72 +82,61 @@
     return hits;
   }
 
-  public void testUninitializedAnalyzer() throws Exception {
-    //Note: no calls to "addStopWord"
+  public void testNoStopwords() throws Exception {
+    // Note: an empty list of fields passed in
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Collections.EMPTY_LIST, 1);
     String query = "variedField:quick repetitiveField:boring";
     int numHits1 = search(protectedAnalyzer, query);
     int numHits2 = search(appAnalyzer, query);
     assertEquals("No filtering test", numHits1, numHits2);
   }
 
-  /*
-   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader)'
-   */
-  public void testDefaultAddStopWordsIndexReader() throws Exception {
-    protectedAnalyzer.addStopWords(reader);
+  public void testDefaultStopwordsAllFields() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader);
     int numHits = search(protectedAnalyzer, "repetitiveField:boring");
     assertEquals("Default filter should remove all docs", 0, numHits);
   }
 
-  /*
-   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, int)'
-   */
-  public void testAddStopWordsIndexReaderInt() throws Exception {
-    protectedAnalyzer.addStopWords(reader, 1f / 2f);
+  public void testStopwordsAllFieldsMaxPercentDocs() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 2f);
     int numHits = search(protectedAnalyzer, "repetitiveField:boring");
     assertEquals("A filter on terms in > one half of docs remove boring docs", 0, numHits);
 
     numHits = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
     assertTrue("A filter on terms in > half of docs should not remove vaguelyBoring docs", numHits > 1);
 
-    protectedAnalyzer.addStopWords(reader, 1f / 4f);
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 4f);
     numHits = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
     assertEquals("A filter on terms in > quarter of docs should remove vaguelyBoring docs", 0, numHits);
   }
 
-  public void testAddStopWordsIndexReaderStringFloat() throws Exception {
-    protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f);
+  public void testStopwordsPerFieldMaxPercentDocs() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField"), 1f / 2f);
     int numHits = search(protectedAnalyzer, "repetitiveField:boring");
     assertTrue("A filter on one Field should not affect queris on another", numHits > 0);
 
protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f); + protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField", "repetitiveField"), 1f / 2f); numHits = search(protectedAnalyzer, "repetitiveField:boring"); assertEquals("A filter on the right Field should affect queries on it", numHits, 0); } - public void testAddStopWordsIndexReaderStringInt() throws Exception { - int numStopWords = protectedAnalyzer.addStopWords(reader, "repetitiveField", 10); + public void testStopwordsPerFieldMaxDocFreq() throws Exception { + protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10); + int numStopWords = protectedAnalyzer.getStopWords("repetitiveField").length; assertTrue("Should have identified stop words", numStopWords > 0); - Term[] t = protectedAnalyzer.getStopWords(); - assertEquals("num terms should = num stopwords returned", t.length, numStopWords); - - int numNewStopWords = protectedAnalyzer.addStopWords(reader, "variedField", 10); - assertTrue("Should have identified more stop words", numNewStopWords > 0); - t = protectedAnalyzer.getStopWords(); - assertEquals("num terms should = num stopwords returned", t.length, numStopWords + numNewStopWords); + protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField", "variedField"), 10); + int numNewStopWords = protectedAnalyzer.getStopWords("repetitiveField").length + protectedAnalyzer.getStopWords("variedField").length; + assertTrue("Should have identified more stop words", numNewStopWords > numStopWords); } public void testNoFieldNamePollution() throws Exception { - protectedAnalyzer.addStopWords(reader, "repetitiveField", 10); + protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10); int numHits = search(protectedAnalyzer, "repetitiveField:boring"); assertEquals("Check filter set up OK", 0, numHits); numHits = search(protectedAnalyzer, "variedField:boring"); assertTrue("Filter should not prevent stopwords in one field being used in another ", numHits > 0); - } /* @@ -165,8 +155,7 @@ } public void testWrappingNonReusableAnalyzer() throws Exception { - QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new NonreusableAnalyzer()); - a.addStopWords(reader, 10); + QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new NonreusableAnalyzer(), reader, 10); int numHits = search(a, "repetitiveField:boring"); assertTrue(numHits == 0); numHits = search(a, "repetitiveField:vaguelyboring"); @@ -174,8 +163,9 @@ } public void testTokenStream() throws Exception { - QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - a.addStopWords(reader, 10); + QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer( + TEST_VERSION_CURRENT, + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), reader, 10); TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring")); assertTokenStreamContents(ts, new String[] { "this" }); } Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java 
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java	(revision 1124242)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java	(revision )
@@ -44,8 +44,9 @@
  *
  */
 public final class QueryAutoStopWordAnalyzer extends Analyzer {
-  Analyzer delegate;
-  HashMap<String, HashSet<String>> stopWordsPerField = new HashMap<String, HashSet<String>>();
+
+  private final Analyzer delegate;
+  private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
   //The default maximum percentage (40%) of index documents which
   //can contain a term, after which the term is considered to be a stop word.
   public static final float defaultMaxDocFreqPercent = 0.4f;
@@ -55,20 +56,145 @@
    * Initializes this analyzer with the Analyzer object that actually produces the tokens
    *
    * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
+   * @deprecated Stopwords should be calculated at instantiation using one of the other constructors
    */
+  @Deprecated
   public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
     this.delegate = delegate;
     this.matchVersion = matchVersion;
   }
 
-  /**
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency percentage greater than
+   * {@link #defaultMaxDocFreqPercent}
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader) throws IOException {
+    this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency greater than the given
+   * maxDocFreq
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      int maxDocFreq) throws IOException {
+    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency percentage greater than
+   * the given maxPercentDocs
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+   *                       contain a term, after which the word is considered to be a stop word
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      float maxPercentDocs) throws IOException {
+    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+   * given selection of fields from terms with a document frequency percentage
+   * greater than the given maxPercentDocs
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param fields Selection of fields to calculate stopwords for
+   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+   *                       contain a term, after which the word is considered to be a stop word
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      Collection<String> fields,
+      float maxPercentDocs) throws IOException {
+    this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+   * given selection of fields from terms with a document frequency greater than
+   * the given maxDocFreq
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param fields Selection of fields to calculate stopwords for
+   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      Collection<String> fields,
+      int maxDocFreq) throws IOException {
+    this.matchVersion = matchVersion;
+    this.delegate = delegate;
+
+    for (String field : fields) {
+      Set<String> stopWords = new HashSet<String>();
+      String internedFieldName = StringHelper.intern(field);
+      TermEnum te = indexReader.terms(new Term(field));
+      Term term = te.term();
+      while (term != null) {
+        if (term.field() != internedFieldName) {
+          break;
+        }
+        if (te.docFreq() > maxDocFreq) {
+          stopWords.add(term.text());
+        }
+        if (!te.next()) {
+          break;
+        }
+        term = te.term();
+      }
+      stopWordsPerField.put(field, stopWords);
+    }
+  }
+
+  /**
    * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
    *
    * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
    *               exceed the required document frequency
    * @return The number of stop words identified.
    * @throws IOException
+   * @deprecated Stopwords should be calculated at instantiation using
+   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader)}
    */
+  @Deprecated
   public int addStopWords(IndexReader reader) throws IOException {
     return addStopWords(reader, defaultMaxDocFreqPercent);
   }
@@ -82,7 +208,10 @@
    *                   the term is considered to be a stop word
    * @return The number of stop words identified.
    * @throws IOException
+   * @deprecated Stopwords should be calculated at instantiation using
+   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, int)}
    */
+  @Deprecated
   public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
     int numStopWords = 0;
     Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
@@ -102,7 +231,10 @@
    *                       contain a term, after which the word is considered to be a stop word.
    * @return The number of stop words identified.
    * @throws IOException
+   * @deprecated Stopwords should be calculated at instantiation using
+   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, float)}
    */
+  @Deprecated
   public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
     int numStopWords = 0;
     Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
@@ -123,7 +255,10 @@
    *                       contain a term, after which the word is considered to be a stop word.
    * @return The number of stop words identified.
    * @throws IOException
+   * @deprecated Stopwords should be calculated at instantiation using
+   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, float)}
    */
+  @Deprecated
   public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
     return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
   }
@@ -138,7 +273,10 @@
    *                   can contain a term, after which the term is considered to be a stop word.
    * @return The number of stop words identified.
    * @throws IOException
+   * @deprecated Stopwords should be calculated at instantiation using
+   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, int)}
    */
+  @Deprecated
   public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
     HashSet<String> stopWords = new HashSet<String>();
     String internedFieldName = StringHelper.intern(fieldName);
@@ -177,7 +315,7 @@
     } catch (IOException e) {
       result = delegate.tokenStream(fieldName, reader);
     }
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
     if (stopWords != null) {
       result = new StopFilter(matchVersion, result, stopWords);
     }
@@ -194,12 +332,12 @@
      */
     TokenStream withStopFilter;
   }
-
+
+  @SuppressWarnings("unchecked")
   @Override
   public TokenStream reusableTokenStream(String fieldName, Reader reader)
       throws IOException {
     /* map of SavedStreams for each field */
-    @SuppressWarnings("unchecked")
     Map<String, SavedStreams> streamMap = (Map<String, SavedStreams>) getPreviousTokenStream();
     if (streamMap == null) {
       streamMap = new HashMap<String, SavedStreams>();
@@ -214,11 +352,12 @@
       streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
 
       /* if there are any stopwords for the field, save the stopfilter */
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      if (stopWords != null)
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+      if (stopWords != null) {
         streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
-      else
+      } else {
         streams.withStopFilter = streams.wrapped;
+      }
 
     } else {
       /*
@@ -234,13 +373,14 @@
        * field, create a new StopFilter around the new stream
        */
       streams.wrapped = result;
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      if (stopWords != null)
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+      if (stopWords != null) {
         streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
-      else
+      } else {
         streams.withStopFilter = streams.wrapped;
-      }
-    }
+        }
+      }
+    }
 
     return streams.withStopFilter;
   }
@@ -253,15 +393,9 @@
    * @return the stop words identified for a field
    */
   public String[] getStopWords(String fieldName) {
-    String[] result;
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-    if (stopWords != null) {
-      result = stopWords.toArray(new String[stopWords.size()]);
-    } else {
-      result = new String[0];
-    }
-    return result;
-  }
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
+    return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
+  }
 
   /**
    * Provides information on which stop words have been identified for all fields
    *
    * @return the stop words (as terms)
    */
   public Term[] getStopWords() {
-    ArrayList<Term> allStopWords = new ArrayList<Term>();
-    for (Iterator<String> iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      for (Iterator<String> iterator = stopWords.iterator(); iterator.hasNext();) {
-        String text = iterator.next();
+    List<Term> allStopWords = new ArrayList<Term>();
+    for (String fieldName : stopWordsPerField.keySet()) {
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+      for (String text : stopWords) {
         allStopWords.add(new Term(fieldName, text));
       }
     }
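
For reference, a minimal usage sketch of the constructor-based API this patch introduces; it is not part of the patch itself. The Directory variable, the "description" field name, the document-frequency threshold of 10, and the Version.LUCENE_31 constant are illustrative assumptions -- any released Version constant and any indexed field work the same way. Because the stopwords are now computed eagerly in the constructor, the IndexReader can be closed as soon as the analyzer has been built:

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

public class QueryAutoStopWordUsageSketch {

  /**
   * Builds a query-time analyzer whose stopwords are the terms of the
   * (hypothetical) "description" field that occur in more than 10 documents
   * of the given index.
   */
  static Analyzer buildQueryAnalyzer(Directory dir) throws IOException {
    IndexReader reader = IndexReader.open(dir, true); // read-only reader
    try {
      // Replaces the deprecated two-step pattern:
      //   QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(version, delegate);
      //   a.addStopWords(reader, 10);
      return new QueryAutoStopWordAnalyzer(
          Version.LUCENE_31,                       // illustrative; use your match version
          new StandardAnalyzer(Version.LUCENE_31), // delegate whose TokenStream gets filtered
          reader,
          Arrays.asList("description"),            // fields to derive stopwords for
          10);                                     // maxDocFreq threshold
    } finally {
      // Stopword sets were fully populated in the constructor,
      // so the analyzer no longer needs the reader.
      reader.close();
    }
  }
}

The design point the rewritten tests rely on: each constructor variant only fixes how the per-field stopword sets are derived (all indexed fields versus an explicit field list, an absolute document frequency versus a percentage of numDocs); the filtering behaviour at tokenStream()/reusableTokenStream() time is unchanged from the deprecated addStopWords path.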