Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
===================================================================
--- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (revision 1124242)
+++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (revision )
@@ -19,6 +19,8 @@
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -62,7 +64,6 @@
}
writer.close();
reader = IndexReader.open(dir, true);
- protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer);
}
@Override
@@ -81,72 +82,61 @@
return hits;
}
- public void testUninitializedAnalyzer() throws Exception {
- //Note: no calls to "addStopWord"
+ public void testNoStopwords() throws Exception {
+ // Note: an empty list of fields is passed in
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Collections.<String>emptyList(), 1);
String query = "variedField:quick repetitiveField:boring";
int numHits1 = search(protectedAnalyzer, query);
int numHits2 = search(appAnalyzer, query);
assertEquals("No filtering test", numHits1, numHits2);
}
- /*
- * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader)'
- */
- public void testDefaultAddStopWordsIndexReader() throws Exception {
- protectedAnalyzer.addStopWords(reader);
+ public void testDefaultStopwordsAllFields() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader);
int numHits = search(protectedAnalyzer, "repetitiveField:boring");
assertEquals("Default filter should remove all docs", 0, numHits);
}
-
- /*
- * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, int)'
- */
- public void testAddStopWordsIndexReaderInt() throws Exception {
- protectedAnalyzer.addStopWords(reader, 1f / 2f);
+ public void testStopwordsAllFieldsMaxPercentDocs() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 2f);
int numHits = search(protectedAnalyzer, "repetitiveField:boring");
assertEquals("A filter on terms in > one half of docs remove boring docs", 0, numHits);
numHits = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
assertTrue("A filter on terms in > half of docs should not remove vaguelyBoring docs", numHits > 1);
- protectedAnalyzer.addStopWords(reader, 1f / 4f);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 4f);
numHits = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
assertEquals("A filter on terms in > quarter of docs should remove vaguelyBoring docs", 0, numHits);
}
-
- public void testAddStopWordsIndexReaderStringFloat() throws Exception {
- protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f);
+ public void testStopwordsPerFieldMaxPercentDocs() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField"), 1f / 2f);
int numHits = search(protectedAnalyzer, "repetitiveField:boring");
assertTrue("A filter on one Field should not affect queris on another", numHits > 0);
- protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField", "repetitiveField"), 1f / 2f);
numHits = search(protectedAnalyzer, "repetitiveField:boring");
assertEquals("A filter on the right Field should affect queries on it", numHits, 0);
}
- public void testAddStopWordsIndexReaderStringInt() throws Exception {
- int numStopWords = protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+ public void testStopwordsPerFieldMaxDocFreq() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
+ int numStopWords = protectedAnalyzer.getStopWords("repetitiveField").length;
assertTrue("Should have identified stop words", numStopWords > 0);
- Term[] t = protectedAnalyzer.getStopWords();
- assertEquals("num terms should = num stopwords returned", t.length, numStopWords);
-
- int numNewStopWords = protectedAnalyzer.addStopWords(reader, "variedField", 10);
- assertTrue("Should have identified more stop words", numNewStopWords > 0);
- t = protectedAnalyzer.getStopWords();
- assertEquals("num terms should = num stopwords returned", t.length, numStopWords + numNewStopWords);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField", "variedField"), 10);
+ int numNewStopWords = protectedAnalyzer.getStopWords("repetitiveField").length + protectedAnalyzer.getStopWords("variedField").length;
+ assertTrue("Should have identified more stop words", numNewStopWords > numStopWords);
}
public void testNoFieldNamePollution() throws Exception {
- protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
int numHits = search(protectedAnalyzer, "repetitiveField:boring");
assertEquals("Check filter set up OK", 0, numHits);
numHits = search(protectedAnalyzer, "variedField:boring");
assertTrue("Filter should not prevent stopwords in one field being used in another ", numHits > 0);
-
}
/*
@@ -165,8 +155,7 @@
}
public void testWrappingNonReusableAnalyzer() throws Exception {
- QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new NonreusableAnalyzer());
- a.addStopWords(reader, 10);
+ QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new NonreusableAnalyzer(), reader, 10);
int numHits = search(a, "repetitiveField:boring");
assertTrue(numHits == 0);
numHits = search(a, "repetitiveField:vaguelyboring");
@@ -174,8 +163,9 @@
}
public void testTokenStream() throws Exception {
- QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
- a.addStopWords(reader, 10);
+ QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(
+ TEST_VERSION_CURRENT,
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), reader, 10);
TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
assertTokenStreamContents(ts, new String[] { "this" });
}
Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
===================================================================
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java (revision 1124242)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java (revision )
@@ -44,8 +44,9 @@
*
*/
public final class QueryAutoStopWordAnalyzer extends Analyzer {
- Analyzer delegate;
- HashMap<String, HashSet<String>> stopWordsPerField = new HashMap<String, HashSet<String>>();
+
+ private final Analyzer delegate;
+ private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
//The default maximum percentage (40%) of index documents which
//can contain a term, after which the term is considered to be a stop word.
public static final float defaultMaxDocFreqPercent = 0.4f;
@@ -55,20 +56,145 @@
* Initializes this analyzer with the Analyzer object that actually produces the tokens
*
* @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
+ * @deprecated Stopwords should be calculated at instantiation using one of the other constructors
*/
+ @Deprecated
public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
this.delegate = delegate;
this.matchVersion = matchVersion;
}
- /**
+ /**
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+ * indexed fields from terms with a document frequency percentage greater than
+ * {@link #defaultMaxDocFreqPercent}
+ *
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+ * @throws IOException Can be thrown while reading from the IndexReader
+ */
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader) throws IOException {
+ this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
+ }
+
+ /**
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+ * indexed fields from terms with a document frequency greater than the given
+ * maxDocFreq
+ *
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+ * @param maxDocFreq Terms with a document frequency greater than this value are treated as stopwords
+ * @throws IOException Can be thrown while reading from the IndexReader
+ */
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+ int maxDocFreq) throws IOException {
+ this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
+ }
+
+ /**
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+ * indexed fields from terms with a document frequency percentage greater than
+ * the given maxPercentDocs
+ *
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+ * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+ * contain a term, after which the word is considered to be a stop word
+ * @throws IOException Can be thrown while reading from the IndexReader
+ */
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+ float maxPercentDocs) throws IOException {
+ this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
+ }
+
+ /**
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+ * given selection of fields from terms with a document frequency percentage
+ * greater than the given maxPercentDocs
+ *
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+ * @param fields Selection of fields to calculate stopwords for
+ * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+ * contain a term, after which the word is considered to be a stop word
+ * @throws IOException Can be thrown while reading from the IndexReader
+ */
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+ Collection<String> fields,
+ float maxPercentDocs) throws IOException {
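+ // translate the percentage into an absolute document-frequency cutoff (the cast truncates towards zero)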
+ this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
+ }
+
+ /**
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+ * given selection of fields from terms with a document frequency greater than
+ * the given maxDocFreq
+ *
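+ * <p>A minimal usage sketch (the field name and cutoff are illustrative):
+ * <pre>
+ *   IndexReader reader = IndexReader.open(directory, true);
+ *   Analyzer filtered = new QueryAutoStopWordAnalyzer(
+ *       matchVersion, delegateAnalyzer, reader, Arrays.asList("body"), 100);
+ * </pre>
+ *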
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+ * @param fields Selection of fields to calculate stopwords for
+ * @param maxDocFreq Terms with a document frequency greater than this value are treated as stopwords
+ * @throws IOException Can be thrown while reading from the IndexReader
+ */
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+ Collection<String> fields,
+ int maxDocFreq) throws IOException {
+ this.matchVersion = matchVersion;
+ this.delegate = delegate;
+
+ for (String field : fields) {
+ Set<String> stopWords = new HashSet<String>();
+ String internedFieldName = StringHelper.intern(field);
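+ // position the enumeration at the first term of this field; the term
+ // enumeration is ordered by field name, then by term text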
+ TermEnum te = indexReader.terms(new Term(field));
+ Term term = te.term();
+ while (term != null) {
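+ // Term field names are interned, so a reference comparison is enough to
+ // detect when the enumeration has moved past this field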
+ if (term.field() != internedFieldName) {
+ break;
+ }
+ if (te.docFreq() > maxDocFreq) {
+ stopWords.add(term.text());
+ }
+ if (!te.next()) {
+ break;
+ }
+ term = te.term();
+ }
+ stopWordsPerField.put(field, stopWords);
+ }
+ }
+
+ /**
* Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
*
* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
* exceed the required document frequency
* @return The number of stop words identified.
* @throws IOException
+ * @deprecated Stopwords should be calculated at instantiation using
+ * {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader)}
*/
+ @Deprecated
public int addStopWords(IndexReader reader) throws IOException {
return addStopWords(reader, defaultMaxDocFreqPercent);
}
@@ -82,7 +208,10 @@
* the term is considered to be a stop word
* @return The number of stop words identified.
* @throws IOException
+ * @deprecated Stopwords should be calculated at instantiation using
+ * {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, int)}
*/
+ @Deprecated
public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
int numStopWords = 0;
Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
@@ -102,7 +231,10 @@
* contain a term, after which the word is considered to be a stop word.
* @return The number of stop words identified.
* @throws IOException
+ * @deprecated Stopwords should be calculated at instantiation using
+ * {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, float)}
*/
+ @Deprecated
public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
int numStopWords = 0;
Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
@@ -123,7 +255,10 @@
* contain a term, after which the word is considered to be a stop word.
* @return The number of stop words identified.
* @throws IOException
+ * @deprecated Stopwords should be calculated at instantiation using
+ * {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, float)}
*/
+ @Deprecated
public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
}
@@ -138,7 +273,10 @@
* can contain a term, after which the term is considered to be a stop word.
* @return The number of stop words identified.
* @throws IOException
+ * @deprecated Stopwords should be calculated at instantiation using
+ * {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, int)}
*/
+ @Deprecated
public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
HashSet<String> stopWords = new HashSet<String>();
String internedFieldName = StringHelper.intern(fieldName);
@@ -177,7 +315,7 @@
} catch (IOException e) {
result = delegate.tokenStream(fieldName, reader);
}
- HashSet<String> stopWords = stopWordsPerField.get(fieldName);
+ Set<String> stopWords = stopWordsPerField.get(fieldName);
if (stopWords != null) {
result = new StopFilter(matchVersion, result, stopWords);
}
@@ -194,12 +332,12 @@
*/
TokenStream withStopFilter;
}
-
+
+ @SuppressWarnings("unchecked")
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
/* map of SavedStreams for each field */
- @SuppressWarnings("unchecked")
Map<String, SavedStreams> streamMap = (Map<String, SavedStreams>) getPreviousTokenStream();
if (streamMap == null) {
streamMap = new HashMap<String, SavedStreams>();
@@ -214,11 +352,12 @@
streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
/* if there are any stopwords for the field, save the stopfilter */
- HashSet<String> stopWords = stopWordsPerField.get(fieldName);
- if (stopWords != null)
+ Set<String> stopWords = stopWordsPerField.get(fieldName);
+ if (stopWords != null) {
streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
- else
+ } else {
streams.withStopFilter = streams.wrapped;
+ }
} else {
/*
@@ -234,13 +373,14 @@
* field, create a new StopFilter around the new stream
*/
streams.wrapped = result;
- HashSet<String> stopWords = stopWordsPerField.get(fieldName);
- if (stopWords != null)
+ Set<String> stopWords = stopWordsPerField.get(fieldName);
+ if (stopWords != null) {
streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
- else
+ } else {
streams.withStopFilter = streams.wrapped;
+ }
}
}
return streams.withStopFilter;
}
@@ -253,15 +393,9 @@
* @return the stop words identified for a field
*/
public String[] getStopWords(String fieldName) {
- String[] result;
- HashSet<String> stopWords = stopWordsPerField.get(fieldName);
- if (stopWords != null) {
- result = stopWords.toArray(new String[stopWords.size()]);
- } else {
- result = new String[0];
- }
- return result;
- }
+ Set<String> stopWords = stopWordsPerField.get(fieldName);
+ return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
+ }
/**
* Provides information on which stop words have been identified for all fields
@@ -269,12 +403,10 @@
* @return the stop words (as terms)
*/
public Term[] getStopWords() {
- ArrayList<Term> allStopWords = new ArrayList<Term>();
- for (Iterator<String> iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
- String fieldName = iter.next();
- HashSet<String> stopWords = stopWordsPerField.get(fieldName);
- for (Iterator<String> iterator = stopWords.iterator(); iterator.hasNext();) {
- String text = iterator.next();
+ List<Term> allStopWords = new ArrayList<Term>();
+ for (String fieldName : stopWordsPerField.keySet()) {
+ Set<String> stopWords = stopWordsPerField.get(fieldName);
+ for (String text : stopWords) {
allStopWords.add(new Term(fieldName, text));
}
}