Index: modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java (revision 1127326)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java (revision )
@@ -41,132 +41,131 @@
* a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
* this term to take 2 seconds.
*
- *
- * Use the various "addStopWords" methods in this class to automate the identification and addition of
- * stop words found in an already existing index.
- *
*/
public final class QueryAutoStopWordAnalyzer extends Analyzer {
- Analyzer delegate;
-  HashMap<String, HashSet<String>> stopWordsPerField = new HashMap<String, HashSet<String>>();
+
+ private final Analyzer delegate;
+  private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
//The default maximum percentage (40%) of index documents which
//can contain a term, after which the term is considered to be a stop word.
public static final float defaultMaxDocFreqPercent = 0.4f;
private final Version matchVersion;
/**
- * Initializes this analyzer with the Analyzer object that actually produces the tokens
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+ * indexed fields from terms with a document frequency percentage greater than
+ * {@link #defaultMaxDocFreqPercent}
*
- * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+   * @throws IOException if an error occurs while reading from the IndexReader
*/
- public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
- this.delegate = delegate;
- this.matchVersion = matchVersion;
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader) throws IOException {
+ this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
}
/**
- * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+ * indexed fields from terms with a document frequency greater than the given
+ * maxDocFreq
*
- * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
- * exceed the required document frequency
- * @return The number of stop words identified.
- * @throws IOException
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+   * @param maxDocFreq Document frequency above which a term is considered a stop word
+   * @throws IOException if an error occurs while reading from the IndexReader
*/
- public int addStopWords(IndexReader reader) throws IOException {
- return addStopWords(reader, defaultMaxDocFreqPercent);
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+ int maxDocFreq) throws IOException {
+ this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
}
/**
- * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+ * indexed fields from terms with a document frequency percentage greater than
+ * the given maxPercentDocs
*
- * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
- * exceed the required document frequency
- * @param maxDocFreq The maximum number of index documents which can contain a term, after which
- * the term is considered to be a stop word
- * @return The number of stop words identified.
- * @throws IOException
- */
- public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
- int numStopWords = 0;
-    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
-    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
- String fieldName = iter.next();
- numStopWords += addStopWords(reader, fieldName, maxDocFreq);
- }
- return numStopWords;
- }
-
- /**
- * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
- *
- * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
- * exceed the required document frequency
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
- * contain a term, after which the word is considered to be a stop word.
- * @return The number of stop words identified.
- * @throws IOException
+ * contain a term, after which the word is considered to be a stop word
+   * @throws IOException if an error occurs while reading from the IndexReader
*/
- public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
- int numStopWords = 0;
-    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
-    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
- String fieldName = iter.next();
- numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+ float maxPercentDocs) throws IOException {
+ this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
- }
+ }
- return numStopWords;
- }
/**
- * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+ * given selection of fields from terms with a document frequency percentage
+ * greater than the given maxPercentDocs
*
- * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
- * exceed the required document frequency
- * @param fieldName The field for which stopwords will be added
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+ * @param fields Selection of fields to calculate stopwords for
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
- * contain a term, after which the word is considered to be a stop word.
- * @return The number of stop words identified.
- * @throws IOException
+ * contain a term, after which the word is considered to be a stop word
+   * @throws IOException if an error occurs while reading from the IndexReader
*/
- public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
- return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+      Collection<String> fields,
+ float maxPercentDocs) throws IOException {
+ this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
}
/**
- * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
+ * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+ * given selection of fields from terms with a document frequency greater than
+ * the given maxDocFreq
*
- * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
- * exceed the required document frequency
- * @param fieldName The field for which stopwords will be added
- * @param maxDocFreq The maximum number of index documents which
- * can contain a term, after which the term is considered to be a stop word.
- * @return The number of stop words identified.
- * @throws IOException
+ * @param matchVersion Version to be used in {@link StopFilter}
+ * @param delegate Analyzer whose TokenStream will be filtered
+ * @param indexReader IndexReader to identify the stopwords from
+ * @param fields Selection of fields to calculate stopwords for
+   * @param maxDocFreq Document frequency above which a term is considered a stop word
+   * @throws IOException if an error occurs while reading from the IndexReader
*/
- public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
-    HashSet<String> stopWords = new HashSet<String>();
- final Terms terms = MultiFields.getTerms(reader, fieldName);
- final CharsRef spare = new CharsRef();
+ public QueryAutoStopWordAnalyzer(
+ Version matchVersion,
+ Analyzer delegate,
+ IndexReader indexReader,
+      Collection<String> fields,
+ int maxDocFreq) throws IOException {
+ this.matchVersion = matchVersion;
+ this.delegate = delegate;
+
+ for (String field : fields) {
+      Set<String> stopWords = new HashSet<String>();
+ Terms terms = MultiFields.getTerms(indexReader, field);
+ CharsRef spare = new CharsRef();
- if (terms != null) {
+ if (terms != null) {
- final TermsEnum te = terms.iterator();
+ TermsEnum te = terms.iterator();
- BytesRef text;
- while ((text = te.next()) != null) {
- if (te.docFreq() > maxDocFreq) {
- stopWords.add(text.utf8ToChars(spare).toString());
- }
- }
- }
+ BytesRef text;
+ while ((text = te.next()) != null) {
+ if (te.docFreq() > maxDocFreq) {
+ stopWords.add(text.utf8ToChars(spare).toString());
+ }
+ }
+ }
- stopWordsPerField.put(fieldName, stopWords);
-
- /* if the stopwords for a field are changed,
- * then saved streams for that field are erased.
- */
- @SuppressWarnings("unchecked")
-    Map<String, SavedStreams> streamMap = (Map<String, SavedStreams>) getPreviousTokenStream();
- if (streamMap != null)
- streamMap.remove(fieldName);
-
- return stopWords.size();
+ stopWordsPerField.put(field, stopWords);
- }
+ }
+ }
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -176,7 +175,7 @@
} catch (IOException e) {
result = delegate.tokenStream(fieldName, reader);
}
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
if (stopWords != null) {
result = new StopFilter(matchVersion, result, stopWords);
}
@@ -193,12 +192,11 @@
*/
TokenStream withStopFilter;
}
-
+
+ @SuppressWarnings("unchecked")
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
/* map of SavedStreams for each field */
- @SuppressWarnings("unchecked")
    Map<String, SavedStreams> streamMap = (Map<String, SavedStreams>) getPreviousTokenStream();
if (streamMap == null) {
      streamMap = new HashMap<String, SavedStreams>();
@@ -213,33 +211,34 @@
streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
/* if there are any stopwords for the field, save the stopfilter */
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      if (stopWords != null)
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+ if (stopWords != null) {
streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
- else
+ } else {
streams.withStopFilter = streams.wrapped;
-
+ }
} else {
/*
- * an entry for this field exists, verify the wrapped stream has not
- * changed. if it has not, reuse it, otherwise wrap the new stream.
- */
+ * an entry for this field exists, verify the wrapped stream has not
+ * changed. if it has not, reuse it, otherwise wrap the new stream.
+ */
TokenStream result = delegate.reusableTokenStream(fieldName, reader);
if (result == streams.wrapped) {
/* the wrapped analyzer reused the stream */
} else {
/*
- * the wrapped analyzer did not. if there are any stopwords for the
- * field, create a new StopFilter around the new stream
- */
+ * the wrapped analyzer did not. if there are any stopwords for the
+ * field, create a new StopFilter around the new stream
+ */
streams.wrapped = result;
-        HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-        if (stopWords != null)
+        Set<String> stopWords = stopWordsPerField.get(fieldName);
+ if (stopWords != null) {
streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
- else
+ } else {
streams.withStopFilter = streams.wrapped;
- }
- }
+ }
+ }
+ }
return streams.withStopFilter;
}
@@ -252,15 +251,9 @@
* @return the stop words identified for a field
*/
public String[] getStopWords(String fieldName) {
- String[] result;
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
- if (stopWords != null) {
- result = stopWords.toArray(new String[stopWords.size()]);
- } else {
- result = new String[0];
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
+ return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
- }
+ }
- return result;
- }
/**
* Provides information on which stop words have been identified for all fields
@@ -268,12 +261,10 @@
* @return the stop words (as terms)
*/
public Term[] getStopWords() {
-    ArrayList<Term> allStopWords = new ArrayList<Term>();
-    for (Iterator<String> iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      for (Iterator<String> iterator = stopWords.iterator(); iterator.hasNext();) {
- String text = iterator.next();
+    List<Term> allStopWords = new ArrayList<Term>();
+    for (String fieldName : stopWordsPerField.keySet()) {
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+ for (String text : stopWords) {
allStopWords.add(new Term(fieldName, text));
}
}
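
The hunks above replace the mutable addStopWords(...) methods with constructors that compute the per-field stop word sets once, up front. A minimal usage sketch of the reworked API follows; it is not part of the patch, and the reader/MockAnalyzer setup simply mirrors the test fixture below:

    // Sketch only. dir is an already-populated Directory, as in the test setUp.
    IndexReader reader = IndexReader.open(dir, true);
    Analyzer delegate = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);

    // Old API (removed above): construct, then mutate via addStopWords(reader, 1f / 2f).
    // New API: terms appearing in more than half of all documents become stop
    // words for every indexed field, decided once in the constructor.
    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(
        TEST_VERSION_CURRENT, delegate, reader, 1f / 2f);
    TokenStream ts = a.reusableTokenStream("repetitiveField", new StringReader("boring"));

Because the stop word sets can no longer change after construction, the bookkeeping that erased saved token streams whenever addStopWords was called (the removed getPreviousTokenStream()/streamMap.remove(fieldName) block) becomes unnecessary, and stopWordsPerField can be declared final.
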
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (revision 1169607)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (revision )
@@ -16,23 +16,19 @@
* limitations under the License.
*/
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+
public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
String repetitiveFieldValues[] = {"boring", "boring", "vaguelyboring"};
@@ -58,7 +54,6 @@
}
writer.close();
reader = IndexReader.open(dir, true);
- protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer);
}
@Override
@@ -67,9 +62,9 @@
super.tearDown();
}
- public void testUninitializedAnalyzer() throws Exception {
- // Note: no calls to "addStopWord"
- // query = "variedField:quick repetitiveField:boring";
+ public void testNoStopwords() throws Exception {
+ // Note: an empty list of fields passed in
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Collections.<String>emptyList(), 1);
TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("variedField", new StringReader("quick"));
assertTokenStreamContents(protectedTokenStream, new String[]{"quick"});
@@ -77,21 +72,14 @@
assertTokenStreamContents(protectedTokenStream, new String[]{"boring"});
}
- /*
- * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader)'
- */
- public void testDefaultAddStopWordsIndexReader() throws Exception {
- protectedAnalyzer.addStopWords(reader);
+ public void testDefaultStopwordsAllFields() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader);
TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
-
assertTokenStreamContents(protectedTokenStream, new String[0]); // Default stop word filtering will remove boring
}
- /*
- * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, int)'
- */
- public void testAddStopWordsIndexReaderInt() throws Exception {
- protectedAnalyzer.addStopWords(reader, 1f / 2f);
+ public void testStopwordsAllFieldsMaxPercentDocs() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 2f);
TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
// A filter on terms in > one half of docs remove boring
@@ -101,39 +89,36 @@
// A filter on terms in > half of docs should not remove vaguelyBoring
assertTokenStreamContents(protectedTokenStream, new String[]{"vaguelyboring"});
- protectedAnalyzer.addStopWords(reader, 1f / 4f);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 4f);
protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("vaguelyboring"));
// A filter on terms in > quarter of docs should remove vaguelyBoring
assertTokenStreamContents(protectedTokenStream, new String[0]);
}
- public void testAddStopWordsIndexReaderStringFloat() throws Exception {
- protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f);
+ public void testStopwordsPerFieldMaxPercentDocs() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField"), 1f / 2f);
TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
// A filter on one Field should not affect queries on another
assertTokenStreamContents(protectedTokenStream, new String[]{"boring"});
- protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField", "repetitiveField"), 1f / 2f);
protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
// A filter on the right Field should affect queries on it
assertTokenStreamContents(protectedTokenStream, new String[0]);
}
- public void testAddStopWordsIndexReaderStringInt() throws Exception {
- int numStopWords = protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+ public void testStopwordsPerFieldMaxDocFreq() throws Exception {
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
+ int numStopWords = protectedAnalyzer.getStopWords("repetitiveField").length;
assertTrue("Should have identified stop words", numStopWords > 0);
- Term[] t = protectedAnalyzer.getStopWords();
- assertEquals("num terms should = num stopwords returned", t.length, numStopWords);
-
- int numNewStopWords = protectedAnalyzer.addStopWords(reader, "variedField", 10);
- assertTrue("Should have identified more stop words", numNewStopWords > 0);
- t = protectedAnalyzer.getStopWords();
- assertEquals("num terms should = num stopwords returned", t.length, numStopWords + numNewStopWords);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField", "variedField"), 10);
+ int numNewStopWords = protectedAnalyzer.getStopWords("repetitiveField").length + protectedAnalyzer.getStopWords("variedField").length;
+ assertTrue("Should have identified more stop words", numNewStopWords > numStopWords);
}
public void testNoFieldNamePollution() throws Exception {
- protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+ protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
// Check filter set up OK
@@ -145,8 +130,9 @@
}
public void testTokenStream() throws Exception {
- QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
- a.addStopWords(reader, 10);
+ QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(
+ TEST_VERSION_CURRENT,
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), reader, 10);
TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
assertTokenStreamContents(ts, new String[] { "this" });
}
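
The getStopWords accessors survive the rework and remain the way to inspect what was identified. A hypothetical follow-on, reusing the analyzer a built in testTokenStream above:

    String[] fieldStopWords = a.getStopWords("repetitiveField"); // stop words for one field
    Term[] allStopWords = a.getStopWords();                      // stop words across all fields, as Terms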