Property changes on: .
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/branch_3x:r1065324
Property changes on: lucene
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/branch_3x/lucene:r1065324
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 1065331)
+++ lucene/CHANGES.txt (working copy)
@@ -636,6 +636,12 @@
Scorer.visitSubScorers (LUCENE-2590) will work correctly.
(Robert Muir, Doron Cohen)
+* LUCENE-1253: LengthFilter (and Solr's KeepWordFilter) now
+ require up front specification of enablePositionIncrements. Together with
+ StopFilter they have a common base class (FilteringTokenFilter) that handles
+ the position increments automatically. Implementors only need to override an
+ accept() method that filters tokens. (Uwe Schindler, Robert Muir)
+
Bug fixes
* LUCENE-2249: ParallelMultiSearcher should shut down thread pool on
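Note on the new API described in the CHANGES entry above: subclasses hand enablePositionIncrements to the FilteringTokenFilter constructor and implement only accept(); the base class walks the input stream, drops rejected tokens, and (when the flag is set) folds their position increments into the next accepted token. The sketch below is illustration only and is not part of this patch; the class name DigitOnlyFilter is made up.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;

// Hypothetical example: keep only tokens that consist entirely of digits.
// Only accept() is implemented; FilteringTokenFilter handles skipping
// rejected tokens and, when enablePositionIncrements is true, adding their
// position increments to the next kept token.
public final class DigitOnlyFilter extends FilteringTokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public DigitOnlyFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in);
  }

  @Override
  protected boolean accept() throws IOException {
    // Reject empty tokens and any token containing a non-digit character.
    for (int i = 0; i < termAtt.length(); i++) {
      if (!Character.isDigit(termAtt.charAt(i))) {
        return false;
      }
    }
    return termAtt.length() > 0;
  }
}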
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java (revision 1065331)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java (working copy)
@@ -22,10 +22,9 @@
import java.util.List;
import java.util.Set;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;
@@ -42,15 +41,11 @@
* increments are preserved
*
*/
-public final class StopFilter extends TokenFilter {
+public final class StopFilter extends FilteringTokenFilter {
private final CharArraySet stopWords;
- private boolean enablePositionIncrements = true;
-
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-
/**
* Construct a token stream filtering the given input. If
* stopWords is an instance of {@link CharArraySet} (true if
@@ -75,7 +70,7 @@
*/
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
{
- super(input);
+ super(true, input);
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
}
@@ -157,48 +152,8 @@
* Returns the next input Token whose term() is not a stop word.
*/
@Override
- public final boolean incrementToken() throws IOException {
- // return the first non-stop word found
- int skippedPositions = 0;
- while (input.incrementToken()) {
- if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
- if (enablePositionIncrements) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
- return true;
- }
- skippedPositions += posIncrAtt.getPositionIncrement();
- }
- // reached EOS -- return false
- return false;
+ protected boolean accept() throws IOException {
+ return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
}
- /**
- * @see #setEnablePositionIncrements(boolean)
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
- }
-
- /**
- * If true, this StopFilter will preserve
- * positions of the incoming tokens (ie, accumulate and
- * set position increments of the removed stop tokens).
- * Generally, true is best as it does not
- * lose information (positions of the original tokens)
- * during indexing.
- *
- * Default is true.
- *
- * When set, when a token is stopped
- * (omitted), the position increment of the following
- * token is incremented.
- *
- * NOTE: be sure to also
- * set {@link QueryParser#setEnablePositionIncrements} if
- * you use QueryParser to create queries.
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (revision 1065331)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (working copy)
@@ -21,31 +21,33 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
+import java.io.IOException;
+import java.util.Set;
+
/**
* A TokenFilter that only keeps tokens with text contained in the
* required words. This filter behaves like the inverse of StopFilter.
*
+ * @version $Id$
* @since solr 1.3
*/
-public final class KeepWordFilter extends TokenFilter {
+public final class KeepWordFilter extends FilteringTokenFilter {
private final CharArraySet words;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** The words set passed to this constructor will be directly used by this filter
* and should not be modified, */
- public KeepWordFilter(TokenStream in, CharArraySet words) {
- super(in);
+ public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
+ super(enablePositionIncrements, in);
this.words = words;
}
@Override
- public boolean incrementToken() throws IOException {
- while (input.incrementToken()) {
- if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true;
- }
- return false;
+ public boolean accept() throws IOException {
+ return words.contains(termAtt.buffer(), 0, termAtt.length());
}
}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (revision 1065331)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (working copy)
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
@@ -29,7 +30,7 @@
* Note: Length is calculated as the number of UTF-16 code units.
*
*/
-public final class LengthFilter extends TokenFilter {
+public final class LengthFilter extends FilteringTokenFilter {
private final int min;
private final int max;
@@ -40,27 +41,15 @@
* Build a filter that removes words that are too long or too
* short from the text.
*/
- public LengthFilter(TokenStream in, int min, int max)
- {
- super(in);
+ public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
+ super(enablePositionIncrements, in);
this.min = min;
this.max = max;
}
- /**
- * Returns the next input Token whose term() is the right len
- */
@Override
- public final boolean incrementToken() throws IOException {
- // return the first non-stop word found
- while (input.incrementToken()) {
- int len = termAtt.length();
- if (len >= min && len <= max) {
- return true;
- }
- // note: else we ignore it but should we index each part of it?
- }
- // reached EOS -- return false
- return false;
+ public boolean accept() throws IOException {
+ final int len = termAtt.length();
+ return (len >= min && len <= max);
}
}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (revision 1065324)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (working copy)
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis;
+package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -19,6 +19,8 @@
import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (revision 1065331)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (working copy)
@@ -35,16 +35,26 @@
words.add( "aaa" );
words.add( "bbb" );
- String input = "aaa BBB ccc ddd EEE";
+ String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
// Test Stopwords
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
- stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
- assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
+ stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
// Now force case
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
- stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
- assertTokenStreamContents(stream, new String[] { "aaa" });
+ stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
+
+ // Test Stopwords
+ stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
+
+ // Now force case
+ stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
}
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (revision 1065331)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (working copy)
@@ -24,19 +24,24 @@
public class TestLengthFilter extends BaseTokenStreamTestCase {
- public void testFilter() throws Exception {
+ public void testFilterNoPosIncr() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
- LengthFilter filter = new LengthFilter(stream, 2, 6);
- CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ LengthFilter filter = new LengthFilter(false, stream, 2, 6);
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 1, 1}
+ );
+ }
- assertTrue(filter.incrementToken());
- assertEquals("short", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("ab", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("foo", termAtt.toString());
- assertFalse(filter.incrementToken());
+ public void testFilterWithPosIncr() throws Exception {
+ TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+ LengthFilter filter = new LengthFilter(true, stream, 2, 6);
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 4, 2}
+ );
}
}
Property changes on: solr
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/branch_3x/solr:r1065324
Index: solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (revision 1065331)
+++ solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (working copy)
@@ -23,22 +23,27 @@
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.util.CharArraySet;
+import java.util.Map;
import java.util.Set;
import java.io.IOException;
/**
* @version $Id$
- * @since solr 1.3
*/
public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
- private CharArraySet words;
- private boolean ignoreCase;
+ @Override
+ public void init(Map
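Usage note on the refactored constructors exercised by the updated tests above: the enablePositionIncrements flag is now passed up front as the first constructor argument, and with it enabled the increments of dropped tokens are preserved (the new TestLengthFilter expects short/1, ab/4, foo/2 for the sample input). The following standalone sketch is not part of the patch; the class name LengthFilterSketch and the use of Version.LUCENE_CURRENT are illustrative assumptions.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class LengthFilterSketch {
  public static void main(String[] args) throws Exception {
    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
    // New signature: the enablePositionIncrements flag comes first.
    stream = new LengthFilter(true, stream, 2, 6);
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // Prints short/1, ab/4, foo/2: the increments account for dropped tokens.
      System.out.println(term + "/" + posIncr.getPositionIncrement());
    }
    stream.end();
    stream.close();
  }
}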