Index: lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java (revision 0)
+++ lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java (revision 0)
@@ -0,0 +1,94 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.queryParser.QueryParser; // for javadoc
+
+/**
+ * Abstract base class for TokenFilters that may remove tokens.
+ * You have to implement {@link #accept} and return whether the current
+ * token should be preserved. {@link #incrementToken} uses this method
+ * to decide if a token should be passed to the caller.
+ */
+public abstract class FilteringTokenFilter extends TokenFilter {
+
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+
+ public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
+ super(input);
+ this.enablePositionIncrements = enablePositionIncrements; // initial value only; setEnablePositionIncrements can change it later
+ }
+
+ /** Override this method and return {@code true} if the current input token should be returned by {@link #incrementToken}, or {@code false} to have it skipped. */
+ protected abstract boolean accept() throws IOException;
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (enablePositionIncrements) {
+ int skippedPositions = 0; // sum of the position increments of the tokens rejected so far in this call
+ while (input.incrementToken()) {
+ if (accept()) {
+ if (skippedPositions != 0) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); // fold the skipped positions into the accepted token
+ }
+ return true;
+ }
+ skippedPositions += posIncrAtt.getPositionIncrement(); // token rejected: remember the positions it covered
+ }
+ } else {
+ while (input.incrementToken()) { // increments not preserved: rejected tokens are dropped without adjusting positions
+ if (accept()) {
+ return true;
+ }
+ }
+ }
+ // input is exhausted and no further token was accepted -- return false
+ return false;
+ }
+
+ /** Returns whether the position increments of skipped tokens are added to the next returned token.
+ * @see #setEnablePositionIncrements(boolean)
+ */
+ public boolean getEnablePositionIncrements() {
+ return enablePositionIncrements;
+ }
+
+ /**
+ * If true, this TokenFilter will preserve
+ * positions of the incoming tokens (ie, accumulate and
+ * set position increments of the removed tokens).
+ * Generally, true is best as it does not
+ * lose information (positions of the original tokens)
+ * during indexing.
+ *
+ * <p> When set, when a token is stopped
+ * (omitted), the position increment of the following
+ * token is incremented.
+ *
+ * <p> NOTE: be sure to also
+ * set {@link QueryParser#setEnablePositionIncrements} if
+ * you use QueryParser to create queries.
+ */
+ public void setEnablePositionIncrements(boolean enable) {
+ this.enablePositionIncrements = enable;
+ }
+}
Property changes on: lucene\src\java\org\apache\lucene\analysis\FilteringTokenFilter.java
___________________________________________________________________
Added: svn:keywords
   + Date Author Id Revision HeadURL
Added: svn:eol-style
   + native
Index: lucene/src/java/org/apache/lucene/analysis/LengthFilter.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/LengthFilter.java (revision 1065314)
+++ lucene/src/java/org/apache/lucene/analysis/LengthFilter.java (working copy)
@@ -27,7 +27,7 @@
* Note: Length is calculated as the number of UTF-16 code units.
* </p>
*/
-public final class LengthFilter extends TokenFilter {
+public final class LengthFilter extends FilteringTokenFilter {
private final int min;
private final int max;
@@ -38,27 +38,24 @@
* Build a filter that removes words that are too long or too
* short from the text.
*/
- public LengthFilter(TokenStream in, int min, int max)
- {
- super(in);
+ public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
+ super(enablePositionIncrements, in);
this.min = min;
this.max = max;
}
/**
- * Returns the next input Token whose term() is the right len
+ * Build a filter that removes words that are too long or too
+ * short from the text.
+ * @deprecated Use {@link #LengthFilter(boolean, TokenStream, int, int)} instead.
*/
+ public LengthFilter(TokenStream in, int min, int max) {
+ this(false, in, min, max);
+ }
+
@Override
- public final boolean incrementToken() throws IOException {
- // return the first non-stop word found
- while (input.incrementToken()) {
- int len = termAtt.length();
- if (len >= min && len <= max) {
- return true;
- }
- // note: else we ignore it but should we index each part of it?
- }
- // reached EOS -- return false
- return false;
+ public boolean accept() throws IOException {
+ final int len = termAtt.length();
+ return (len >= min && len <= max);
}
}
Index: lucene/src/java/org/apache/lucene/analysis/StopFilter.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/StopFilter.java (revision 1065314)
+++ lucene/src/java/org/apache/lucene/analysis/StopFilter.java (working copy)
@@ -22,7 +22,6 @@
import java.util.Set;
import java.util.List;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.util.Version;
@@ -39,13 +38,10 @@
* increments are preserved
*
*/
-public final class StopFilter extends TokenFilter {
+public final class StopFilter extends FilteringTokenFilter {
private final CharArraySet stopWords;
- private boolean enablePositionIncrements = false;
-
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Construct a token stream filtering the given input.
@@ -101,9 +97,8 @@
* convenience ctor to enable deprecated ctors to set posInc explicitly
*/
private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase){
- super(input);
+ super(enablePositionIncrements, input);
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet)stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
- this.enablePositionIncrements = enablePositionIncrements;
}
/**
@@ -251,20 +246,8 @@
* Returns the next input Token whose term() is not a stop word.
*/
@Override
- public final boolean incrementToken() throws IOException {
- // return the first non-stop word found
- int skippedPositions = 0;
- while (input.incrementToken()) {
- if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
- if (enablePositionIncrements) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
- return true;
- }
- skippedPositions += posIncrAtt.getPositionIncrement();
- }
- // reached EOS -- return false
- return false;
+ protected boolean accept() throws IOException {
+ return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
}
/**
@@ -279,31 +262,4 @@
public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
return matchVersion.onOrAfter(Version.LUCENE_29);
}
-
- /**
- * @see #setEnablePositionIncrements(boolean)
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
- }
-
- /**
- * If true, this StopFilter will preserve
- * positions of the incoming tokens (ie, accumulate and
- * set position increments of the removed stop tokens).
- * Generally, true is best as it does not
- * lose information (positions of the original tokens)
- * during indexing.
- *
- * <p> When set, when a token is stopped
- * (omitted), the position increment of the following
- * token is incremented.
- *
- * <p> NOTE: be sure to also
- * set {@link QueryParser#setEnablePositionIncrements} if
- * you use QueryParser to create queries.
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
}
Index: lucene/src/test/org/apache/lucene/analysis/TestLengthFilter.java
===================================================================
--- lucene/src/test/org/apache/lucene/analysis/TestLengthFilter.java (revision 1065314)
+++ lucene/src/test/org/apache/lucene/analysis/TestLengthFilter.java (working copy)
@@ -22,19 +22,24 @@
public class TestLengthFilter extends BaseTokenStreamTestCase {
- public void testFilter() throws Exception {
+ public void testFilterNoPosIncr() throws Exception { // position increments disabled (first constructor argument is false)
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+ LengthFilter filter = new LengthFilter(false, stream, 2, 6); // keep only tokens whose length is within 2..6
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 1, 1} // expected position increments: removed tokens leave no gap
+ );
+ }
- assertTrue(filter.incrementToken());
- assertEquals("short", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("ab", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("foo", termAtt.toString());
- assertFalse(filter.incrementToken());
+ public void testFilterWithPosIncr() throws Exception { // position increments enabled (first constructor argument is true)
+ TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+ LengthFilter filter = new LengthFilter(true, stream, 2, 6); // keep only tokens whose length is within 2..6
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 4, 2} // expected position increments: "ab" follows 3 removed tokens (1 + 3), "foo" follows 1 (1 + 1)
+ );
}
}
Index: solr/src/java/org/apache/solr/analysis/KeepWordFilter.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/KeepWordFilter.java (revision 1065314)
+++ solr/src/java/org/apache/solr/analysis/KeepWordFilter.java (working copy)
@@ -17,7 +17,7 @@
package org.apache.solr.analysis;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -32,28 +32,25 @@
* @version $Id$
* @since solr 1.3
*/
-public final class KeepWordFilter extends TokenFilter {
+public final class KeepWordFilter extends FilteringTokenFilter {
private final CharArraySet words;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
+ /** @deprecated Use {@link #KeepWordFilter(boolean, TokenStream, CharArraySet)} instead */
@Deprecated
public KeepWordFilter(TokenStream in, Set