Index: lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java (revision 0) +++ lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java (revision 0) @@ -0,0 +1,94 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.queryParser.QueryParser; // for javadoc + +/** + * Abstract base class for TokenFilters that may remove tokens. + * You have to implement {@link #accept} and return {@code true} if the current + * token should be preserved. {@link #incrementToken} uses this method + * to decide if a token should be passed to the caller. + */ +public abstract class FilteringTokenFilter extends TokenFilter { + + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value! 
+ + public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){ + super(input); + this.enablePositionIncrements = enablePositionIncrements; + } + + /** Override this method and return {@code true} if the current input token should be returned by {@link #incrementToken}. */ + protected abstract boolean accept() throws IOException; + + @Override + public final boolean incrementToken() throws IOException { + if (enablePositionIncrements) { + int skippedPositions = 0; + while (input.incrementToken()) { + if (accept()) { + if (skippedPositions != 0) { + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); + } + return true; + } + skippedPositions += posIncrAtt.getPositionIncrement(); + } + } else { + while (input.incrementToken()) { + if (accept()) { + return true; + } + } + } + // reached EOS -- return false + return false; + } + + /** + * @see #setEnablePositionIncrements(boolean) + */ + public boolean getEnablePositionIncrements() { + return enablePositionIncrements; + } + + /** + * If true, this TokenFilter will preserve + * positions of the incoming tokens (ie, accumulate and + * set position increments of the removed tokens). + * Generally, true is best as it does not + * lose information (positions of the original tokens) + * during indexing. + * + *

<p> When set, when a token is stopped + * (omitted), the position increment of the following + * token is incremented. + * + *

<p> NOTE: be sure to also + * set {@link QueryParser#setEnablePositionIncrements} if + * you use QueryParser to create queries. + */ + public void setEnablePositionIncrements(boolean enable) { + this.enablePositionIncrements = enable; + } +} Property changes on: lucene\src\java\org\apache\lucene\analysis\FilteringTokenFilter.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/analysis/LengthFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/LengthFilter.java (revision 1065314) +++ lucene/src/java/org/apache/lucene/analysis/LengthFilter.java (working copy) @@ -27,7 +27,7 @@ * Note: Length is calculated as the number of UTF-16 code units. *

*/ -public final class LengthFilter extends TokenFilter { +public final class LengthFilter extends FilteringTokenFilter { private final int min; private final int max; @@ -38,27 +38,25 @@ * Build a filter that removes words that are too long or too * short from the text. */ - public LengthFilter(TokenStream in, int min, int max) - { - super(in); + public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) { + super(enablePositionIncrements, in); this.min = min; this.max = max; } /** - * Returns the next input Token whose term() is the right len + * Build a filter that removes words that are too long or too + * short from the text. + * @deprecated Use {@link #LengthFilter(boolean, TokenStream, int, int)} instead. */ + @Deprecated + public LengthFilter(TokenStream in, int min, int max) { + this(false, in, min, max); + } + @Override - public final boolean incrementToken() throws IOException { - // return the first non-stop word found - while (input.incrementToken()) { - int len = termAtt.length(); - if (len >= min && len <= max) { - return true; - } - // note: else we ignore it but should we index each part of it? 
- } - // reached EOS -- return false - return false; + public boolean accept() throws IOException { + final int len = termAtt.length(); + return (len >= min && len <= max); } } Index: lucene/src/java/org/apache/lucene/analysis/StopFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/StopFilter.java (revision 1065314) +++ lucene/src/java/org/apache/lucene/analysis/StopFilter.java (working copy) @@ -22,7 +22,6 @@ import java.util.Set; import java.util.List; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.queryParser.QueryParser; // for javadoc import org.apache.lucene.util.Version; @@ -39,13 +38,10 @@ * increments are preserved * */ -public final class StopFilter extends TokenFilter { +public final class StopFilter extends FilteringTokenFilter { private final CharArraySet stopWords; - private boolean enablePositionIncrements = false; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** * Construct a token stream filtering the given input. @@ -101,9 +97,8 @@ * convenience ctor to enable deprecated ctors to set posInc explicitly */ private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set stopWords, boolean ignoreCase){ - super(input); + super(enablePositionIncrements, input); this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet)stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase); - this.enablePositionIncrements = enablePositionIncrements; } /** @@ -251,20 +246,8 @@ * Returns the next input Token whose term() is not a stop word. 
*/ @Override - public final boolean incrementToken() throws IOException { - // return the first non-stop word found - int skippedPositions = 0; - while (input.incrementToken()) { - if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) { - if (enablePositionIncrements) { - posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); - } - return true; - } - skippedPositions += posIncrAtt.getPositionIncrement(); - } - // reached EOS -- return false - return false; + protected boolean accept() throws IOException { + return !stopWords.contains(termAtt.buffer(), 0, termAtt.length()); } /** @@ -279,31 +262,4 @@ public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) { return matchVersion.onOrAfter(Version.LUCENE_29); } - - /** - * @see #setEnablePositionIncrements(boolean) - */ - public boolean getEnablePositionIncrements() { - return enablePositionIncrements; - } - - /** - * If true, this StopFilter will preserve - * positions of the incoming tokens (ie, accumulate and - * set position increments of the removed stop tokens). - * Generally, true is best as it does not - * lose information (positions of the original tokens) - * during indexing. - * - *

<p> When set, when a token is stopped - * (omitted), the position increment of the following - * token is incremented. - * - *
<p>
NOTE: be sure to also - * set {@link QueryParser#setEnablePositionIncrements} if - * you use QueryParser to create queries. - */ - public void setEnablePositionIncrements(boolean enable) { - this.enablePositionIncrements = enable; - } } Index: lucene/src/test/org/apache/lucene/analysis/TestLengthFilter.java =================================================================== --- lucene/src/test/org/apache/lucene/analysis/TestLengthFilter.java (revision 1065314) +++ lucene/src/test/org/apache/lucene/analysis/TestLengthFilter.java (working copy) @@ -22,19 +22,24 @@ public class TestLengthFilter extends BaseTokenStreamTestCase { - public void testFilter() throws Exception { + public void testFilterNoPosIncr() throws Exception { TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("short toolong evenmuchlongertext a ab toolong foo")); - LengthFilter filter = new LengthFilter(stream, 2, 6); - CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); + LengthFilter filter = new LengthFilter(false, stream, 2, 6); + assertTokenStreamContents(filter, + new String[]{"short", "ab", "foo"}, + new int[]{1, 1, 1} + ); + } - assertTrue(filter.incrementToken()); - assertEquals("short", termAtt.toString()); - assertTrue(filter.incrementToken()); - assertEquals("ab", termAtt.toString()); - assertTrue(filter.incrementToken()); - assertEquals("foo", termAtt.toString()); - assertFalse(filter.incrementToken()); + public void testFilterWithPosIncr() throws Exception { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader("short toolong evenmuchlongertext a ab toolong foo")); + LengthFilter filter = new LengthFilter(true, stream, 2, 6); + assertTokenStreamContents(filter, + new String[]{"short", "ab", "foo"}, + new int[]{1, 4, 2} + ); } } Index: solr/src/java/org/apache/solr/analysis/KeepWordFilter.java =================================================================== --- 
solr/src/java/org/apache/solr/analysis/KeepWordFilter.java (revision 1065314) +++ solr/src/java/org/apache/solr/analysis/KeepWordFilter.java (working copy) @@ -17,7 +17,7 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -32,28 +32,25 @@ * @version $Id$ * @since solr 1.3 */ -public final class KeepWordFilter extends TokenFilter { +public final class KeepWordFilter extends FilteringTokenFilter { private final CharArraySet words; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */ + /** @deprecated Use {@link #KeepWordFilter(boolean, TokenStream, CharArraySet)} instead */ @Deprecated public KeepWordFilter(TokenStream in, Set words, boolean ignoreCase ) { - this(in, new CharArraySet(words, ignoreCase)); + this(false, in, new CharArraySet(words, ignoreCase)); } /** The words set passed to this constructor will be directly used by this filter * and should not be modified, */ - public KeepWordFilter(TokenStream in, CharArraySet words) { - super(in); + public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) { + super(enablePositionIncrements, in); this.words = words; } @Override - public boolean incrementToken() throws IOException { - while (input.incrementToken()) { - if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true; - } - return false; + public boolean accept() throws IOException { + return words.contains(termAtt.buffer(), 0, termAtt.length()); } } Index: solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java =================================================================== --- 
solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (revision 1065314) +++ solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (working copy) @@ -19,25 +19,32 @@ import org.apache.solr.common.ResourceLoader; import org.apache.solr.util.plugin.ResourceLoaderAware; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.CharArraySet; +import java.util.Map; import java.util.Set; import java.io.IOException; /** * @version $Id$ - * @since solr 1.3 */ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { - private CharArraySet words; - private boolean ignoreCase; + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } public void inform(ResourceLoader loader) { String wordFiles = args.get("words"); ignoreCase = getBoolean("ignoreCase", false); - if (wordFiles != null) { + enablePositionIncrements = getBoolean("enablePositionIncrements",false); + + if (wordFiles != null) { try { words = getWordSet(loader, wordFiles, ignoreCase); } catch (IOException e) { @@ -46,6 +53,10 @@ } } + private CharArraySet words; + private boolean ignoreCase; + private boolean enablePositionIncrements; + /** * Set the keep word list. 
* NOTE: if ignoreCase==true, the words are expected to be lowercase @@ -61,15 +72,19 @@ this.ignoreCase = ignoreCase; } - public KeepWordFilter create(TokenStream input) { - return new KeepWordFilter(input, words); + public boolean isEnablePositionIncrements() { + return enablePositionIncrements; } + public boolean isIgnoreCase() { + return ignoreCase; + } + public CharArraySet getWords() { return words; } - public boolean isIgnoreCase() { - return ignoreCase; + public KeepWordFilter create(TokenStream input) { + return new KeepWordFilter(enablePositionIncrements, input, words); } } Index: solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java (revision 1065314) +++ solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java (working copy) @@ -27,6 +27,7 @@ */ public class LengthFilterFactory extends BaseTokenFilterFactory { int min,max; + boolean enablePositionIncrements; public static final String MIN_KEY = "min"; public static final String MAX_KEY = "max"; @@ -35,8 +36,10 @@ super.init(args); min=Integer.parseInt(args.get(MIN_KEY)); max=Integer.parseInt(args.get(MAX_KEY)); + enablePositionIncrements = getBoolean("enablePositionIncrements",false); } + public LengthFilter create(TokenStream input) { - return new LengthFilter(input,min,max); + return new LengthFilter(enablePositionIncrements, input,min,max); } } Index: solr/src/test/org/apache/solr/analysis/LengthFilterTest.java =================================================================== --- solr/src/test/org/apache/solr/analysis/LengthFilterTest.java (revision 1065314) +++ solr/src/test/org/apache/solr/analysis/LengthFilterTest.java (working copy) @@ -31,9 +31,19 @@ Map args = new HashMap(); args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); + // default: args.put("enablePositionIncrements", 
"false"); factory.init(args); String test = "foo foobar super-duper-trooper"; TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test))); - assertTokenStreamContents(stream, new String[] { "foobar" }); + assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 }); + + factory = new LengthFilterFactory(); + args = new HashMap(); + args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); + args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); + args.put("enablePositionIncrements", "true"); + factory.init(args); + stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test))); + assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 }); } } \ No newline at end of file Index: solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java (revision 1065314) +++ solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java (working copy) @@ -40,19 +40,21 @@ words.add( "aaa" ); words.add( "bbb" ); - String input = "aaa BBB ccc ddd EEE"; + String input = "xxx yyy aaa zzz BBB ccc ddd EEE"; Map args = new HashMap(DEFAULT_VERSION_PARAM); ResourceLoader loader = new SolrResourceLoader(null, null); // Test Stopwords KeepWordFilterFactory factory = new KeepWordFilterFactory(); args.put( "ignoreCase", "true" ); + args.put( "enablePositionIncrements", "true" ); factory.init( args ); factory.inform( loader ); factory.setWords( words ); assertTrue(factory.isIgnoreCase()); + assertTrue(factory.isEnablePositionIncrements()); TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); - assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); + assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 }); // Test Stopwords (ignoreCase via the setter instead) 
factory = new KeepWordFilterFactory(); @@ -62,18 +64,21 @@ factory.setIgnoreCase(true); factory.setWords( words ); assertTrue(factory.isIgnoreCase()); + assertFalse(factory.isEnablePositionIncrements()); stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); - assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); + assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 }); - // Now force case + // Now force case and posIncr factory = new KeepWordFilterFactory(); args = new HashMap(DEFAULT_VERSION_PARAM); args.put( "ignoreCase", "false" ); + args.put( "enablePositionIncrements", "true" ); factory.init( args ); factory.inform( loader ); factory.setWords( words ); assertFalse(factory.isIgnoreCase()); + assertTrue(factory.isEnablePositionIncrements()); stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); - assertTokenStreamContents(stream, new String[] { "aaa" }); + assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 }); } }