Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java (working copy) @@ -0,0 +1,100 @@ +package org.apache.lucene.analysis.miscellaneous; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +import java.io.IOException; + +/** + * This TokenFilter limits its emitted tokens to those with positions that + * are not greater than the configured limit. + *

+ * By default, this filter ignores any tokens in the wrapped {@code TokenStream} + * once the limit has been exceeded, which can result in {@code reset()} being + * called prior to {@code incrementToken()} returning {@code false}. For most + * {@code TokenStream} implementations this should be acceptable, and faster + * then consuming the full stream. If you are wrapping a {@code TokenStream} + * which requires that the full stream of tokens be exhausted in order to + * function properly, use the + * {@link #LimitTokenPositionFilter(TokenStream,int,boolean) consumeAllTokens} + * option. + */ +public final class LimitTokenPositionFilter extends TokenFilter { + + private final int maxTokenPosition; + private final boolean consumeAllTokens; + private int tokenPosition = 0; + private boolean exhausted = false; + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + + /** + * Build a filter that only accepts tokens up to and including the given maximum position. + * This filter will not consume any tokens with position greater than the maxTokenPosition limit. + + * @param in the stream to wrap + * @param maxTokenPosition max position of tokens to produce (1st token always has position 1) + * + * @see #LimitTokenPositionFilter(TokenStream,int,boolean) + */ + public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition) { + this(in, maxTokenPosition, false); + } + + /** + * Build a filter that limits the maximum position of tokens to emit. + * + * @param in the stream to wrap + * @param maxTokenPosition max position of tokens to produce (1st token always has position 1) + * @param consumeAllTokens whether all tokens from the wrapped input stream must be consumed + * even if maxTokenPosition is exceeded. + */ + public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition, boolean consumeAllTokens) { + super(in); + this.maxTokenPosition = maxTokenPosition; + this.consumeAllTokens = consumeAllTokens; + } + + @Override + public boolean incrementToken() throws IOException { + if (exhausted) { + return false; + } + if (input.incrementToken()) { + tokenPosition += posIncAtt.getPositionIncrement(); + if (tokenPosition <= maxTokenPosition) { + return true; + } else { + while (consumeAllTokens && input.incrementToken()) { /* NOOP */ } + exhausted = true; + return false; + } + } else { + exhausted = true; + return false; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + tokenPosition = 0; + exhausted = false; + } +} Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java (working copy) @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis.miscellaneous; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link LimitTokenPositionFilter}. + *

+ * <fieldType name="text_limit_pos" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.LimitTokenPositionFilterFactory" maxTokenPosition="3" consumeAllTokens="false" />
+ *   </analyzer>
+ * </fieldType>
+ *

+ * The {@code consumeAllTokens} property is optional and defaults to {@code false}. + * See {@link LimitTokenPositionFilter} for an explanation of its use. + */ +public class LimitTokenPositionFilterFactory extends TokenFilterFactory { + + public static final String MAX_TOKEN_POSITION_KEY = "maxTokenPosition"; + public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens"; + int maxTokenPosition; + boolean consumeAllTokens; + + @Override + public void init(Map args) { + super.init(args); + maxTokenPosition = getInt(MAX_TOKEN_POSITION_KEY); + consumeAllTokens = getBoolean(CONSUME_ALL_TOKENS_KEY, false); + } + + @Override + public TokenStream create(TokenStream input) { + return new LimitTokenPositionFilter(input, maxTokenPosition, consumeAllTokens); + } + +} Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 1457376) +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -61,6 +61,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory org.apache.lucene.analysis.miscellaneous.LengthFilterFactory org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory +org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory org.apache.lucene.analysis.miscellaneous.TrimFilterFactory Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java (working copy) @@ -0,0 +1,84 @@ +package org.apache.lucene.analysis.miscellaneous; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.synonym.SynonymFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.util.CharsRef; + +public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase { + + public void testMaxPosition2() throws IOException { + for (final boolean consumeAll : new boolean[] { true, false }) { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + // if we are consuming all tokens, we can use the checks, otherwise we can't + tokenizer.setEnableChecks(consumeAll); + return new TokenStreamComponents(tokenizer, new LimitTokenPositionFilter(tokenizer, 2, consumeAll)); + } + }; + + // dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)! + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), + new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : null); + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), + new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : null); + + // less than the limit, ensure we behave correctly + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 ")), + new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, consumeAll ? 3 : null); + + // equal to limit + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 ")), + new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 6 : null); + } + } + + public void testMaxPosition3WithSynomyms() throws IOException { + MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false); + tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps + + SynonymMap.Builder builder = new SynonymMap.Builder(true); + builder.add(new CharsRef("one"), new CharsRef("first"), true); + builder.add(new CharsRef("one"), new CharsRef("alpha"), true); + builder.add(new CharsRef("one"), new CharsRef("beguine"), true); + CharsRef multiWordCharsRef = new CharsRef(); + SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef); + builder.add(new CharsRef("one"), multiWordCharsRef, true); + SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef); + builder.add(new CharsRef("two"), multiWordCharsRef, true); + SynonymMap synonymMap = builder.build(); + TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true); + stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false + + // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3. + assertTokenStreamContents(stream, + new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, + new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 }); + + } +} Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java (working copy) @@ -0,0 +1,84 @@ +package org.apache.lucene.analysis.miscellaneous; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.shingle.ShingleFilter; + +public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamTestCase { + + public void testMaxPosition1() throws IOException { + LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory(); + Map args = new HashMap(); + args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1"); + factory.init(args); + String test = "A1 B2 C3 D4 E5 F6"; + MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false); + // LimitTokenPositionFilter doesn't consume the entire stream that it wraps + tok.setEnableChecks(false); + TokenStream stream = factory.create(tok); + assertTokenStreamContents(stream, new String[] { "A1" }); + } + + public void testMissingParam() { + LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory(); + Map args = new HashMap(); + IllegalArgumentException iae = null; + try { + factory.init(args); + } catch (IllegalArgumentException e) { + assertTrue("exception doesn't mention param: " + e.getMessage(), + 0 < e.getMessage().indexOf(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY)); + iae = e; + } + assertNotNull("no exception thrown", iae); + } + + public void testMaxPosition1WithShingles() throws IOException { + LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory(); + Map args = new HashMap(); + args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1"); + factory.init(args); + String input = "one two three four five"; + MockTokenizer tok = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); + // LimitTokenPositionFilter doesn't consume the entire stream that it wraps + tok.setEnableChecks(false); + ShingleFilter shingleFilter = new ShingleFilter(tok, 2, 3); + shingleFilter.setOutputUnigrams(true); + TokenStream stream = factory.create(shingleFilter); + assertTokenStreamContents(stream, new String[] { "one", "one two", "one two three" }); + } + + public void testConsumeAllTokens() throws IOException { + LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory(); + Map args = new HashMap(); + args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "3"); + args.put(LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, "true"); + factory.init(args); + String test = "A1 B2 C3 D4 E5 F6"; + MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false); + TokenStream stream = factory.create(tok); + assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" }); + } +} Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property