Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java (working copy)
@@ -0,0 +1,100 @@
+package org.apache.lucene.analysis.miscellaneous;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+import java.io.IOException;
+
+/**
+ * This TokenFilter limits its emitted tokens to those with positions that
+ * are not greater than the configured limit.
+ *
+ * By default, this filter ignores any tokens in the wrapped {@code TokenStream}
+ * once the limit has been exceeded, which can result in {@code reset()} being
+ * called prior to {@code incrementToken()} returning {@code false}. For most
+ * {@code TokenStream} implementations this should be acceptable, and faster
+ * than consuming the full stream. If you are wrapping a {@code TokenStream}
+ * which requires that the full stream of tokens be exhausted in order to
+ * function properly, use the
+ * {@link #LimitTokenPositionFilter(TokenStream,int,boolean) consumeAllTokens}
+ * option.
+ */
+public final class LimitTokenPositionFilter extends TokenFilter {
+
+ private final int maxTokenPosition;
+ private final boolean consumeAllTokens;
+ private int tokenPosition = 0;
+ private boolean exhausted = false;
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ /**
+ * Build a filter that only accepts tokens up to and including the given maximum position.
+ * This filter will not consume any tokens with position greater than the maxTokenPosition limit.
+
+ * @param in the stream to wrap
+ * @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
+ *
+ * @see #LimitTokenPositionFilter(TokenStream,int,boolean)
+ */
+ public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition) {
+ this(in, maxTokenPosition, false);
+ }
+
+ /**
+ * Build a filter that limits the maximum position of tokens to emit.
+ *
+ * @param in the stream to wrap
+ * @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
+ * @param consumeAllTokens whether all tokens from the wrapped input stream must be consumed
+ * even if maxTokenPosition is exceeded.
+ */
+ public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition, boolean consumeAllTokens) {
+ super(in);
+ this.maxTokenPosition = maxTokenPosition;
+ this.consumeAllTokens = consumeAllTokens;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (exhausted) {
+ return false;
+ }
+ if (input.incrementToken()) {
+ tokenPosition += posIncAtt.getPositionIncrement();
+ if (tokenPosition <= maxTokenPosition) {
+ return true;
+ } else {
+ while (consumeAllTokens && input.incrementToken()) { /* NOOP */ }
+ exhausted = true;
+ return false;
+ }
+ } else {
+ exhausted = true;
+ return false;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ tokenPosition = 0;
+ exhausted = false;
+ }
+}
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java (working copy)
@@ -0,0 +1,56 @@
+package org.apache.lucene.analysis.miscellaneous;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link LimitTokenPositionFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_limit_pos" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LimitTokenPositionFilterFactory" maxTokenPosition="3" consumeAllTokens="false" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * The {@code consumeAllTokens} property is optional and defaults to {@code false}.
+ * See {@link LimitTokenPositionFilter} for an explanation of its use.
+ */
+public class LimitTokenPositionFilterFactory extends TokenFilterFactory {
+
+ public static final String MAX_TOKEN_POSITION_KEY = "maxTokenPosition";
+ public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";
+ int maxTokenPosition;
+ boolean consumeAllTokens;
+
+ @Override
+ public void init(Map args) {
+ super.init(args);
+ maxTokenPosition = getInt(MAX_TOKEN_POSITION_KEY);
+ consumeAllTokens = getBoolean(CONSUME_ALL_TOKENS_KEY, false);
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new LimitTokenPositionFilter(input, maxTokenPosition, consumeAllTokens);
+ }
+
+}
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
===================================================================
--- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 1457376)
+++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy)
@@ -61,6 +61,7 @@
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
+org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java (revision 0)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java (working copy)
@@ -0,0 +1,84 @@
+package org.apache.lucene.analysis.miscellaneous;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.util.CharsRef;
+
+public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
+
+ public void testMaxPosition2() throws IOException {
+ for (final boolean consumeAll : new boolean[] { true, false }) {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ // if we are consuming all tokens, we can use the checks, otherwise we can't
+ tokenizer.setEnableChecks(consumeAll);
+ return new TokenStreamComponents(tokenizer, new LimitTokenPositionFilter(tokenizer, 2, consumeAll));
+ }
+ };
+
+ // dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)!
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
+ new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : null);
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
+ new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : null);
+
+ // less than the limit, ensure we behave correctly
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 ")),
+ new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, consumeAll ? 3 : null);
+
+ // equal to limit
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 ")),
+ new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 6 : null);
+ }
+ }
+
+ public void testMaxPosition3WithSynomyms() throws IOException {
+ MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
+ tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
+
+ SynonymMap.Builder builder = new SynonymMap.Builder(true);
+ builder.add(new CharsRef("one"), new CharsRef("first"), true);
+ builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
+ builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
+ CharsRef multiWordCharsRef = new CharsRef();
+ SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
+ builder.add(new CharsRef("one"), multiWordCharsRef, true);
+ SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
+ builder.add(new CharsRef("two"), multiWordCharsRef, true);
+ SynonymMap synonymMap = builder.build();
+ TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
+ stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
+
+ // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
+ assertTokenStreamContents(stream,
+ new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" },
+ new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
+
+ }
+}
Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilter.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java (working copy)
@@ -0,0 +1,84 @@
+package org.apache.lucene.analysis.miscellaneous;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+
+public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamTestCase {
+
+ public void testMaxPosition1() throws IOException {
+ LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
+ Map args = new HashMap();
+ args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
+ factory.init(args);
+ String test = "A1 B2 C3 D4 E5 F6";
+ MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
+ // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
+ tok.setEnableChecks(false);
+ TokenStream stream = factory.create(tok);
+ assertTokenStreamContents(stream, new String[] { "A1" });
+ }
+
+ public void testMissingParam() {
+ LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
+ Map args = new HashMap();
+ IllegalArgumentException iae = null;
+ try {
+ factory.init(args);
+ } catch (IllegalArgumentException e) {
+ assertTrue("exception doesn't mention param: " + e.getMessage(),
+ 0 < e.getMessage().indexOf(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY));
+ iae = e;
+ }
+ assertNotNull("no exception thrown", iae);
+ }
+
+ public void testMaxPosition1WithShingles() throws IOException {
+ LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
+ Map args = new HashMap();
+ args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
+ factory.init(args);
+ String input = "one two three four five";
+ MockTokenizer tok = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+ // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
+ tok.setEnableChecks(false);
+ ShingleFilter shingleFilter = new ShingleFilter(tok, 2, 3);
+ shingleFilter.setOutputUnigrams(true);
+ TokenStream stream = factory.create(shingleFilter);
+ assertTokenStreamContents(stream, new String[] { "one", "one two", "one two three" });
+ }
+
+ public void testConsumeAllTokens() throws IOException {
+ LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
+ Map args = new HashMap();
+ args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "3");
+ args.put(LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, "true");
+ factory.init(args);
+ String test = "A1 B2 C3 D4 E5 F6";
+ MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
+ TokenStream stream = factory.create(tok);
+ assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
+ }
+}
Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenPositionFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property