Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountAnalyzer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountAnalyzer.java (revision 1398838) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountAnalyzer.java (working copy) @@ -27,13 +27,27 @@ public final class LimitTokenCountAnalyzer extends AnalyzerWrapper { private final Analyzer delegate; private final int maxTokenCount; + private final boolean consumeAllTokens; /** * Build an analyzer that limits the maximum number of tokens per field. + * This analyzer will not consume any tokens beyond the maxTokenCount limit + * + * @see #LimitTokenCountAnalyzer(Analyzer,int,boolean) */ public LimitTokenCountAnalyzer(Analyzer delegate, int maxTokenCount) { + this(delegate, maxTokenCount, false); + } + /** + * Build an analyzer that limits the maximum number of tokens per field. + * @param delegate the analyzer to wrap + * @param maxTokenCount max number of tokens to produce + * @param consumeAllTokens whether all tokens from the delegate should be consumed even if maxTokenCount is reached. 
+ */ + public LimitTokenCountAnalyzer(Analyzer delegate, int maxTokenCount, boolean consumeAllTokens) { this.delegate = delegate; this.maxTokenCount = maxTokenCount; + this.consumeAllTokens = consumeAllTokens; } @Override @@ -44,11 +58,11 @@ @Override protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { return new TokenStreamComponents(components.getTokenizer(), - new LimitTokenCountFilter(components.getTokenStream(), maxTokenCount)); + new LimitTokenCountFilter(components.getTokenStream(), maxTokenCount, consumeAllTokens)); } @Override public String toString() { - return "LimitTokenCountAnalyzer(" + delegate.toString() + ", maxTokenCount=" + maxTokenCount + ")"; + return "LimitTokenCountAnalyzer(" + delegate.toString() + ", maxTokenCount=" + maxTokenCount + ", consumeAllTokens=" + consumeAllTokens + ")"; } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java (revision 1398838) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java (working copy) @@ -29,28 +29,54 @@ public final class LimitTokenCountFilter extends TokenFilter { private final int maxTokenCount; + private final boolean consumeAllTokens; private int tokenCount = 0; + private boolean exhausted = false; /** * Build a filter that only accepts tokens up to a maximum number. + * This filter will not consume any tokens beyond the maxTokenCount limit + + * @see #LimitTokenCountFilter(TokenStream,int,boolean) */ public LimitTokenCountFilter(TokenStream in, int maxTokenCount) { + this(in, maxTokenCount, false); + } + + /** + * Build a filter that limits the maximum number of tokens per field. 
+ * @param in the stream to wrap + * @param maxTokenCount max number of tokens to produce + * @param consumeAllTokens whether all tokens from the input must be consumed even if maxTokenCount is reached. + */ + public LimitTokenCountFilter(TokenStream in, int maxTokenCount, boolean consumeAllTokens) { super(in); this.maxTokenCount = maxTokenCount; + this.consumeAllTokens = consumeAllTokens; } @Override public boolean incrementToken() throws IOException { - if (tokenCount < maxTokenCount && input.incrementToken()) { - tokenCount++; - return true; + if (exhausted) { + return false; + } else if (tokenCount < maxTokenCount) { + if (input.incrementToken()) { + tokenCount++; + return true; + } else { + exhausted = true; + return false; + } + } else { + while (consumeAllTokens && input.incrementToken()) { /* NOOP */ } + return false; } - return false; } @Override public void reset() throws IOException { super.reset(); tokenCount = 0; + exhausted = false; } } Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java (revision 1398838) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLimitTokenCountAnalyzer.java (working copy) @@ -33,40 +33,64 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; +import org.apache.lucene.util._TestUtil; public class TestLimitTokenCountAnalyzer extends BaseTokenStreamTestCase { public void testLimitTokenCountAnalyzer() throws IOException { - Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2); - // dont use assertAnalyzesTo here, as the end offset is not the end of the string! 
- assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, 4); - assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3); + for (boolean consumeAll : new boolean[] { true, false }) { + MockAnalyzer mock = new MockAnalyzer(random()); + + // if we are consuming all tokens, we can use the checks, + // otherwise we can't + mock.setEnableChecks(consumeAll); + Analyzer a = new LimitTokenCountAnalyzer(mock, 2, consumeAll); - a = new LimitTokenCountAnalyzer(new StandardAnalyzer(TEST_VERSION_CURRENT), 2); - // dont use assertAnalyzesTo here, as the end offset is not the end of the string! - assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3); + // dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)! + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : 5); + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : 4); + + // less than the limit, ensure we behave correctly + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 ")), new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, 3); + + // equal to limit + assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 ")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 
6 : 5); + } } public void testLimitTokenCountIndexWriter() throws IOException { - Directory dir = newDirectory(); + + for (boolean consumeAll : new boolean[] { true, false }) { + Directory dir = newDirectory(); + int limit = _TestUtil.nextInt(random(), 50, 101000); + MockAnalyzer mock = new MockAnalyzer(random()); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( - TEST_VERSION_CURRENT, new LimitTokenCountAnalyzer(new MockAnalyzer(random()), 100000))); + // if we are consuming all tokens, we can use the checks, + // otherwise we can't + mock.setEnableChecks(consumeAll); + Analyzer a = new LimitTokenCountAnalyzer(mock, limit, consumeAll); - Document doc = new Document(); - StringBuilder b = new StringBuilder(); - for(int i=0;i<10000;i++) - b.append(" a"); - b.append(" x"); - doc.add(newTextField("field", b.toString(), Field.Store.NO)); - writer.addDocument(doc); - writer.close(); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig + (TEST_VERSION_CURRENT, a)); - IndexReader reader = DirectoryReader.open(dir); - Term t = new Term("field", "x"); - assertEquals(1, reader.docFreq(t)); - reader.close(); - dir.close(); + Document doc = new Document(); + StringBuilder b = new StringBuilder(); + for(int i=1;i