Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java	(revision 919741)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java	(working copy)
@@ -160,7 +160,7 @@
     this(Version.LUCENE_30, factory, input);
   }
 
-  private int offset = 0, bufferIndex = 0, dataLen = 0;
+  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;
 
@@ -274,17 +274,19 @@
     if(useOldAPI) // TODO remove this in LUCENE 4.0
      return incrementTokenOld();
    int length = 0;
-    int start = bufferIndex;
+    int start = -1; // this variable is always initialized
    char[] buffer = termAtt.termBuffer();
    while (true) {
      if (bufferIndex >= dataLen) {
        offset += dataLen;
        if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
          dataLen = 0; // so next offset += dataLen won't decrement offset
-          if (length > 0)
+          if (length > 0) {
            break;
-          else
+          } else {
+            finalOffset = correctOffset(offset);
            return false;
+          }
        }
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
@@ -294,10 +296,12 @@
      bufferIndex += Character.charCount(c);
 
      if (isTokenChar(c)) {               // if it's a token char
-        if (length == 0)                 // start of token
+        if (length == 0) {               // start of token
+          assert start == -1;
          start = offset + bufferIndex - 1;
-        else if (length >= buffer.length-1) // check if a supplementary could run out of bounds
+        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
          buffer = termAtt.resizeTermBuffer(2+length); // make sure a supplementary fits in the buffer
+        }
        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
        if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
          break;
@@ -306,7 +310,8 @@
    }
 
    termAtt.setTermLength(length);
-    offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
+    assert start != -1;
+    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start+length));
    return true;
  }
 
@@ -319,7 +324,7 @@
  @Deprecated
  private boolean incrementTokenOld() throws IOException {
    int length = 0;
-    int start = bufferIndex;
+    int start = -1; // this variable is always initialized
    char[] buffer = termAtt.termBuffer();
    final char[] oldIoBuffer = ioBuffer.getBuffer();
    while (true) {
@@ -329,10 +334,12 @@
        dataLen = input.read(oldIoBuffer);
        if (dataLen == -1) {
          dataLen = 0; // so next offset += dataLen won't decrement offset
-          if (length > 0)
+          if (length > 0) {
            break;
-          else
+          } else {
+            finalOffset = correctOffset(offset);
            return false;
+          }
        }
        bufferIndex = 0;
      }
@@ -341,10 +348,12 @@
 
      if (isTokenChar(c)) {             // if it's a token char
 
-        if (length == 0)                // start of token
+        if (length == 0) {              // start of token
+          assert start == -1;
          start = offset + bufferIndex - 1;
-        else if (length == buffer.length)
+        } else if (length == buffer.length) {
          buffer = termAtt.resizeTermBuffer(1+length);
+        }
 
        buffer[length++] = normalize(c); // buffer it, normalized
@@ -356,6 +365,7 @@
    }
 
    termAtt.setTermLength(length);
+    assert start != -1;
    offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
    return true;
  }
@@ -365,7 +375,6 @@
  @Override
  public final void end() {
    // set final offset
-    final int finalOffset = correctOffset(offset);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }
 
@@ -375,6 +384,7 @@
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
+    finalOffset = 0;
    ioBuffer.reset(); // make sure to reset the IO buffer!!
  }
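
Reviewer note (not part of the patch): the finalOffset bookkeeping above matters to any consumer that calls end() to learn the true end of input, since the old code recomputed it from the running offset and could report a stale value. A minimal consumer sketch against the Lucene 3.0 API; the input string "abc def" is an arbitrary example, not taken from the patch:

import java.io.StringReader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class FinalOffsetSketch {
  public static void main(String[] args) throws Exception {
    // WhitespaceTokenizer is a CharTokenizer subclass, so it inherits this fix.
    WhitespaceTokenizer ts = new WhitespaceTokenizer(new StringReader("abc def"));
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      System.out.println("token: " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
    ts.end(); // with this patch, offsetAtt now reports the final offset (7 here)
    System.out.println("final offset: " + offsetAtt.endOffset());
    ts.close();
  }
}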
Index: src/java/org/apache/lucene/analysis/LimitTokenCountAnalyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/LimitTokenCountAnalyzer.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/LimitTokenCountAnalyzer.java	(revision 0)
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Fieldable;
+
+import java.io.Reader;
+import java.io.IOException;
+
+/**
+ * This Analyzer limits the number of tokens while indexing. It is
+ * a replacement for the maximum field length setting inside {@link org.apache.lucene.index.IndexWriter}.
+ */
+public final class LimitTokenCountAnalyzer extends Analyzer {
+  private final Analyzer delegate;
+  private final int maxTokenCount;
+
+  /**
+   * Build an analyzer that limits the maximum number of tokens per field.
+   */
+  public LimitTokenCountAnalyzer(Analyzer delegate, int maxTokenCount) {
+    this.delegate = delegate;
+    this.maxTokenCount = maxTokenCount;
+  }
+
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return new LimitTokenCountFilter(
+      delegate.tokenStream(fieldName, reader), maxTokenCount
+    );
+  }
+
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    return new LimitTokenCountFilter(
+      delegate.reusableTokenStream(fieldName, reader), maxTokenCount
+    );
+  }
+
+  @Override
+  public int getPositionIncrementGap(String fieldName) {
+    return delegate.getPositionIncrementGap(fieldName);
+  }
+
+  @Override
+  public int getOffsetGap(Fieldable field) {
+    return delegate.getOffsetGap(field);
+  }
+
+  @Override
+  public String toString() {
+    return "LimitTokenCountAnalyzer(" + delegate.toString() + ", maxTokenCount=" + maxTokenCount + ")";
+  }
+}

Property changes on: src\java\org\apache\lucene\analysis\LimitTokenCountAnalyzer.java
___________________________________________________________________
Added: svn:keywords
   + Date Author Id Revision HeadURL
Added: svn:eol-style
   + native

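Reviewer note (not part of the patch): as the Javadoc above says, this analyzer replaces IndexWriter's maximum-field-length setting. A minimal usage sketch against the Lucene 3.0 API; the RAMDirectory and the limit of 10000 are arbitrary choices for illustration:

import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class LimitUsageSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    // Cap every field at 10,000 tokens instead of passing a MaxFieldLength limit.
    LimitTokenCountAnalyzer analyzer =
        new LimitTokenCountAnalyzer(new StandardAnalyzer(Version.LUCENE_30), 10000);
    IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    // ... add documents; fields longer than 10,000 tokens are silently truncated ...
    writer.close();
  }
}
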
Index: src/java/org/apache/lucene/analysis/LimitTokenCountFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/LimitTokenCountFilter.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/LimitTokenCountFilter.java	(revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/**
+ * This TokenFilter limits the number of tokens while indexing. It is
+ * a replacement for the maximum field length setting inside {@link org.apache.lucene.index.IndexWriter}.
+ */
+public final class LimitTokenCountFilter extends TokenFilter {
+
+  private final int maxTokenCount;
+  private int tokenCount = 0;
+
+  /**
+   * Build a filter that only accepts tokens up to a maximum number.
+   */
+  public LimitTokenCountFilter(TokenStream in, int maxTokenCount) {
+    super(in);
+    this.maxTokenCount = maxTokenCount;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (tokenCount < maxTokenCount && input.incrementToken()) {
+      tokenCount++;
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    tokenCount = 0;
+  }
+}

Property changes on: src\java\org\apache\lucene\analysis\LimitTokenCountFilter.java
___________________________________________________________________
Added: svn:keywords
   + Date Author Id Revision HeadURL
Added: svn:eol-style
   + native

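Reviewer note (not part of the patch): the filter can also wrap a single TokenStream directly, outside any analyzer. A short sketch; the input text and the limit of 2 are arbitrary:

import java.io.StringReader;
import org.apache.lucene.analysis.LimitTokenCountFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class FilterSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new LimitTokenCountFilter(
        new WhitespaceTokenizer(new StringReader("one two three four")), 2);
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {   // stops after "one" and "two"
      System.out.println(termAtt.term());
    }
    ts.end();
    ts.close();
  }
}
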
Index: src/test/org/apache/lucene/analysis/TestAnalyzers.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestAnalyzers.java	(revision 919741)
+++ src/test/org/apache/lucene/analysis/TestAnalyzers.java	(working copy)
@@ -210,6 +210,18 @@
 
  }
 
+  public void testLimitTokenCountAnalyzer() throws IOException {
+    Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
+    // dont use assertAnalyzesTo here, as the end offset is not the end of the string!
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1  2     3  4  5")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, 4);
+    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
+
+    a = new LimitTokenCountAnalyzer(new StandardAnalyzer(TEST_VERSION_CURRENT), 2);
+    // dont use assertAnalyzesTo here, as the end offset is not the end of the string!
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
+    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
+  }
+
  /**
   * Test that LowercaseFilter only works on BMP for back compat,
   * depending upon version