Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 713975)
+++ CHANGES.txt	(working copy)
@@ -69,6 +69,12 @@
     Deprecated ConstantScoreRangeQuery (Mark Miller via Mike McCandless)
 
+ 7. LUCENE-1448: Added TokenStream.getFinalOffset(), to return the end
+    offset of the tokenization.  This is important when multiple
+    fields with the same name are added to a document, to ensure
+    offsets recorded in term vectors for all of the instances are
+    correct.  (Mike McCandless, Mark Miller)
+
 Optimizations
 
  1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java	(revision 713975)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java	(working copy)
@@ -31,7 +31,9 @@
 import org.apache.lucene.util.UnicodeUtil;
 
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.SinkTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -4215,17 +4217,185 @@
     Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
     doc.add(f);
     doc.add(f);
+    Field f2 = new Field("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f2);
+    doc.add(f);
     w.addDocument(doc);
     w.close();
 
     IndexReader r = IndexReader.open(dir);
     TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
-    assertEquals(2, termOffsets.length);
+
+    // Token "" occurred once
+    assertEquals(1, termOffsets.length);
+    assertEquals(8, termOffsets[0].getStartOffset());
+    assertEquals(8, termOffsets[0].getEndOffset());
+
+    // Token "abcd" occurred three times
+    termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
+    assertEquals(3, termOffsets.length);
     assertEquals(0, termOffsets[0].getStartOffset());
     assertEquals(4, termOffsets[0].getEndOffset());
     assertEquals(4, termOffsets[1].getStartOffset());
     assertEquals(8, termOffsets[1].getEndOffset());
+    assertEquals(8, termOffsets[2].getStartOffset());
+    assertEquals(12, termOffsets[2].getEndOffset());
     r.close();
     dir.close();
   }
+
+  // LUCENE-1442
+  public void testDoubleOffsetCounting2() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(5, termOffsets[1].getStartOffset());
+    assertEquals(9, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
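
[Note: the two tests above pin down the offset bookkeeping for multi-valued
fields.  For a NOT_ANALYZED field the whole value is one token whose length
sets the end offset, and getOffsetGap() (added to Analyzer.java below)
contributes 0 between un-tokenized instances.  A minimal standalone sketch of
that arithmetic, not part of the patch:]

    // Offsets for the four NOT_ANALYZED instances in testDoubleOffsetCounting:
    //   "abcd" -> [0,4)   "abcd" -> [4,8)   "" -> [8,8)   "abcd" -> [8,12)
    String[] values = { "abcd", "abcd", "", "abcd" };
    int offset = 0;
    for (int i = 0; i < values.length; i++) {
      System.out.println("[" + offset + "," + (offset + values[i].length()) + ")");
      offset += values[i].length();   // un-tokenized fields: offset gap is 0
    }
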
+
+  // LUCENE-1448
+  public void testEndOffsetPositionCharAnalyzer() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd   ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(8, termOffsets[1].getStartOffset());
+    assertEquals(12, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStopFilter() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StopAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(9, termOffsets[1].getStartOffset());
+    assertEquals(13, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandard() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd the  ", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    Field f2 = new Field("field", "crunch man", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(11, termOffsets[0].getStartOffset());
+    assertEquals(17, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(2);
+    assertEquals(18, termOffsets[0].getStartOffset());
+    assertEquals(21, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
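
[Note: why the second "abcd" in testEndOffsetPositionStopFilter lands at 9
rather than 5: StopFilter discards "the", so the last surviving token ends at
4, but the tokenizer consumed all 8 characters of "abcd the", and
getFinalOffset() reports that.  The second instance then starts at 8 plus the
default offset gap of 1.  A sketch of the distinction, not part of the patch:]

    import java.io.StringReader;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    public class FinalOffsetDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new StopAnalyzer().tokenStream("field", new StringReader("abcd the"));
        final Token reusableToken = new Token();
        int lastEnd = 0;
        for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken))
          lastEnd = t.endOffset();
        System.out.println(lastEnd);             // 4: end of "abcd", the last token kept
        System.out.println(ts.getFinalOffset()); // 8: all of "abcd the" was consumed
      }
    }
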
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandardEmptyField() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    Field f2 = new Field("field", "crunch man", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(6, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(7, termOffsets[0].getStartOffset());
+    assertEquals(10, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+
+    Field f = new Field("field", "abcd", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+
+    Field f2 = new Field("field", "crunch", Field.Store.NO,
+        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f2);
+
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(5, termOffsets[0].getStartOffset());
+    assertEquals(11, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
 }
Index: src/test/org/apache/lucene/index/TestPayloads.java
===================================================================
--- src/test/org/apache/lucene/index/TestPayloads.java	(revision 713975)
+++ src/test/org/apache/lucene/index/TestPayloads.java	(working copy)
@@ -541,6 +541,7 @@
         if (!first) return null;
         reusableToken.reinit(term, 0, 0);
         reusableToken.setPayload(new Payload(payload));
+        first = false;
         return reusableToken;
       }
Index: src/java/org/apache/lucene/analysis/SinkTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/SinkTokenizer.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/SinkTokenizer.java	(working copy)
@@ -32,12 +32,18 @@
 public class SinkTokenizer extends Tokenizer {
   protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
   protected Iterator/*<Token>*/ iter;
+  private int finalOffset = -1;
 
   public SinkTokenizer(List/*<Token>*/ input) {
     this.lst = input;
     if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
   }
 
+  public SinkTokenizer(List/*<Token>*/ input, int finalOffset) {
+    this(input);
+    this.finalOffset = finalOffset;
+  }
+
   public SinkTokenizer() {
     this.lst = new ArrayList/*<Token>*/();
   }
@@ -104,5 +110,14 @@
   public void reset() throws IOException {
     iter = lst.iterator();
   }
+
+  public int getFinalOffset() {
+    if (finalOffset != -1)
+      return finalOffset;
+    else if (lst.size() > 0)
+      return ((Token) lst.get(lst.size()-1)).endOffset();
+    else
+      return 0;
+  }
 }
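
[Note: a usage sketch for the new SinkTokenizer constructor, not part of the
patch; the token and offset values are made up for illustration.  A sink
replays captured tokens, so an explicit finalOffset preserves how much input
the original stream consumed, rather than falling back to the last token's
endOffset():]

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.SinkTokenizer;
    import org.apache.lucene.analysis.Token;

    public class SinkDemo {
      public static void main(String[] args) {
        List tokens = new ArrayList();
        tokens.add(new Token("abcd", 0, 4));
        // The original stream consumed 8 chars ("abcd the"), though the
        // stop word never reached the sink:
        SinkTokenizer sink = new SinkTokenizer(tokens, 8);
        System.out.println(sink.getFinalOffset()); // 8 (explicit), not 4 (fallback)
      }
    }
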
Index: src/java/org/apache/lucene/analysis/CachingTokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CachingTokenFilter.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/CachingTokenFilter.java	(working copy)
@@ -35,7 +35,8 @@
 public class CachingTokenFilter extends TokenFilter {
   private List cache;
   private Iterator iterator;
-  
+  private int finalOffset;
+
   public CachingTokenFilter(TokenStream input) {
     super(input);
   }
@@ -68,6 +69,11 @@
     for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
       cache.add(nextToken.clone());
     }
+    finalOffset = input.getFinalOffset();
   }
 
+  public int getFinalOffset() {
+    return finalOffset;
+  }
+
 }
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java	(working copy)
@@ -56,6 +56,7 @@
         offset += dataLen;
         dataLen = input.read(ioBuffer);
         if (dataLen == -1) {
+          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0)
             break;
           else
@@ -88,6 +89,10 @@
     return reusableToken;
   }
 
+  public int getFinalOffset() {
+    return offset;
+  }
+
   public void reset(Reader input) throws IOException {
     super.reset(input);
     bufferIndex = 0;
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(working copy)
@@ -28,6 +28,7 @@
   private static final int DEFAULT_BUFFER_SIZE = 256;
 
   private boolean done;
+  private int finalOffset;
 
   public KeywordTokenizer(Reader input) {
     this(input, DEFAULT_BUFFER_SIZE);
@@ -55,14 +56,20 @@
       reusableToken.setTermLength(upto);
       reusableToken.setStartOffset(0);
       reusableToken.setEndOffset(upto);
+      finalOffset = upto;
      return reusableToken;
     }
     return null;
   }
 
+  public int getFinalOffset() {
+    return finalOffset;
+  }
+
   public void reset(Reader input) throws IOException {
     super.reset(input);
     this.done = false;
+    finalOffset = 0;
   }
 }
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(working copy)
@@ -102,6 +102,10 @@
   public int getMaxTokenLength() {
     return maxTokenLength;
   }
+
+  public int getFinalOffset() {
+    return scanner.yychar() + scanner.yylength();
+  }
 
   /**
    * Creates a new instance of the {@link StandardTokenizer}. Attaches the
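
[Note: the tokenizers above all follow one pattern: remember how far the
reader was consumed, and report it from getFinalOffset() once the stream is
exhausted.  A minimal sketch of that pattern for a user-written tokenizer,
not part of the patch; WholeStringTokenizer is a hypothetical class:]

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.Tokenizer;

    // Hypothetical: emits its entire input string as a single token.
    public final class WholeStringTokenizer extends Tokenizer {
      private final String str;
      private boolean done;
      private int finalOffset;

      public WholeStringTokenizer(String str) {
        this.str = str;
      }

      public Token next(final Token reusableToken) {
        if (done) return null;
        done = true;
        finalOffset = str.length();   // record total chars consumed
        return reusableToken.reinit(str, 0, str.length());
      }

      public int getFinalOffset() {
        return finalOffset;           // 0 until the stream is exhausted
      }
    }
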

Index: src/java/org/apache/lucene/analysis/Analyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Analyzer.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/Analyzer.java	(working copy)
@@ -20,6 +20,8 @@
 import java.io.Reader;
 import java.io.IOException;
 
+import org.apache.lucene.document.Fieldable;
+
 /** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
  *  policy for extracting index terms from text.
  *  <p>
@@ -78,4 +80,22 @@
   {
     return 0;
   }
+
+  /**
+   * Just like {@link #getPositionIncrementGap}, except for
+   * Token offsets instead.  By default this returns 1 for
+   * tokenized fields, as if the fields were joined
+   * with an extra space character, and 0 for un-tokenized
+   * fields.  This method is only called if the field
+   * produced at least one token for indexing.
+   *
+   * @param field the field just indexed
+   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
+   */
+  public int getOffsetGap(Fieldable field) {
+    if (field.isTokenized())
+      return 1;
+    else
+      return 0;
+  }
 }
Index: src/java/org/apache/lucene/analysis/TokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenFilter.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/TokenFilter.java	(working copy)
@@ -45,4 +45,8 @@
     super.reset();
     input.reset();
   }
+
+  public int getFinalOffset() {
+    return input.getFinalOffset();
+  }
 }
Index: src/java/org/apache/lucene/analysis/TokenStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenStream.java	(revision 713975)
+++ src/java/org/apache/lucene/analysis/TokenStream.java	(working copy)
@@ -107,4 +107,18 @@
 
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
+
+  /** Return the final offset.  It's only valid to call this
+   *  method once the stream is exhausted (i.e., {@link
+   *  #next(Token)} has returned null).  If this method
+   *  returns -1, then the caller should fall back to the
+   *  endOffset of the last token it saw.
+   *
+   *  <p>Analyzer chains for fields that have multiple
+   *  instances per document under the same field name
+   *  should implement this method to ensure the offsets of
+   *  all fields are correctly indexed.</p>
+   */
+  public int getFinalOffset() {
+    return -1;
+  }
 }
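
[Note: since getOffsetGap() is an ordinary overridable method, an application
that needs its multi-valued fields to read as contiguous text in offset space
can override it.  A sketch, not part of the patch; GaplessAnalyzer and the
choice of StandardAnalyzer as the base class are made up for illustration:]

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Fieldable;

    // Hypothetical: makes consecutive instances of a tokenized field
    // contiguous in offset space instead of one char apart.
    public class GaplessAnalyzer extends StandardAnalyzer {
      public int getOffsetGap(Fieldable field) {
        return 0;   // default is 1 for tokenized fields, 0 for un-tokenized
      }
    }
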
Index: src/java/org/apache/lucene/index/DocInverterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/DocInverterPerField.java	(revision 713975)
+++ src/java/org/apache/lucene/index/DocInverterPerField.java	(working copy)
@@ -73,6 +73,8 @@
     // tokenized.
     if (field.isIndexed() && doInvert) {
 
+      final boolean anyToken;
+
       if (fieldState.length > 0)
         fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
 
@@ -91,6 +93,7 @@
         fieldState.offset += valueLength;
         fieldState.length++;
         fieldState.position++;
+        anyToken = valueLength > 0;
       } else {                                  // tokenized field
         final TokenStream stream;
         final TokenStream streamValue = field.tokenStreamValue();
@@ -119,7 +122,7 @@
 
         // reset the TokenStream to the first token
         stream.reset();
-
+        final int startLength = fieldState.length;
         try {
           int offsetEnd = fieldState.offset-1;
           final Token localToken = perThread.localToken;
@@ -161,12 +164,23 @@
               break;
             }
           }
-          fieldState.offset = offsetEnd+1;
+
+          final int finalOffset = stream.getFinalOffset();
+          if (finalOffset == -1)
+            fieldState.offset = offsetEnd;
+          else
+            fieldState.offset += finalOffset;
+
+          anyToken = fieldState.length > startLength;
+
         } finally {
           stream.close();
         }
       }
 
+      if (anyToken)
+        fieldState.offset += docState.analyzer.getOffsetGap(field);
+
       fieldState.boost *= field.getBoost();
     }
   }
Index: contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java	(revision 713975)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java	(working copy)
@@ -357,7 +357,10 @@
         if (!isMatch) return null;
       }
     }
-    
+
+    public int getFinalOffset() {
+      return str.length();
+    }
   }
@@ -437,7 +440,10 @@
     private boolean isStopWord(String text) {
       return stopWords != null && stopWords.contains(text);
     }
-    
+
+    public int getFinalOffset() {
+      return str.length();
+    }
   }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(revision 713975)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(working copy)
@@ -134,4 +134,8 @@
         }
     }
 
+
+    public int getFinalOffset() {
+        return offset;
+    }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java	(revision 713975)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java	(working copy)
@@ -59,4 +59,8 @@
   public void setToken(Token token) {
     this.token = (Token) token.clone();
   }
+
+  public int getFinalOffset() {
+    return token.endOffset();
+  }
 }
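
[Note: DocInverterPerField above is the consumer side of the new API.  After
exhausting the stream it advances the field's cumulative offset by
getFinalOffset() when the stream provides one, and otherwise falls back to
the last token's end offset, per the -1 contract documented in TokenStream.
The same pattern for any other consumer, as a sketch; endOfStream is a
hypothetical helper, not part of the patch:]

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    public class StreamEnd {
      /** Hypothetical helper: exhausts the stream, returns where it ended. */
      public static int endOfStream(TokenStream stream) throws IOException {
        final Token reusableToken = new Token();
        int lastEnd = 0;
        for (Token t = stream.next(reusableToken); t != null; t = stream.next(reusableToken))
          lastEnd = t.endOffset();
        int finalOffset = stream.getFinalOffset();
        return finalOffset == -1 ? lastEnd : finalOffset;  // -1: not implemented
      }
    }
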
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java	(revision 713975)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java	(working copy)
@@ -31,4 +31,7 @@
     assert reusableToken != null;
     return null;
   }
+
+  public int getFinalOffset() {
+    return 0;
+  }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(revision 713975)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(working copy)
@@ -87,4 +87,8 @@
     pos++;
     return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
   }
+
+  public int getFinalOffset() {
+    return inLen;
+  }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(revision 713975)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(working copy)
@@ -145,4 +145,8 @@
     gramSize++;
     return reusableToken;
   }
+
+  public int getFinalOffset() {
+    return inLen;
+  }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(revision 713975)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(working copy)
@@ -239,4 +239,8 @@
     return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
   }
+
+  public int getFinalOffset() {
+    return offset;
+  }
 }
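
[Note: a final usage sketch, not part of the patch, showing why
CachingTokenFilter (changed earlier in this patch) must capture the final
offset on its first pass: after reset() it replays from its cache and never
touches the underlying tokenizer again.]

    import java.io.StringReader;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;

    public class CachingDemo {
      public static void main(String[] args) throws Exception {
        TokenStream cached = new CachingTokenFilter(
            new WhitespaceAnalyzer().tokenStream("f", new StringReader("abcd efgh")));
        final Token reusableToken = new Token();
        while (cached.next(reusableToken) != null) {}  // first pass fills the cache
        cached.reset();                                // later passes replay it
        System.out.println(cached.getFinalOffset());   // 9, cached from the tokenizer
      }
    }
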