Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 713075)
+++ CHANGES.txt	(working copy)
@@ -63,6 +63,12 @@
     Deprecated ConstantScoreRangeQuery (Mark Miller via Mike McCandless)
 
+ 7. LUCENE-1448: Added TokenStream.getFinalOffset(), to return the end
+    offset of the tokenization.  This is important when multiple
+    fields with the same name are added to a document, to ensure
+    offsets recorded in term vectors for all of the instances are
+    correct.  (Mike McCandless, Mark Miller)
+
 Optimizations
 
  1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java	(revision 713075)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java	(working copy)
@@ -31,7 +31,9 @@
 import org.apache.lucene.util.UnicodeUtil;
 
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.SinkTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -4228,4 +4230,159 @@
     r.close();
     dir.close();
   }
+
+  // LUCENE-1442
+  public void testDoubleOffsetCounting2() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(5, termOffsets[1].getStartOffset());
+    assertEquals(9, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionCharAnalyzer() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd   ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(8, termOffsets[1].getStartOffset());
+    assertEquals(12, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStopFilter() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StopAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+    assertEquals(2, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    assertEquals(9, termOffsets[1].getStartOffset());
+    assertEquals(13, termOffsets[1].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandard() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "abcd the  ", Field.Store.NO,
+                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    Field f2 = new Field("field", "crunch man", Field.Store.NO,
+                         Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(11, termOffsets[0].getStartOffset());
+    assertEquals(17, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(2);
+    assertEquals(18, termOffsets[0].getStartOffset());
+    assertEquals(21, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandardEmptyField() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    Field f = new Field("field", "", Field.Store.NO,
+                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    Field f2 = new Field("field", "crunch man", Field.Store.NO,
+                         Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(6, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(7, termOffsets[0].getStartOffset());
+    assertEquals(10, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
+
+  // LUCENE-1448
+  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+
+    Field f = new Field("field", "abcd", Field.Store.NO,
+                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f);
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+
+    Field f2 = new Field("field", "crunch", Field.Store.NO,
+                         Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+    doc.add(f2);
+
+    w.addDocument(doc);
+    w.close();
+
+    IndexReader r = IndexReader.open(dir);
+    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+    assertEquals(1, termOffsets.length);
+    assertEquals(0, termOffsets[0].getStartOffset());
+    assertEquals(4, termOffsets[0].getEndOffset());
+    termOffsets = tpv.getOffsets(1);
+    assertEquals(5, termOffsets[0].getStartOffset());
+    assertEquals(11, termOffsets[0].getEndOffset());
+    r.close();
+    dir.close();
+  }
 }
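[Reviewer note, not part of the patch: a minimal consumer-side sketch of the bookkeeping these tests assert -- accumulating offsets across two instances of the same field via the new TokenStream.getFinalOffset(), and falling back to the last token's endOffset when a stream returns -1. The class name FinalOffsetDemo and the driver shape are made up for illustration; it assumes this patch is applied.]

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Field;

    public class FinalOffsetDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new WhitespaceAnalyzer();
        // same value as testEndOffsetPositionCharAnalyzer: "abcd" + 3 trailing spaces
        String[] values = {"abcd   ", "abcd   "};
        Field f = new Field("field", values[0], Field.Store.NO, Field.Index.ANALYZED);
        int base = 0;                                    // running offset across instances
        final Token reusable = new Token();
        for (int i = 0; i < values.length; i++) {
          if (i > 0)
            base += a.getOffsetGap(f);                   // default gap: 1 for a tokenized field
          TokenStream ts = a.tokenStream("field", new StringReader(values[i]));
          int lastEnd = 0;
          for (Token t = ts.next(reusable); t != null; t = ts.next(reusable)) {
            System.out.println("[" + (base + t.startOffset()) + "," + (base + t.endOffset()) + ")");
            lastEnd = t.endOffset();
          }
          final int finalOffset = ts.getFinalOffset();
          base += (finalOffset == -1 ? lastEnd : finalOffset);  // fall back if unsupported
          ts.close();
        }
        // prints [0,4) then [8,12), matching the test's assertions
      }
    }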
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java	(revision 713075)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java	(working copy)
@@ -56,6 +56,7 @@
         offset += dataLen;
         dataLen = input.read(ioBuffer);
         if (dataLen == -1) {
+          dataLen = 0;                            // so next offset += dataLen won't decrement offset
           if (length > 0)
             break;
           else
@@ -88,6 +89,10 @@
     return reusableToken;
   }
 
+  public int getFinalOffset() {
+    return offset;
+  }
+
   public void reset(Reader input) throws IOException {
     super.reset(input);
     bufferIndex = 0;
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(revision 713075)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(working copy)
@@ -28,6 +28,7 @@
   private static final int DEFAULT_BUFFER_SIZE = 256;
 
   private boolean done;
+  private int finalOffset;
 
   public KeywordTokenizer(Reader input) {
     this(input, DEFAULT_BUFFER_SIZE);
@@ -55,14 +56,20 @@
       reusableToken.setTermLength(upto);
       reusableToken.setStartOffset(0);
       reusableToken.setEndOffset(upto);
+      finalOffset = upto;
       return reusableToken;
     }
     return null;
   }
 
+  public int getFinalOffset() {
+    return finalOffset;
+  }
+
   public void reset(Reader input) throws IOException {
     super.reset(input);
     this.done = false;
+    finalOffset = 0;
   }
 }
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(revision 713075)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(working copy)
@@ -102,6 +102,10 @@
   public int getMaxTokenLength() {
     return maxTokenLength;
   }
+
+  public int getFinalOffset() {
+    return scanner.yychar() + scanner.yylength();
+  }
 
   /**
    * Creates a new instance of the {@link StandardTokenizer}.  Attaches the
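[Reviewer note, not part of the patch: a test-style sketch that exercises the new method directly on the tokenizers changed above, assuming the patch is applied; the method name testGetFinalOffsetDirectly is made up. CharTokenizer subclasses report every char consumed, including trailing whitespace that never became a token.]

    public void testGetFinalOffsetDirectly() throws Exception {
      final Token reusable = new Token();

      Tokenizer t = new WhitespaceTokenizer(new StringReader("abcd   "));
      while (t.next(reusable) != null) {}        // getFinalOffset is only valid once exhausted
      assertEquals(7, t.getFinalOffset());       // 4 token chars + 3 trailing spaces

      Tokenizer k = new KeywordTokenizer(new StringReader("abcd"));
      while (k.next(reusable) != null) {}
      assertEquals(4, k.getFinalOffset());       // end offset of the single token
    }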
Index: src/java/org/apache/lucene/analysis/Analyzer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Analyzer.java	(revision 713075)
+++ src/java/org/apache/lucene/analysis/Analyzer.java	(working copy)
@@ -20,6 +20,8 @@
 import java.io.Reader;
 import java.io.IOException;
 
+import org.apache.lucene.document.Fieldable;
+
 /** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
  *  policy for extracting index terms from text.
  *
@@ -78,4 +80,22 @@
   {
     return 0;
   }
+
+  /**
+   * Just like {@link #getPositionIncrementGap}, except for
+   * Token offsets instead.  By default this returns 1 for
+   * tokenized fields, as if the fields were joined with an
+   * extra space character, and 0 for un-tokenized fields.
+   * This method is only called if the field produced at
+   * least one token for indexing.
+   *
+   * @param field the field just indexed
+   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
+   */
+  public int getOffsetGap(Fieldable field) {
+    if (field.isTokenized())
+      return 1;
+    else
+      return 0;
+  }
 }
Index: src/java/org/apache/lucene/analysis/TokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenFilter.java	(revision 713075)
+++ src/java/org/apache/lucene/analysis/TokenFilter.java	(working copy)
@@ -45,4 +45,8 @@
     super.reset();
     input.reset();
   }
+
+  public int getFinalOffset() {
+    return input.getFinalOffset();
+  }
 }
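[Reviewer note, not part of the patch: a sketch of overriding the new getOffsetGap() in step with getPositionIncrementGap() in a custom Analyzer, e.g. so that phrase matches and highlighter offsets never span instance boundaries. The class name GappedAnalyzer, the gap of 100, and the delegation to WhitespaceAnalyzer are all made up for illustration.]

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Fieldable;

    public class GappedAnalyzer extends Analyzer {
      private final Analyzer delegate = new WhitespaceAnalyzer();

      public TokenStream tokenStream(String fieldName, Reader reader) {
        return delegate.tokenStream(fieldName, reader);
      }

      // leave a large hole in positions between instances of the same field...
      public int getPositionIncrementGap(String fieldName) {
        return 100;
      }

      // ...and keep the recorded offsets consistent with that hole
      public int getOffsetGap(Fieldable field) {
        return field.isTokenized() ? 100 : 0;
      }
    }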
Index: src/java/org/apache/lucene/analysis/TokenStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenStream.java	(revision 713075)
+++ src/java/org/apache/lucene/analysis/TokenStream.java	(working copy)
@@ -107,4 +107,18 @@
 
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
+
+  /** Return the final offset.  It's only valid to call this
+   *  method once the stream is exhausted (i.e., {@link
+   *  #next(Token)} has returned null).  If this method
+   *  returns -1, the caller should fall back to the
+   *  endOffset of the last token it saw.
+   *
+   *  Analyzer chains for fields that have multiple
+   *  instances per document under the same field name
+   *  should implement this method to ensure the offsets of
+   *  all instances are indexed correctly.
+   */
+  public int getFinalOffset() {
+    return -1;
+  }
 }
Index: src/java/org/apache/lucene/index/DocInverterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/DocInverterPerField.java	(revision 713075)
+++ src/java/org/apache/lucene/index/DocInverterPerField.java	(working copy)
@@ -71,8 +71,11 @@
     // TODO FI: this should be "genericized" to querying
     // consumer if it wants to see this particular field
     // tokenized.
+
     if (field.isIndexed() && doInvert) {
 
+      final boolean anyToken;
+
       if (fieldState.length > 0)
         fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
@@ -91,6 +94,7 @@
         fieldState.offset += valueLength;
         fieldState.length++;
         fieldState.position++;
+        anyToken = valueLength > 0;
       } else {                                  // tokenized field
         final TokenStream stream;
         final TokenStream streamValue = field.tokenStreamValue();
@@ -119,7 +123,7 @@
 
         // reset the TokenStream to the first token
         stream.reset();
-
+        final int startLength = fieldState.length;
         try {
           int offsetEnd = fieldState.offset-1;
           final Token localToken = perThread.localToken;
@@ -134,6 +138,7 @@
             Token token = stream.next(localToken);
             if (token == null) break;
+
             final int posIncr = token.getPositionIncrement();
             fieldState.position += posIncr - 1;
             if (posIncr == 0)
@@ -161,12 +166,23 @@
               break;
             }
           }
-          fieldState.offset = offsetEnd+1;
+
+          final int finalOffset = stream.getFinalOffset();
+          if (finalOffset == -1)
+            fieldState.offset = offsetEnd;
+          else
+            fieldState.offset += finalOffset;
+
+          anyToken = fieldState.length > startLength;
+
         } finally {
           stream.close();
         }
       }
 
+      if (anyToken)
+        fieldState.offset += docState.analyzer.getOffsetGap(field);
+
       fieldState.boost *= field.getBoost();
     }
   }
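[Reviewer note, not part of the patch: a worked trace of the new bookkeeping above for testEndOffsetPositionStopFilter, where "abcd the" (8 chars) is added twice under StopAnalyzer.]

    // instance 1: "abcd" indexed at [0,4); "the" is removed by StopFilter.
    // stream.getFinalOffset() -> 8: StopFilter is a TokenFilter, so it
    //   delegates to the underlying CharTokenizer, which consumed all 8 chars.
    // fieldState.offset += 8                           -> offset = 8
    // anyToken is true (one token survived), so
    // fieldState.offset += getOffsetGap(field), i.e. 1 -> offset = 9
    // instance 2: "abcd" indexed at [9,13), matching the test's assertions.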