Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 713005) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -32,6 +32,7 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.SinkTokenizer; import org.apache.lucene.analysis.TokenFilter; @@ -4228,4 +4229,71 @@ r.close(); dir.close(); } + + // LUCENE-1448 + public void testEndOffsetPositionCharAnalyzer() throws Exception { + MockRAMDirectory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + Field f = new Field("field", "abcd ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); + doc.add(f); + doc.add(f); + w.addDocument(doc); + w.close(); + + IndexReader r = IndexReader.open(dir); + TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); + assertEquals(2, termOffsets.length); + assertEquals(0, termOffsets[0].getStartOffset()); + assertEquals(4, termOffsets[0].getEndOffset()); + assertEquals(7, termOffsets[1].getStartOffset()); + assertEquals(11, termOffsets[1].getEndOffset()); + r.close(); + dir.close(); + } + + // LUCENE-1448 + public void testEndOffsetPositionStopFilter() throws Exception { + MockRAMDirectory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, new StopAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); + doc.add(f); + doc.add(f); + w.addDocument(doc); + w.close(); + + IndexReader r = 
IndexReader.open(dir); + TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); + assertEquals(2, termOffsets.length); + assertEquals(0, termOffsets[0].getStartOffset()); + assertEquals(4, termOffsets[0].getEndOffset()); + assertEquals(7, termOffsets[1].getStartOffset()); + assertEquals(11, termOffsets[1].getEndOffset()); + r.close(); + dir.close(); + } + + // LUCENE-1448 + public void testEndOffsetPositionStandard() throws Exception { + MockRAMDirectory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); + doc.add(f); + doc.add(f); + w.addDocument(doc); + w.close(); + + IndexReader r = IndexReader.open(dir); + TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); + assertEquals(2, termOffsets.length); + assertEquals(0, termOffsets[0].getStartOffset()); + assertEquals(4, termOffsets[0].getEndOffset()); + assertEquals(7, termOffsets[1].getStartOffset()); + assertEquals(11, termOffsets[1].getEndOffset()); + r.close(); + dir.close(); + } + } Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 713005) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -88,6 +88,10 @@ return reusableToken; } + public int getFinalOffset() { + return offset; + } + public void reset(Reader input) throws IOException { super.reset(input); bufferIndex = 0; Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 713005) +++ 
src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy) @@ -28,6 +28,7 @@ private static final int DEFAULT_BUFFER_SIZE = 256; private boolean done; + private int finalOffset; public KeywordTokenizer(Reader input) { this(input, DEFAULT_BUFFER_SIZE); @@ -55,14 +56,20 @@ reusableToken.setTermLength(upto); reusableToken.setStartOffset(0); reusableToken.setEndOffset(upto); + finalOffset = upto; return reusableToken; } return null; } + public int getFinalOffset() { + return finalOffset; + } + public void reset(Reader input) throws IOException { super.reset(input); this.done = false; + finalOffset = 0; } } Index: src/java/org/apache/lucene/analysis/TokenFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 713005) +++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy) @@ -45,4 +45,8 @@ super.reset(); input.reset(); } + + public int getFinalOffset() { + return input.getFinalOffset(); + } } Index: src/java/org/apache/lucene/analysis/TokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenStream.java (revision 713005) +++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy) @@ -107,4 +107,18 @@ /** Releases resources associated with this stream. */ public void close() throws IOException {} + + /** Return the final offset. It's only valid to call this + * method once the stream is exhausted (i.e., {@link + * #next(Token)} has returned null). If this method + * returns -1, then the caller should fallback to the + * endOffset of the last token it saw. + * + *

<p>Analyzer chains for fields that have multiple + * instances per document under the same field name + * should implement this method to ensure the offsets of + * all fields are correctly indexed.</p>

*/ + public int getFinalOffset() { + return -1; + } } Index: src/java/org/apache/lucene/index/DocInverterPerField.java =================================================================== --- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 713005) +++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy) @@ -161,7 +161,13 @@ break; } } - fieldState.offset = offsetEnd+1; + + final int finalOffset = stream.getFinalOffset(); + if (finalOffset == -1) + fieldState.offset = offsetEnd+1; + else + fieldState.offset += finalOffset; + } finally { stream.close(); }