Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java	(revision 1060461)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java	(working copy)
@@ -233,7 +233,10 @@
           .size()]);
       ArrayUtil.quickSort(tokensInOriginalOrder, new Comparator<Token>() {
         public int compare(Token t1, Token t2) {
-          return t1.startOffset() - t2.endOffset();
+          if (t1.startOffset() == t2.startOffset())
+            return t1.endOffset() - t2.endOffset();
+          else
+            return t1.startOffset() - t2.startOffset();
         }
       });
     }
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java	(revision 0)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java	(revision 0)
@@ -0,0 +1,173 @@
+package org.apache.lucene.search.highlight;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TokenSourcesTest extends LuceneTestCase {
+  private static final String FIELD = "text";
+
+  private static final class OverlapAnalyzer extends Analyzer {
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new TokenStreamOverlap();
+    }
+  }
+
+  private static final class TokenStreamOverlap extends TokenStream {
+    private Token[] tokens;
+
+    private int i = -1;
+
+    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+    private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
+
+    public TokenStreamOverlap() {
+      reset();
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      this.i++;
+      if (this.i >= this.tokens.length) {
+        return false;
+      }
+      clearAttributes();
+      termAttribute.setEmpty().append(this.tokens[i]);
+      offsetAttribute.setOffset(this.tokens[i].startOffset(),
+          this.tokens[i].endOffset());
+      positionIncrementAttribute.setPositionIncrement(this.tokens[i]
+          .getPositionIncrement());
+      return true;
+    }
+
+    @Override
+    public void reset() {
+      this.i = -1;
+      this.tokens = new Token[] {
+          new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
+          new Token(new char[] { '{', 'f', 'o', 'x', '}' }, 0, 5, 0, 7),
+          new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
+          new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11),
+          new Token(new char[] { 'n', 'o', 't' }, 0, 3, 12, 15),
+          new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) };
+      this.tokens[1].setPositionIncrement(0);
+    }
+  }
+
+  public void testOverlapWithOffset() throws CorruptIndexException,
+      LockObtainFailedException, IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox did not jump";
+    final Directory directory = newDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
+    try {
+      final Document document = new Document();
+      document.add(new Field(FIELD, new TokenStreamOverlap(),
+          TermVector.WITH_OFFSETS));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = IndexReader.open(directory, true);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      try {
+        final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
+        query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
+        query.add(new SpanTermQuery(new Term(FIELD, "fox")));
+        // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
+        // new SpanTermQuery(new Term(FIELD, "{fox}")),
+        // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
+
+        TopDocs hits = indexSearcher.search(query, 1);
+        assertEquals(1, hits.totalHits);
+        final Highlighter highlighter = new Highlighter(
+            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+            new QueryScorer(query));
+        final TokenStream tokenStream = TokenSources
+            .getTokenStream(
+                (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+                false);
+        assertEquals("<B>the fox</B> did not jump",
+            highlighter.getBestFragment(tokenStream, TEXT));
+      } finally {
+        indexSearcher.close();
+      }
+    } finally {
+      indexReader.close();
+      directory.close();
+    }
+  }
+
+  public void testOverlapWithPositionsAndOffset() throws CorruptIndexException,
+      LockObtainFailedException, IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox did not jump";
+    final Directory directory = newDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
+    try {
+      final Document document = new Document();
+      document.add(new Field(FIELD, new TokenStreamOverlap(),
+          TermVector.WITH_POSITIONS_OFFSETS));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = IndexReader.open(directory, true);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      try {
+        final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
+        query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
+        query.add(new SpanTermQuery(new Term(FIELD, "fox")));
+        // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
+        // new SpanTermQuery(new Term(FIELD, "{fox}")),
+        // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
+
+        TopDocs hits = indexSearcher.search(query, 1);
+        assertEquals(1, hits.totalHits);
+        final Highlighter highlighter = new Highlighter(
+            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+            new QueryScorer(query));
+        final TokenStream tokenStream = TokenSources
            .getTokenStream(
+                (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+                false);
+        assertEquals("<B>the fox</B> did not jump",
+            highlighter.getBestFragment(tokenStream, TEXT));
+      } finally {
+        indexSearcher.close();
+      }
+    } finally {
+      indexReader.close();
+      directory.close();
+    }
+  }
+
+}
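
Note (not part of the patch): the corrected comparator orders tokens by start offset and breaks ties on end offset. The old expression, t1.startOffset() - t2.endOffset(), is not a consistent ordering (it even reports a token as smaller than itself whenever its start offset is below its end offset), so overlapping tokens such as "the" (offsets 0-3) and "{fox}" (offsets 0-7) from the test stream could come back in an arbitrary order and be highlighted twice. Below is a minimal standalone sketch of the patched ordering, reusing the Token constructor and the ArrayUtil.quickSort call that appear above; the class name OverlapOrderingSketch is made up for illustration and is not part of the patch.

import java.util.Comparator;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.ArrayUtil;

// Hypothetical demo class (not from the patch); it only illustrates the
// comparator introduced in TokenSources.java above.
public class OverlapOrderingSketch {
  public static void main(String[] args) {
    // Same overlapping tokens as in TokenStreamOverlap: "{fox}" and "the"
    // both start at offset 0.
    Token[] tokens = {
        new Token(new char[] { '{', 'f', 'o', 'x', '}' }, 0, 5, 0, 7),
        new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
        new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7) };
    ArrayUtil.quickSort(tokens, new Comparator<Token>() {
      public int compare(Token t1, Token t2) {
        // Patched ordering: primary key is startOffset, ties broken on endOffset.
        if (t1.startOffset() == t2.startOffset())
          return t1.endOffset() - t2.endOffset();
        else
          return t1.startOffset() - t2.startOffset();
      }
    });
    // Prints 0-3, 0-7, 4-7: "the" first, then the overlapping "{fox}", then "fox".
    for (Token t : tokens) {
      System.out.println(t.startOffset() + "-" + t.endOffset());
    }
  }
}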