Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1464648) +++ lucene/CHANGES.txt (working copy) @@ -209,6 +209,10 @@ with target<=current (in this case the behavior of advance is undefined). (Adrien Grand) +* LUCENE-4899: FastVectorHighlihgter failed with StringIndexOutOfBoundsException + if a single highlight phrase or term was greater than the fragCharSize producing + negative string offsets. (Simon Willnauer) + Documentation * LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (revision 1464648) +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (working copy) @@ -29,18 +29,19 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.highlight.SimpleSpanFragmenter; -import org.apache.lucene.search.highlight.TokenSources; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; public class FastVectorHighlighterTest extends LuceneTestCase { + public void testSimpleHighlightTest() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); @@ -71,6 +72,135 @@ dir.close(); } + + // see LUCENE-4899 + public void testPhraseHighlightTest() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + Document doc = new Document(); + FieldType type = new FieldType(TextField.TYPE_STORED); + type.setStoreTermVectorOffsets(true); + type.setStoreTermVectorPositions(true); + type.setStoreTermVectors(true); + type.freeze(); + Field longTermField = new Field("long_term", "This is a test thisisaverylongwordandmakessurethisfails where foo is highlighed and should be highlighted", type); + Field noLongTermField = new Field("no_long_term", "This is a test where foo is highlighed and should be highlighted", type); + + doc.add(longTermField); + doc.add(noLongTermField); + writer.addDocument(doc); + FastVectorHighlighter highlighter = new FastVectorHighlighter(); + IndexReader reader = DirectoryReader.open(writer, true); + int docId = 0; + String field = "no_long_term"; + { + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(field, "test")), Occur.MUST); + query.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("foo is highlighed and", bestFragments[0]); + } + { + BooleanQuery query = new BooleanQuery(); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term(field, "test")); + pq.add(new Term(field, "foo")); + pq.add(new Term(field, "highlighed")); + pq.setSlop(5); + query.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(pq, Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(0, bestFragments.length); + bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 30, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("a test where foo is highlighed and", bestFragments[0]); + + } + { + PhraseQuery query = new PhraseQuery(); + query.add(new Term(field, "test")); + query.add(new Term(field, "foo")); + query.add(new Term(field, "highlighed")); + query.setSlop(3); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(0, bestFragments.length); + bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 30, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("a test where foo is highlighed and", bestFragments[0]); + + } + { + PhraseQuery query = new PhraseQuery(); + query.add(new Term(field, "test")); + query.add(new Term(field, "foo")); + query.add(new Term(field, "highlighted")); + query.setSlop(30); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + assertEquals(0, bestFragments.length); + } + { + BooleanQuery query = new BooleanQuery(); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term(field, "test")); + pq.add(new Term(field, "foo")); + pq.add(new Term(field, "highlighed")); + pq.setSlop(5); + BooleanQuery inner = new BooleanQuery(); + inner.add(pq, Occur.MUST); + inner.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(inner, Occur.MUST); + query.add(pq, Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + assertEquals(0, bestFragments.length); + + bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 30, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("a test where foo is highlighed and", bestFragments[0]); + } + + field = "long_term"; + { + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(field, + "thisisaverylongwordandmakessurethisfails")), Occur.MUST); + query.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("thisisaverylongwordandmakessurethisfails", + bestFragments[0]); + } + reader.close(); + writer.close(); + dir.close(); + } + public void testCommonTermsQueryHighlightTest() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))); Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java (revision 1464648) +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java (working copy) @@ -42,7 +42,7 @@ SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "abcdefghijklmnopqrs")), "abcdefghijklmnopqrs" ), sflb.minFragCharSize ); assertEquals( 1, ffl.getFragInfos().size() ); - assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,18)", ffl.getFragInfos().get( 0 ).toString() ); + assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,19)", ffl.getFragInfos().get( 0 ).toString() ); } public void testSmallerFragSizeThanPhraseQuery() throws Exception { @@ -55,7 +55,7 @@ FieldFragList ffl = sflb.createFieldFragList( fpl(phraseQuery, "abcdefgh jklmnopqrs" ), sflb.minFragCharSize ); assertEquals( 1, ffl.getFragInfos().size() ); if (VERBOSE) System.out.println( ffl.getFragInfos().get( 0 ).toString() ); - assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(1,19)", ffl.getFragInfos().get( 0 ).toString() ); + assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(0,21)", ffl.getFragInfos().get( 0 ).toString() ); } public void test1TermIndex() throws Exception { Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java (revision 1464648) +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java (working copy) @@ -46,63 +46,98 @@ this( MARGIN_DEFAULT ); } - protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){ - + protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){ if( fragCharSize < minFragCharSize ) throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." ); List wpil = new ArrayList(); - Iterator ite = fieldPhraseList.getPhraseList().iterator(); + IteratorQueue queue = new IteratorQueue(fieldPhraseList.getPhraseList().iterator()); WeightedPhraseInfo phraseInfo = null; int startOffset = 0; - boolean taken = false; - while( true ){ - if( !taken ){ - if( !ite.hasNext() ) break; - phraseInfo = ite.next(); + while((phraseInfo = queue.top()) != null){ + // if the phrase violates the border of previous fragment, discard it and try next phrase + if( phraseInfo.getStartOffset() < startOffset ) { + queue.removeTop(); + continue; } - taken = false; - if( phraseInfo == null ) break; - - // if the phrase violates the border of previous fragment, discard it and try next phrase - if( phraseInfo.getStartOffset() < startOffset ) continue; - + wpil.clear(); - wpil.add( phraseInfo ); - int firstOffset = phraseInfo.getStartOffset(); - int st = phraseInfo.getStartOffset() - margin < startOffset ? - startOffset : phraseInfo.getStartOffset() - margin; - int en = st + fragCharSize; - if( phraseInfo.getEndOffset() > en ) - en = phraseInfo.getEndOffset(); - - int lastEndOffset = phraseInfo.getEndOffset(); - while( true ){ - if( ite.hasNext() ){ - phraseInfo = ite.next(); - taken = true; - if( phraseInfo == null ) break; - } - else + final int currentPhraseStartOffset = phraseInfo.getStartOffset(); + int currentPhraseEndOffset = phraseInfo.getEndOffset(); + int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset); + int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize); + if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) { + wpil.add(phraseInfo); + } + while((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd + if (phraseInfo.getEndOffset() <= spanEnd) { + currentPhraseEndOffset = phraseInfo.getEndOffset(); + if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) { + wpil.add(phraseInfo); + } + } else { break; - if( phraseInfo.getEndOffset() <= en ){ - wpil.add( phraseInfo ); - lastEndOffset = phraseInfo.getEndOffset(); } - else - break; } - int matchLen = lastEndOffset - firstOffset; - //now recalculate the start and end position to "center" the result - int newMargin = (fragCharSize-matchLen)/2; - st = firstOffset - newMargin; - if(st fragCharSize prevent IAOOB here + spanStart = currentPhraseStartOffset - newMargin; + if (spanStart < startOffset) { + spanStart = startOffset; + } + // whatever is bigger here we grow this out + spanEnd = spanStart + Math.max(matchLen, fragCharSize); + startOffset = spanEnd; + fieldFragList.add(spanStart, spanEnd, wpil); } return fieldFragList; } + + /** + * A predicate to decide if the given {@link WeightedPhraseInfo} should be + * accepted as a highlighted phrase or if it should be discarded. + *

+ * The default implementation discards phrases that are composed of more than one term + * and where the matchLength exceeds the fragment character size. + * + * @param info the phrase info to accept + * @param matchLength the match length of the current phrase + * @param fragCharSize the configured fragment character size + * @return true if this phrase info should be accepted as a highligh phrase + */ + protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) { + return info.getTermsOffsets().size() <= 1 || matchLength <= fragCharSize; + } + + private static final class IteratorQueue { + private final Iterator iter; + private T top; + + public IteratorQueue(Iterator iter) { + this.iter = iter; + T removeTop = removeTop(); + assert removeTop == null; + } + + public T top() { + return top; + } + + public T removeTop() { + T currentTop = top; + if (iter.hasNext()) { + top = iter.next(); + } else { + top = null; + } + return currentTop; + } + + } + } Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (revision 1464648) +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (working copy) @@ -81,7 +81,7 @@ if( ti != null ) nextMap = currMap.getTermMap( ti.getText() ); if( ti == null || nextMap == null ){ - if( ti != null ) + if( ti != null ) fieldTermStack.push( ti ); if( currMap.isValidTermOrPhrase( phraseCandidate ) ){ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );