diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index d46f5c2..d456f59 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -39,10 +39,12 @@ import org.apache.lucene.search.ConstantScoreRangeQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.Spans; @@ -150,6 +152,53 @@ public class WeightedSpanTermExtractor { extract((Query) iterator.next(), disjunctTerms); } terms.putAll(disjunctTerms); + } else if (query instanceof MultiPhraseQuery) { /* Rebuild the MultiPhraseQuery as a SpanNearQuery holding one SpanOrQuery per distinct position, then extract weighted span terms from that span query. */ + final MultiPhraseQuery mpq = (MultiPhraseQuery) query; + final List termArrays = mpq.getTermArrays(); + final int[] positions = mpq.getPositions(); + if (positions.length > 0) { + + int maxPosition = positions[positions.length - 1]; /* seeded with the last entry; the loop below scans the remaining entries for a larger value */ + for (int i = 0; i < positions.length - 1; ++i) { + if (positions[i] > maxPosition) { + maxPosition = positions[i]; + } + } + + final List[] disjunctLists = new List[maxPosition + 1]; /* indexed by position; slots for positions without terms stay null */ + int distinctPositions = 0; + + for (int i = 0; i < termArrays.size(); ++i) { + final Term[] termArray = (Term[]) termArrays.get(i); + List disjuncts = disjunctLists[positions[i]]; + if (disjuncts == null) { /* first term array seen for this position */ + disjuncts = (disjunctLists[positions[i]] = new ArrayList(termArray.length)); 
++distinctPositions; + } + for (int j = 0; j < termArray.length; ++j) { + disjuncts.add(new SpanTermQuery(termArray[j])); + } + } + + int positionGaps = 0; /* counts slots of disjunctLists that have no terms */ + int position = 0; /* next free index in clauses */ + final SpanQuery[] clauses = new SpanQuery[distinctPositions]; + for (int i = 0; i < disjunctLists.length; ++i) { + List disjuncts = disjunctLists[i]; + if (disjuncts != null) { + clauses[position++] = new SpanOrQuery((SpanQuery[]) disjuncts.toArray(new SpanQuery[disjuncts.size()])); + } else { + ++positionGaps; + } + } + + final int slop = mpq.getSlop(); + final boolean inorder = (slop == 0); /* in-order matching is requested only when the query allows no slop */ + + SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder); /* empty positions are added to the slop, presumably so the clauses can still match across them */ + sp.setBoost(query.getBoost()); + extractWeightedSpanTerms(terms, sp); + } } else { // NO-OP } diff --git a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index 59179d4..a0f9a7b 100644 --- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -50,6 +50,7 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; @@ -238,6 +239,38 @@ public class HighlighterTest extends TestCase implements Formatter { } } + public void testSpanMultiPhraseQueryHighlighting() throws Exception { /* Two-position query: either "wordx" or "wordb" first, then "wordy"; expects 6 highlights. */ + MultiPhraseQuery mpq = new MultiPhraseQuery(); + + mpq.add(new Term[] { new Term(FIELD_NAME, "wordx"), + new Term(FIELD_NAME, "wordb") }); + mpq.add(new Term(FIELD_NAME, "wordy")); + + doSearching(mpq); + + final int maxNumFragmentsRequired = 2; + assertExpectedHighlightCount(maxNumFragmentsRequired, 6); 
+ } + + public void testSpanMultiPhraseQueryHighlightingWithGap() throws Exception { /* Terms at explicit positions 0 and 2, leaving position 1 empty; expects 2 highlights. */ + MultiPhraseQuery mpq = new MultiPhraseQuery(); + + /* + * The toString of MultiPhraseQuery doesn't work so well with these + * out-of-order additions, but the Query itself seems to match accurately. + */ + + mpq.add(new Term[] { new Term(FIELD_NAME, "wordz") }, 2); /* position 2 is added before position 0 */ + mpq.add(new Term[] { new Term(FIELD_NAME, "wordx") }, 0); + + doSearching(mpq); + + final int maxNumFragmentsRequired = 1; + final int expectedHighlights = 2; + + assertExpectedHighlightCount(maxNumFragmentsRequired, expectedHighlights); + } + public void testNearSpanSimpleQuery() throws Exception { doSearching(new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD_NAME, "beginning")), @@ -1176,6 +1209,26 @@ public class HighlighterTest extends TestCase implements Formatter { System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits = searcher.search(query); } + + public void assertExpectedHighlightCount(final int maxNumFragmentsRequired, + final int expectedHighlights) throws Exception { /* For each entry in hits: highlight the stored FIELD_NAME text using a SpanScorer built from query, print the fragments, and assert numHighlights equals expectedHighlights. */ + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer + .tokenStream(FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, new SpanScorer(query, + FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == expectedHighlights); /* NOTE(review): numHighlights is not reset inside this loop; with more than one hit this presumably compares a running total - confirm that is intended */ + } + } /* * @see TestCase#setUp()