Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java (revision 1144713) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java (working copy) @@ -7,6 +7,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer.ScorerVisitor; +import org.apache.lucene.search.positions.PositionIntervalIterator; /** * for testing only - collect the first maxDocs docs and throw the rest away @@ -21,14 +22,41 @@ } protected Scorer scorer; - + + @Override public void collect(int doc) throws IOException { if (count >= docs.length) return; - assert (scorer != null); - ScorePosDoc spdoc = new ScorePosDoc (doc, scorer.score(), scorer.positions(), 32, false); - docs[count++] = spdoc; + addDoc (doc); + // consume any remaining positions the scorer didn't report + // NO: we need to rely on the scorer here + // scorer.setPositionCollector(null); + // docs[count-1].storePositions(scorer.positions()); + docs[count-1].score=scorer.score(); + /* + while (scorer.positions().next() != null) + ; + */ } + + private boolean addDoc (int doc) { + if (count <= 0 || docs[count-1].doc != doc) { + ScorePosDoc spdoc = new ScorePosDoc (doc); + docs[count++] = spdoc; + return true; + } + return false; + } + + public void collectPositions(Scorer scorer, PositionIntervalIterator positions) throws IOException { + int docID = scorer.docID(); + if (docID == Scorer.NO_MORE_DOCS) + docID = docs[count-1].doc; + else + addDoc (docID); + System.out.println ("collectPositions for doc=" + docID + " from " + scorer); + docs[count-1].storePositions(positions); + } public boolean acceptsDocsOutOfOrder() { return false; @@ -36,6 +64,8 @@ public void setScorer(Scorer scorer) { this.scorer = scorer; + 
scorer.setPositionCollector(this); + // If we want to visit the other scorers, we can, here... } public Scorer getScorer () { @@ -71,4 +101,5 @@ System.out.println ("parent=" + parent + ", child=" + child); } } + } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java (revision 1144713) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java (working copy) @@ -3,17 +3,15 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermVectorMapper; -import org.apache.lucene.index.TermVectorOffsetInfo; import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.Formatter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; +/* + */ public class PosHighlighter { private Formatter formatter; private Encoder encoder; @@ -67,14 +65,16 @@ int matchEnd = pom.getEndOffset(pos.end); int fragStart = Math.max(0, matchStart - (fragSize - (matchEnd-matchStart)) / 2); int fragEnd = Math.min(fragStart+fragSize, text.length()); - + // FIXME - non-initial fragments fail to highlight matches occurring between fragStart + // and matchStart + // FIXME - also; fragments should not overlap for (;;) { // Build up a single fragment, possibly including multiple positions if (matchStart > fragStart) buf.append (text, fragStart, matchStart); - buf.append (""); // TODO - parameterize + buf.append (""); // TODO - parameterize buf.append (text, matchStart, matchEnd); - buf.append 
(""); + buf.append (""); if (fragEnd <= matchEnd) { break; } @@ -120,47 +120,4 @@ // TODO - get maxNumFragments top fragments by score return null; } - - class PositionOffsetMapper extends TermVectorMapper { - private int maxPos = 0; - private static final int BUF_SIZE = 128; - int startOffset[] = new int[BUF_SIZE], endOffset[] = new int[BUF_SIZE]; - - public void setExpectations(String field, int numTerms, - boolean storeOffsets, boolean storePositions) { - } - - public void map(BytesRef term, int frequency, - TermVectorOffsetInfo[] offsets, int[] positions) - { - for (int i = 0; i < positions.length; i++) { - int pos = positions[i]; - if (pos >= startOffset.length) { - grow (pos + BUF_SIZE); - maxPos = pos; - } else if (pos > maxPos) { - maxPos = pos; - } - startOffset[pos] = offsets[i].getStartOffset(); - endOffset[pos] = offsets[i].getEndOffset(); - } - } - - private void grow (int size) { - startOffset = ArrayUtil.grow (startOffset, size); - endOffset = ArrayUtil.grow (endOffset, size); - } - - public int getStartOffset(int pos) { - return startOffset[pos]; - } - - public int getEndOffset(int pos) { - return endOffset[pos]; - } - - public int getMaxPosition() { - return maxPos; - } - } } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 0) @@ -0,0 +1,58 @@ +package org.apache.lucene.search.poshighlight; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import 
org.apache.lucene.search.positions.PositionIntervalIterator; +import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval; + +/** + * A TokenStream built from a String and predetermined PositionIntervals. + * The document is segmented into tokens within and between the intervals. The intervals + * are assumed to be non-overlapping. + * + * Maybe this should be built using a PositionIntervalIterator instead? + */ +public class PosTokenStream extends TokenStream { + + //this tokenizer generates four attributes: + // term, offset, positionIncrement? and type? + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + //private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final String text; + private final PositionIntervalIterator positions; + + // the index of the current position interval + private PositionInterval pos = null; + // whether we are currently at the start of an interval, or if false, the end. 
+ private boolean atBegin = false; + private final PositionOffsetMapper pom; + + public PosTokenStream (String text, PositionIntervalIterator positions, PositionOffsetMapper pom) { + this.text = text; + this.positions = positions; + this.pom = pom; + } + + @Override + public final boolean incrementToken() throws IOException { + pos = positions.next(); + if (pos == null){ + return false; + } + int b, e; + b = pom.getStartOffset(pos.begin); + e = pom.getEndOffset(pos.end); + termAtt.append(text, b, e); + offsetAtt.setOffset(b, e); + posIncrAtt.setPositionIncrement(1); + atBegin = !atBegin; + return true; + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java (revision 0) @@ -0,0 +1,35 @@ +package org.apache.lucene.search.poshighlight; + +import org.apache.lucene.search.positions.PositionIntervalIterator; + +public class PositionIntervalArrayIterator extends PositionIntervalIterator { + + private int next = 0; + private int count; + private PositionInterval[] positions; + + public PositionIntervalArrayIterator (PositionInterval[] positions, int count) { + super(null); + this.positions = positions; + this.count = count; + } + + @Override + public PositionInterval next() { + if (next >= count) + return null; + return positions[next++]; + } + + public PositionInterval current() { + if (next < count) + return positions[next]; + return null; + } + + @Override + public PositionIntervalIterator[] subs(boolean inOrder) { + return null; + } + +} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java 
=================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java (revision 0) @@ -0,0 +1,49 @@ +package org.apache.lucene.search.poshighlight; + +import org.apache.lucene.index.TermVectorMapper; +import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +public class PositionOffsetMapper extends TermVectorMapper { + private int maxPos = 0; + private static final int BUF_SIZE = 128; + int startOffset[] = new int[BUF_SIZE], endOffset[] = new int[BUF_SIZE]; + + public void setExpectations(String field, int numTerms, + boolean storeOffsets, boolean storePositions) { + } + + public void map(BytesRef term, int frequency, + TermVectorOffsetInfo[] offsets, int[] positions) + { + for (int i = 0; i < positions.length; i++) { + int pos = positions[i]; + if (pos >= startOffset.length) { + grow (pos + BUF_SIZE); + maxPos = pos; + } else if (pos > maxPos) { + maxPos = pos; + } + startOffset[pos] = offsets[i].getStartOffset(); + endOffset[pos] = offsets[i].getEndOffset(); + } + } + + private void grow (int size) { + startOffset = ArrayUtil.grow (startOffset, size); + endOffset = ArrayUtil.grow (endOffset, size); + } + + public int getStartOffset(int pos) { + return startOffset[pos]; + } + + public int getEndOffset(int pos) { + return endOffset[pos]; + } + + public int getMaxPosition() { + return maxPos; + } +} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionTreeIterator.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionTreeIterator.java (revision 1144713) +++ 
lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionTreeIterator.java (working copy) @@ -3,10 +3,11 @@ import java.io.IOException; import org.apache.lucene.search.positions.PositionIntervalIterator; -import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval; +// FIXME - unused - delete soon + // retrieves the positions from the leaves of a tree of PositionIntervalIterators -public class PositionTreeIterator { +public class PositionTreeIterator extends PositionIntervalIterator { static class Frame { Frame (PositionIntervalIterator positions) { @@ -25,18 +26,22 @@ int curframe = 0; public PositionTreeIterator (PositionIntervalIterator root) { + super(root.getScorer()); stack[0] = new Frame(root); } + /* without consuming... public PositionInterval next() throws IOException { PositionInterval pos; if (curframe < 0) return null; Frame f = stack[curframe]; if (f.subs == null) { - pos = stack[curframe].positions.next(); - if (pos != null) + pos = f.positions.current(); + if (pos != null) { + --curframe; return pos; + } } else if (f.isub < f.subs.length) { if (curframe >= stack.length) { @@ -49,4 +54,37 @@ --curframe; return next(); } + */ + /* advances the wrapped iterators */ + public PositionInterval next() throws IOException { + PositionInterval pos; + if (curframe < 0) + return null; + Frame f = stack[curframe]; + if (f.subs == null) { + pos = f.positions.next(); + if (pos != null) + return pos; + } + else if (f.isub < f.subs.length) { + if (curframe >= stack.length) { + throw new ArrayIndexOutOfBoundsException ("PositionTreeIterator stack depth > 32"); + } + stack[++curframe] = new Frame (f.subs[f.isub++]); + return next(); + } + // pop + --curframe; + return next(); + } + + public PositionInterval current() { + // implement this if we ever need this class again? 
+ return null; + } + + @Override + public PositionIntervalIterator[] subs(boolean inOrder) { + return null; + } } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java (revision 1144713) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java (working copy) @@ -8,30 +8,40 @@ import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval; import org.apache.lucene.util.ArrayUtil; -/** Used to accumulate span positions while scoring */ +/** Used to accumulate position intervals while scoring */ public class ScorePosDoc extends ScoreDoc { public int posCount = 0; public PositionInterval[] positions; - public ScorePosDoc(int doc, float score, PositionIntervalIterator posIter, int maxPositions, boolean orderByScore) throws IOException - { - super(doc, score); - assert doc == posIter.docID(); + public ScorePosDoc(int doc) { + super(doc, 0); positions = new PositionInterval[32]; - storePositions (new PositionTreeIterator (posIter), maxPositions, orderByScore); } - private void storePositions(PositionTreeIterator ptree, - int maxPositions, boolean orderByScore) throws IOException { - - for (PositionInterval pos = ptree.next(); pos != null; pos = ptree.next()) { - if (posCount >= positions.length) { - PositionInterval temp[] = new PositionInterval[positions.length * 2]; - System.arraycopy(positions, 0, temp, 0, positions.length); - } - positions[posCount++] = (PositionInterval) pos.clone(); + public void storePosition (PositionInterval pos) { + ensureStorage(); + positions[posCount++] = (PositionInterval) pos.clone(); + } + + private void ensureStorage () { + if (posCount >= positions.length) { + PositionInterval temp[] = new PositionInterval[positions.length * 2]; + System.arraycopy(positions, 0, temp, 0, 
positions.length); + positions = temp; } + } + + public void storePositions (PositionIntervalIterator posIter) throws IOException { + // storePositions (new PositionTreeIterator (posIter)); + PositionInterval pos; + while ((pos = posIter.next()) != null) { + System.out.println (pos); + storePosition (pos); + } + } + + public PositionInterval[] sortedPositions() { ArrayUtil.mergeSort(positions, 0, posCount, new Comparator() { public int compare(PositionInterval o1, PositionInterval o2) { return @@ -43,6 +53,6 @@ } }); + return positions; } - } Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java (revision 1144713) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java (working copy) @@ -1,14 +1,11 @@ package org.apache.lucene.search.poshighlight; -import java.io.BufferedReader; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; @@ -21,35 +18,27 @@ import org.apache.lucene.index.codecs.CoreCodecProvider; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; 
import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.poshighlight.PosCollector; -import org.apache.lucene.search.poshighlight.PosHighlighter; -import org.apache.lucene.search.positions.OrderedConjunctionPositionIterator; -import org.apache.lucene.search.positions.PositionIntervalIterator; -import org.apache.lucene.search.positions.WithinPositionIterator; -import org.apache.lucene.search.positions.PositionIntervalIterator.PositionIntervalFilter; -import org.apache.lucene.search.spans.MockSpanQuery; +import org.apache.lucene.search.highlight.Highlighter; +import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; +import org.apache.lucene.search.highlight.SimpleFragmenter; +import org.apache.lucene.search.highlight.TextFragment; +import org.apache.lucene.search.positions.PositionFilterQuery; +import org.apache.lucene.search.positions.TestBlockPositionsIterator.BlockPositionIteratorFilter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.LuceneTestCase; -import org.junit.Ignore; + /** - * Notes: to fully implement, we need: - * 1) ability to walk the individual terms that matched, possibly in a hierarchical way - * if we want to implement really clever highlighting? - * 2) some Collector api like the one I made up, and support in Searcher - * 3) All (or more) queries implemented - * - * For hl perf testing we could test term queries only using the current impl - * @author sokolov - * + * TODO: + * Phrase and Span Queries + * positions callback API */ public class PosHighlighterTest extends LuceneTestCase { @@ -57,8 +46,12 @@ protected Analyzer analyzer; protected QueryParser parser; protected Directory dir; - protected IndexSearcher searcher; + protected IndexSearcher searcher; + private static final String PORRIDGE_VERSE = + "Pease porridge hot! Pease porridge cold! 
Pease porridge in the pot nine days old! Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! Pease porridge hot! Pease porridge cold!"; + @Override public void setUp() throws Exception { super.setUp(); @@ -99,28 +92,76 @@ searcher = new IndexSearcher( dir, true ); } - private String[] doSearch(Query q) throws IOException { + private String[] doSearch(Query q) throws IOException, InvalidTokenOffsetsException { return doSearch(q, 100); } - private String[] doSearch(Query q, int maxFragSize) throws IOException { - PosHighlighter ph = new PosHighlighter(); + private class ConstantScorer implements org.apache.lucene.search.highlight.Scorer { + + @Override + public TokenStream init(TokenStream tokenStream) throws IOException { + return tokenStream; + } + + @Override + public void startFragment(TextFragment newFragment) { + } + + @Override + public float getTokenScore() { + return 1; + } + + @Override + public float getFragmentScore() { + return 1; + } + } + + private String[] doSearch(Query q, int maxFragSize) throws IOException, InvalidTokenOffsetsException { + return doSearch (q, maxFragSize, 0); + } + + private String[] doSearch(Query q, int maxFragSize, int docIndex) throws IOException, InvalidTokenOffsetsException { + //PosHighlighter ph = new PosHighlighter(); + // ConstantScorer is a fragment Scorer, not a search result (document) Scorer + Highlighter highlighter = new Highlighter (new ConstantScorer()); + highlighter.setTextFragmenter(new SimpleFragmenter(maxFragSize)); PosCollector collector = new PosCollector (10); + if (q instanceof MultiTermQuery) { + ((MultiTermQuery)q).setRewriteMethod (MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + q = q.rewrite(searcher.getIndexReader()); + } searcher.search(q, collector); - return ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, maxFragSize); + //return ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, maxFragSize); + 
ScorePosDoc doc = collector.docs[docIndex]; + if (doc == null) + return null; + String text = searcher.getIndexReader().document(doc.doc).getFieldable(F).stringValue(); + PositionOffsetMapper pom = new PositionOffsetMapper (); + // FIXME: test error cases: for non-stored fields, and fields w/no term vectors + searcher.getIndexReader().getTermFreqVector(doc.doc, F, pom); + + TextFragment[] fragTexts = highlighter.getBestTextFragments(new PosTokenStream + (text, new PositionIntervalArrayIterator(doc.sortedPositions(), doc.posCount), pom), + text, false, 10); + String[] frags = new String[fragTexts.length]; + for (int i = 0; i < frags.length; i++) + frags[i] = fragTexts[i].toString(); + return frags; } public void testTerm () throws Exception { - insertDocs(analyzer, "This is a test"); + insertDocs(analyzer, "This is a test test"); String frags[] = doSearch (new TermQuery(new Term(F, "test"))); - assertEquals ("This is a test", frags[0]); + assertEquals ("This is a test test", frags[0]); } public void testSeveralSnippets () throws Exception { String input = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " + "Let us see what happens to long in this case."; - String gold = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " + - "Let us see what happens to long in this case."; + String gold = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. 
" + + "Let us see what happens to long in this case."; insertDocs(analyzer, input); String frags[] = doSearch (new TermQuery(new Term(F, "long")), input.length()); assertEquals (gold, frags[0]); @@ -132,7 +173,7 @@ bq.add(new BooleanClause (new TermQuery(new Term(F, "This")), Occur.MUST)); bq.add(new BooleanClause (new TermQuery(new Term(F, "test")), Occur.MUST)); String frags[] = doSearch (bq); - assertEquals ("This is a test", frags[0]); + assertEquals ("This is a test", frags[0]); } public void testBooleanAndOtherOrder () throws Exception { @@ -141,41 +182,57 @@ bq.add(new BooleanClause (new TermQuery(new Term(F, "test")), Occur.MUST)); bq.add(new BooleanClause (new TermQuery(new Term(F, "This")), Occur.MUST)); String frags[] = doSearch (bq); - assertEquals ("This is a test", frags[0]); + assertEquals ("This is a test", frags[0]); } - + public void testBooleanOr () throws Exception { - // OR queries not implemented yet... insertDocs(analyzer, "This is a test"); BooleanQuery bq = new BooleanQuery(); bq.add(new BooleanClause (new TermQuery(new Term(F, "test")), Occur.SHOULD)); bq.add(new BooleanClause (new TermQuery(new Term(F, "This")), Occur.SHOULD)); String frags[] = doSearch (bq); - assertEquals ("This is a test", frags[0]); + assertEquals ("This is a test", frags[0]); } - @Ignore("not supproted yet") + public void testBooleanNrShouldMatch () throws Exception { + insertDocs(analyzer, "a b c d e f g h i"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause (new TermQuery(new Term(F, "a")), Occur.SHOULD)); + bq.add(new BooleanClause (new TermQuery(new Term(F, "b")), Occur.SHOULD)); + bq.add(new BooleanClause (new TermQuery(new Term(F, "no")), Occur.SHOULD)); + + // This generates a ConjunctionSumScorer + bq.setMinimumNumberShouldMatch(2); + String frags[] = doSearch (bq); + assertEquals ("a b c d e f g h i", frags[0]); + + // This generates no scorer + bq.setMinimumNumberShouldMatch(3); + frags = doSearch (bq); + assertNull (frags); + + // This 
generates a DisjunctionSumScorer + bq.setMinimumNumberShouldMatch(2); + bq.add(new BooleanClause (new TermQuery(new Term(F, "c")), Occur.SHOULD)); + frags = doSearch (bq); + assertEquals ("a b c d e f g h i", frags[0]); + } + public void testPhrase() throws Exception { - insertDocs(analyzer, "This is a test"); + insertDocs(analyzer, "is it that this is a test, is it"); BooleanQuery bq = new BooleanQuery(); bq.add(new BooleanClause (new TermQuery(new Term(F, "is")), Occur.MUST)); bq.add(new BooleanClause (new TermQuery(new Term(F, "a")), Occur.MUST)); - MockSpanQuery msq = new MockSpanQuery(bq, false, F, new Filter(1)); - String frags[] = doSearch (msq); - assertEquals ("This is a test", frags[0]); + PositionFilterQuery pfq = new PositionFilterQuery(bq, new BlockPositionIteratorFilter()); + String frags[] = doSearch (pfq); + // make sure we highlight the phrase, and not the terms outside the phrase + assertEquals ("is it that this is a test, is it", frags[0]); } - public static class Filter implements PositionIntervalFilter { - private int slop; - public Filter(int slop) { - this.slop = slop; - } - @Override - public PositionIntervalIterator filter(PositionIntervalIterator iter) { - return new WithinPositionIterator(slop, new OrderedConjunctionPositionIterator(iter)); - } - } - @Ignore("not supproted yet") + /* + * Failing ... PhraseQuery scorer needs positions()? 
+ */ + /* public void testPhraseOriginal() throws Exception { insertDocs(analyzer, "This is a test"); PhraseQuery pq = new PhraseQuery(); @@ -183,44 +240,46 @@ pq.add(new Term(F, "test")); String frags[] = doSearch (pq); //searcher.search(new MockSpanQuery(pq, collector.needsPayloads(), F, null), collector); - assertEquals ("This is a test", frags[0]); + assertEquals ("This is a test", frags[0]); } + */ public void testWildcard () throws Exception { insertDocs(analyzer, "This is a test"); - WildcardQuery wildcardQuery = new WildcardQuery(new Term(F, "t*t")); - // TODO enable positions in constant scorer - wildcardQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - String frags[] = doSearch(wildcardQuery); - assertEquals ("This is a test", frags[0]); + String frags[] = doSearch (new WildcardQuery(new Term(F, "t*t"))); + assertEquals ("This is a test", frags[0]); } -// -// @Ignore("file epistolary-novel.xml does not exist") -// public void testLargerDocument() throws Exception { -// InputStream in = new FileInputStream ("epistolary-novel.xml"); -// insertDocs(analyzer, IOUtils.toString(in)); -// in.close(); -// BooleanQuery bq = new BooleanQuery(); -// bq.add(new BooleanClause (new TermQuery(new Term(F, "unknown")), Occur.MUST)); -// bq.add(new BooleanClause (new TermQuery(new Term(F, "artist")), Occur.MUST)); -// String frags[] = doSearch (bq, 50); -// assertEquals ("is a narration by an unknown observer.\n*[[Jean Web", frags[0]); -// assertEquals ("fin and Sabine]]'' by artist [[Nick Bantock]] is a", frags[1]); -// } -// @Ignore("file epistolary-novel.xml does not exist") -// public void testMultipleDocuments() throws Exception { -// InputStream in = new FileInputStream ("epistolary-novel.xml"); -// insertDocs(analyzer, -// "This document has no matches", -// IOUtils.toString(in), -// "This document has an unknown artist match"); -// BooleanQuery bq = new BooleanQuery(); -// bq.add(new BooleanClause (new TermQuery(new Term(F, "unknown")), 
Occur.MUST)); -// bq.add(new BooleanClause (new TermQuery(new Term(F, "artist")), Occur.MUST)); -// String frags[] = doSearch (bq, 50); -// assertEquals ("is a narration by an unknown observer.\n*[[Jean Web", frags[0]); -// assertEquals ("fin and Sabine]]'' by artist [[Nick Bantock]] is a", frags[1]); -// } + public void testMultipleDocumentsAnd() throws Exception { + insertDocs(analyzer, + "This document has no matches", + PORRIDGE_VERSE, + "This document has some Pease porridge in it"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.MUST)); + bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.MUST)); + String frags[] = doSearch (bq, 50, 0); + assertEquals ("Pease porridge hot! Pease porridge cold! Pease", frags[0]); + frags = doSearch (bq, 50, 1); + assertEquals ("This document has some Pease porridge in it", frags[0]); + } + + /* + * Failing: need positions callback API since DisjunctionSumScorer consumes all of a doc's + * positions before passing the doc to the collector. + */ + public void testMultipleDocumentsOr() throws Exception { + insertDocs(analyzer, + "This document has no matches", + PORRIDGE_VERSE, + "This document has some Pease porridge in it"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.SHOULD)); + bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.SHOULD)); + String frags[] = doSearch (bq, 50, 0); + assertEquals ("Pease porridge hot! Pease porridge cold! 
Pease", frags[0]); + frags = doSearch (bq, 50, 1); + assertEquals ("This document has some Pease porridge in it", frags[0]); + } -} \ No newline at end of file +} Index: lucene/src/java/org/apache/lucene/search/BooleanScorer2.java =================================================================== --- lucene/src/java/org/apache/lucene/search/BooleanScorer2.java (revision 1144713) +++ lucene/src/java/org/apache/lucene/search/BooleanScorer2.java (working copy) @@ -276,7 +276,7 @@ */ @Override public void score(Collector collector) throws IOException { - collector.setScorer(this); + collector.setScorer(this); while ((doc = countingSumScorer.nextDoc()) != NO_MORE_DOCS) { collector.collect(doc); } @@ -320,7 +320,10 @@ return doc = countingSumScorer.advance(target); } - + @Override + public void setPositionCollector (Collector c) { + countingSumScorer.setPositionCollector(c); + } @Override public PositionIntervalIterator positions() throws IOException { Index: lucene/src/java/org/apache/lucene/search/Collector.java =================================================================== --- lucene/src/java/org/apache/lucene/search/Collector.java (revision 1144713) +++ lucene/src/java/org/apache/lucene/search/Collector.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; +import org.apache.lucene.search.positions.PositionIntervalIterator; /** *

Expert: Collectors are primarily meant to be used to @@ -177,4 +178,6 @@ public boolean needsPayloads() { return false; } + public void collectPositions(Scorer scorer, PositionIntervalIterator positions) throws IOException {} + } Index: lucene/src/java/org/apache/lucene/search/ConjunctionScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/ConjunctionScorer.java (revision 1144713) +++ lucene/src/java/org/apache/lucene/search/ConjunctionScorer.java (working copy) @@ -32,6 +32,7 @@ private final Scorer[] scorers; private final float coord; private int lastDoc = -1; + private PositionIntervalIterator positions; public ConjunctionScorer(Weight weight, float coord, boolean needsPositions, Collection scorers) throws IOException { this(weight, coord, needsPositions, scorers.toArray(new Scorer[scorers.size()])); @@ -146,13 +147,22 @@ } return sum * coord; } - + + public void setPositionCollector (Collector c) { + for (Scorer scorer : scorers) { + scorer.setPositionCollector(c); + } + } + @Override public PositionIntervalIterator positions() throws IOException { - if (scorersOrdered == null) - throw new IllegalStateException("no positions requested for this scorer"); - // only created if needed for this scorer - no penalty for non-positional queries - return new ConjunctionPositionIterator(this, scorersOrdered); + if (positions == null) { + if (scorersOrdered == null) + throw new IllegalStateException("no positions requested for this scorer"); + // only created if needed for this scorer - no penalty for non-positional queries + positions = new ConjunctionPositionIterator(this, scorersOrdered); + } + return positions; } } Index: lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java (revision 1144713) +++ 
lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.Term; +import org.apache.lucene.search.positions.PositionIntervalIterator; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; @@ -202,6 +203,12 @@ return docIdSetIterator.advance(target); } + public void setPositionCollector (Collector c) { + if (query != null) { + ((Scorer)docIdSetIterator).setPositionCollector(c); + } + } + private Collector wrapCollector(final Collector collector) { return new Collector() { @Override @@ -216,6 +223,11 @@ } @Override + public void collectPositions(Scorer scorer, PositionIntervalIterator positions) throws IOException { + collector.collectPositions(scorer, positions); + } + + @Override public void setNextReader(AtomicReaderContext context) throws IOException { collector.setNextReader(context); } @@ -224,6 +236,16 @@ public boolean acceptsDocsOutOfOrder() { return collector.acceptsDocsOutOfOrder(); } + + @Override + public boolean needsPositions() { + return collector.needsPositions(); + } + + @Override + public boolean needsPayloads() { + return collector.needsPayloads(); + } }; } @@ -246,6 +268,15 @@ return super.score(collector, max, firstDocID); } } + + @Override + public PositionIntervalIterator positions() throws IOException { + if (docIdSetIterator instanceof Scorer) { + return ((Scorer) docIdSetIterator).positions(); + } else { + return super.positions(); + } + } } @Override Index: lucene/src/java/org/apache/lucene/search/DisjunctionSumScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/DisjunctionSumScorer.java (revision 1144713) +++ lucene/src/java/org/apache/lucene/search/DisjunctionSumScorer.java (working copy) @@ -20,6 +20,7 @@ import java.util.List; import java.io.IOException; +import 
org.apache.lucene.search.positions.ConjunctionPositionIterator; import org.apache.lucene.search.positions.DisjunctionPositionIterator; import org.apache.lucene.search.positions.PositionIntervalIterator; import org.apache.lucene.util.ScorerDocQueue; @@ -59,6 +60,8 @@ private float currentScore = Float.NaN; + private PositionIntervalIterator positions = null; + /** Construct a DisjunctionScorer. * @param weight The weight to be used. * @param needsPositions @@ -86,7 +89,7 @@ this.minimumNrMatchers = minimumNrMatchers; this.subScorers = subScorers; - initScorerDocQueue(); + //initScorerDocQueue(); } /** Construct a DisjunctionScorer, using one as the minimum number @@ -118,7 +121,7 @@ collector.collect(currentDoc); } } - + /** Expert: Collects matching documents in a range. Hook for optimization. * Note that {@link #nextDoc()} must be called once before this method is called * for the first time. @@ -141,6 +144,9 @@ @Override public int nextDoc() throws IOException { + if (scorerDocQueue == null) { + initScorerDocQueue(); + } if (scorerDocQueue.size() < minimumNrMatchers || !advanceAfterCurrent()) { currentDoc = NO_MORE_DOCS; } @@ -221,6 +227,9 @@ */ @Override public int advance(int target) throws IOException { + if (scorerDocQueue == null) { + initScorerDocQueue(); + } if (scorerDocQueue.size() < minimumNrMatchers) { return currentDoc = NO_MORE_DOCS; } @@ -238,11 +247,23 @@ } while (true); } + @Override + public void setPositionCollector (Collector c) { + for (Scorer scorer : subScorers) { + scorer.setPositionCollector(c); + } + } + @Override public PositionIntervalIterator positions() throws IOException { - if (minimumNrMatchers > 1) { - throw new IllegalStateException("positions not implemented for minimum matches > 1"); + // created on first use; later calls return the same iterator instance + if (positions == null) { + if (minimumNrMatchers > 1) { + positions = new ConjunctionPositionIterator(this, subScorers.toArray(new Scorer[0]), minimumNrMatchers); + } else { + positions = new DisjunctionPositionIterator(this, subScorers.toArray(new Scorer[0])); + } } - 
return new DisjunctionPositionIterator(this, subScorers.toArray(new Scorer[0])); + return positions; } } Index: lucene/src/java/org/apache/lucene/search/PositionTermScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PositionTermScorer.java (revision 1144713) +++ lucene/src/java/org/apache/lucene/search/PositionTermScorer.java (working copy) @@ -65,6 +65,10 @@ public boolean score(Collector c, int end, int firstDocID) throws IOException { c.setScorer(this); while (doc < end) { // for docs in window + if (positionCollector != null) { + while (positions.next() != null) + ; + } c.collect(doc); // collect score doc = docsEnum.nextDoc(); if (doc != NO_MORE_DOCS) { @@ -95,6 +99,10 @@ @Override public int nextDoc() throws IOException { + if (positionCollector != null) { + while (positions.next() != null) + ; + } doc = docsEnum.nextDoc(); if (doc != NO_MORE_DOCS) { positions.positionsPending = freq = docsEnum.freq(); @@ -120,6 +128,10 @@ */ @Override public int advance(int target) throws IOException { + if (positionCollector != null) { + while (positions.next() != null) + ; + } doc = docsEnum.advance(target); if (doc != NO_MORE_DOCS) { positions.positionsPending = freq = docsEnum.freq(); @@ -144,19 +156,23 @@ private final PositionInterval interval; int positionsPending; private final DocsAndPositionsEnum docsAndPos; + private final SinglePositionIterator singlePositionIterator; public TermPositions(DocsAndPositionsEnum docsAndPos, boolean doPayloads) { super(PositionTermScorer.this); this.docsAndPos = docsAndPos; this.interval = doPayloads ? 
new PayloadPosInterval(docsAndPos, this) : new PositionInterval(); - + singlePositionIterator = new SinglePositionIterator(); } @Override public PositionInterval next() throws IOException { if (--positionsPending >= 0) { interval.begin = interval.end = docsAndPos.nextPosition(); + if (positionCollector != null) { + positionCollector.collectPositions(scorer, singlePositionIterator.reset()); + } return interval; } interval.reset(); @@ -173,6 +189,39 @@ public PositionIntervalIterator[] subs(boolean inOrder) { return EMPTY; } + + @Override + public PositionIntervalIterator getTermPositions(boolean inOrder) { + return singlePositionIterator.reset(); + } + + public final class SinglePositionIterator extends PositionIntervalIterator { + private boolean hasNext; + public SinglePositionIterator() { + super(PositionTermScorer.this); + hasNext = true; + } + + public SinglePositionIterator reset() { + hasNext = true; + return this; + } + + @Override + public PositionInterval next() throws IOException { + if (hasNext) { + hasNext = false; + return TermPositions.this.interval; + } + return null; + } + + @Override + public PositionIntervalIterator[] subs(boolean inOrder) { + return EMPTY; + } + + } } private static final class PayloadPosInterval extends PositionInterval { Index: lucene/src/java/org/apache/lucene/search/Scorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/Scorer.java (revision 1144713) +++ lucene/src/java/org/apache/lucene/search/Scorer.java (working copy) @@ -42,6 +42,8 @@ */ public abstract class Scorer extends DocIdSetIterator { protected final Weight weight; + + protected Collector positionCollector; /** * Constructs a Scorer @@ -107,6 +109,10 @@ // TODO make abstract? throw new UnsupportedOperationException(); } + + public void setPositionCollector (Collector c) { + positionCollector = c; + } /** * A callback to gather information from a scorer and its sub-scorers. 
Each Index: lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java (revision 1144713) +++ lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java (working copy) @@ -22,7 +22,7 @@ /** * ConjuctionPositionIterator based on minimal interval semantics for AND - * operator + * operator. * *