Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PosCollector.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PosCollector.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PosCollector.java (revision 0)
@@ -0,0 +1,99 @@
+package org.apache.lucene.search.spanhighlight;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Scorer.ScorerVisitor;
+
+/**
+ * For testing only: collects the first maxDocs hits, pairing each with its score
+ * and the scorer's position iterator, and throws the rest away.
+ */
+public class PosCollector extends Collector {
+
+  /** Number of hits stored in docs so far. */
+  int count;
+  /** Collected hits; only the first count slots are filled. */
+  ScorePosDoc docs[];
+
+  /**
+   * @param maxDocs maximum number of hits to keep
+   */
+  public PosCollector (int maxDocs) {
+    docs = new ScorePosDoc[maxDocs];
+  }
+
+  /** Scorer most recently supplied through setScorer. */
+  protected Scorer scorer;
+
+  /**
+   * Stores the hit with its score and position iterator; ignored once maxDocs
+   * hits have been stored.
+   */
+  @Override
+  public void collect(int doc) throws IOException {
+    if (count >= docs.length) {
+      return;
+    }
+    assert (scorer != null);
+    // Use this to retrieve the field for each span from its Query?
+    //
+    docs[count++] = new ScorePosDoc (doc, scorer.score(), scorer.positions());
+//    if (doc == 6) {
+//      // print out the query:
+//      scorer.visitScorers(new SpanScorerVisitor());
+//    }
+  }
+
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    // tickle the searcher so we get Boolean2Scorer instead of BooleanScorer
+    // because BooleanScorer doesn't have positions() yet.
+    return false;
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) {
+    this.scorer = scorer;
+  }
+
+  /** @return the scorer most recently passed to setScorer */
+  public Scorer getScorer () {
+    return scorer;
+  }
+
+  /** @return a copy of the stored hits, sized to the number actually collected */
+  public ScorePosDoc[] getDocs () {
+    ScorePosDoc ret[] = new ScorePosDoc[count];
+    System.arraycopy(docs, 0, ret, 0, count);
+    return ret;
+  }
+
+  /** No per-reader state is kept; doc ids are stored exactly as passed to collect. */
+  @Override
+  public void setNextReader(AtomicReaderContext context) throws IOException {
+  }
+
+  @Override
+  public boolean needsPositions() { return true; }
+
+  /**
+   * For testing/investigation: prints each parent/child query pair it visits.
+   * @author sokolov
+   */
+  class SpanScorerVisitor extends ScorerVisitor {
+
+    @Override
+    public void visitRequired (Query parent, Query child, Scorer scorer) {
+      System.out.println ("parent=" + parent + ", child=" + child);
+    }
+
+    @Override
+    public void visitOptional (Query parent, Query child, Scorer scorer) {
+      System.out.println ("parent=" + parent + ", child=" + child);
+    }
+  }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PosHighlighter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PosHighlighter.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PosHighlighter.java (revision 0)
@@ -0,0 +1,165 @@
+package org.apache.lucene.search.spanhighlight;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermVectorMapper;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.search.highlight.DefaultEncoder;
+import org.apache.lucene.search.highlight.Encoder;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.TextFragment;
+import org.apache.lucene.search.positions.PositionIntervalIterator;
+import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+
+/** Builds highlighted text fragments from the match positions collected for a hit. */
+public class PosHighlighter {
+  private Formatter formatter;
+  private Encoder encoder;
+
+  public PosHighlighter() {
+    this(new SimpleHTMLFormatter());
+  }
+
+  public PosHighlighter(Formatter formatter) {
+    this(formatter,new DefaultEncoder());
+  }
+
+  public PosHighlighter(Formatter formatter, Encoder encoder) {
+    this.formatter = formatter;
+    this.encoder = encoder;
+  }
+
+  /**
+   * Builds up to maxNumFragments highlighted snippets for one collected hit.
+   * @param doc the hit whose collected positions are highlighted
+   * @param reader supplies the field's term vector (for offsets) and stored text
+   * @param fieldName the field to highlight
+   * @param mergeContiguousFragments not used by this implementation
+   * @param maxNumFragments the maximum number of fragments to return
+   * @param fragSize the requested size of fragments, in characters.  Fragments may
+   * be smaller if there is insufficient text.  There is an attempt to put the first
+   * match in the center of the fragment.
+   * @return an array of length maxNumFragments; slots past the last fragment are null
+   * @throws IOException if reading the term vector, stored field or positions fails
+   */
+  public String[] getFirstFragments(
+      ScorePosDoc doc,
+      IndexReader reader,
+      String fieldName,
+      boolean mergeContiguousFragments,
+      int maxNumFragments,
+      int fragSize) throws IOException
+  {
+    PositionOffsetMapper pom = new PositionOffsetMapper ();
+    // TODO coalesce adjacent positions - for now just see if we can
+    // retrieve the start positions
+    // FIXME: test error cases: for non-stored fields, and fields w/no term vectors
+    reader.getTermFreqVector(doc.doc, fieldName, pom);
+    String text = reader.document(doc.doc).getFieldable(fieldName).stringValue();
+    PositionTreeIterator positions = new PositionTreeIterator (doc.positions);
+    PositionInterval pos = positions.next();
+    assert (pos != null); // no positions matched?
+    String[] frags = new String[maxNumFragments];
+    StringBuilder buf = new StringBuilder();
+    int ifrag = 0;
+    while (ifrag < maxNumFragments && pos != null) {
+      int matchStart = pom.getStartOffset(pos.begin);
+      int matchEnd = pom.getEndOffset(pos.end);
+      int fragStart = Math.max(0, matchStart - (fragSize - (matchEnd-matchStart)) / 2);
+      int fragEnd = Math.min(fragStart+fragSize, text.length());
+
+      for (;;) {
+        if (matchStart > fragStart)
+          buf.append (text, fragStart, matchStart);
+        buf.append (""); // TODO - parameterize; NOTE(review): marker is empty here - confirm
+        buf.append (text, matchStart, matchEnd);
+        buf.append ("");
+        if (fragEnd <= matchEnd) {
+          pos = positions.next(); // consume this match, or later fragments repeat it
+          break;
+        }
+        pos = positions.next();
+        if (pos != null)
+          matchStart = pom.getStartOffset(pos.begin);
+        if (pos == null || matchStart >= fragEnd) {
+          // Either there is no match or the next match position comes after the end of this fragment
+          // In either case, grab some more text to fill out the fragment
+          buf.append(text, matchEnd, fragEnd);
+          break;
+        }
+        // include the next match in this fragment
+        fragStart = matchEnd;
+        matchEnd = pom.getEndOffset(pos.end);
+      }
+      // emit a completed fragment
+      frags[ifrag++] = buf.toString();
+      buf = new StringBuilder();
+    }
+    return frags;
+  }
+
+  /**
+   * Not implemented yet: always returns null.
+   * @param mergeContiguousFragments currently unused
+   * @param maxNumFragments number of fragments to retrieve
+   * @return intended: the first maxNumFragments TextFragments, in document order
+   * (sorted by their starting, then ending, span position); currently null
+   */
+  public TextFragment[] getBestFragments(
+      ScorePosDoc doc,
+      IndexReader reader,
+      boolean mergeContiguousFragments,
+      int maxNumFragments)
+  {
+    // TODO - get maxNumFragments top fragments by score
+    return null;
+  }
+
+  /** Term vector callback recording each position's start and end character offsets. */
+  class PositionOffsetMapper extends TermVectorMapper {
+    private int maxPos = 0;
+    private static final int BUF_SIZE = 128;
+    int startOffset[] = new int[BUF_SIZE], endOffset[] = new int[BUF_SIZE];
+
+    public void setExpectations(String field, int numTerms,
+        boolean storeOffsets, boolean storePositions) {
+    }
+
+    public void map(BytesRef term, int frequency,
+        TermVectorOffsetInfo[] offsets, int[] positions)
+    {
+      for (int i = 0; i < positions.length; i++) {
+        int pos = positions[i];
+        if (pos >= startOffset.length) {
+          grow (pos + BUF_SIZE);
+          maxPos = pos;
+        } else if (pos > maxPos) {
+          maxPos = pos;
+        }
+        startOffset[pos] = offsets[i].getStartOffset();
+        endOffset[pos] = offsets[i].getEndOffset();
+      }
+    }
+
+    private void grow (int size) {
+      startOffset = ArrayUtil.grow (startOffset, size);
+      endOffset = ArrayUtil.grow (endOffset, size);
+    }
+
+    public int getStartOffset(int pos) {
+      return startOffset[pos];
+    }
+
+    public int getEndOffset(int pos) {
+      return endOffset[pos];
+    }
+
+    public int getMaxPosition() {
+      return maxPos;
+    }
+  }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PositionTreeIterator.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PositionTreeIterator.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/PositionTreeIterator.java (revision 0)
@@ -0,0 +1,65 @@
+package org.apache.lucene.search.spanhighlight;
+
+import java.io.IOException;
+
+import org.apache.lucene.search.positions.PositionIntervalIterator;
+import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval;
+
+/**
+ * Retrieves the positions from the leaves of a tree of PositionIntervalIterators,
+ * exhausting each leaf in turn (depth-first, in sub-iterator order).
+ */
+public class PositionTreeIterator {
+
+  /** One level of the traversal: an iterator plus a cursor over its sub-iterators. */
+  static class Frame {
+    Frame (PositionIntervalIterator positions) {
+      this.positions = positions;
+      subs = positions.subs(true);
+      // a node without sub-iterators is a leaf: subs == null, isub == -1
+      if (subs.length == 0)
+        subs = null;
+      isub = (subs != null) ? 0 : -1;
+    }
+    PositionIntervalIterator positions;
+    PositionIntervalIterator subs[];
+    int isub;
+  };
+
+  /** Traversal stack; stack[curframe] is the node currently being visited. */
+  Frame stack[] = new Frame[32];
+  int curframe = 0;
+
+  public PositionTreeIterator (PositionIntervalIterator root) {
+    stack[0] = new Frame(root);
+  }
+
+  /**
+   * @return the next interval produced by a leaf iterator, or null once every
+   * leaf has been exhausted
+   * @throws IOException if an underlying iterator fails
+   * @throws ArrayIndexOutOfBoundsException if the tree is deeper than the stack
+   */
+  public PositionInterval next() throws IOException {
+    PositionInterval pos;
+    if (curframe < 0)
+      return null;
+    Frame f = stack[curframe];
+    if (f.subs == null) {
+      pos = stack[curframe].positions.next();
+      if (pos != null)
+        return pos;
+    }
+    else if (f.isub < f.subs.length) {
+      // pushing writes stack[curframe + 1], so that is the index to bounds-check
+      if (curframe + 1 >= stack.length) {
+        throw new ArrayIndexOutOfBoundsException ("PositionTreeIterator stack depth > 32");
+      }
+      stack[++curframe] = new Frame (f.subs[f.isub++]);
+      return next();
+    }
+    // pop
+    --curframe;
+    return next();
+  }
+ }
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/ScorePosDoc.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/ScorePosDoc.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/spanhighlight/ScorePosDoc.java (revision 0)
@@ -0,0 +1,16 @@
+package org.apache.lucene.search.spanhighlight;
+
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.positions.PositionIntervalIterator;
+
+/** Used to accumulate span positions while scoring */
+public class ScorePosDoc extends ScoreDoc {
+
+  public PositionIntervalIterator positions;
+
+  public ScorePosDoc(int doc, float score, PositionIntervalIterator positions) {
+    super(doc, score);
+    assert doc == positions.docID();
+    this.positions = positions;
+  }
+}
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/spanhighlight/PosHighlighterTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/spanhighlight/PosHighlighterTest.java (revision 0)
+++ 
lucene/contrib/highlighter/src/test/org/apache/lucene/search/spanhighlight/PosHighlighterTest.java (revision 0)
@@ -0,0 +1,196 @@
+package org.apache.lucene.search.spanhighlight;
+
+import java.io.FileInputStream;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.codecs.CoreCodecProvider;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.spans.MockSpanQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.SimpleFSDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+/**
+ * Notes: to fully implement, we need:
+ * 1) ability to walk the individual terms that matched, possibly in a hierarchical way
+ * if we want to implement really clever highlighting?
+ * 2) some Collector api like the one I made up, and support in Searcher
+ * 3) All (or more) queries implemented
+ *
+ * For hl perf testing we could test term queries only using the current impl
+ * TODO: return smaller snippets (currently hl the whole doc)
+ * @author sokolov
+ *
+ */
+public class PosHighlighterTest extends LuceneTestCase {
+
+  protected final static String F="f";
+  protected Analyzer analyzer;
+  protected QueryParser parser;
+  protected Directory dir;
+  protected IndexSearcher searcher;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
+    parser = new QueryParser(TEST_VERSION_CURRENT, F, analyzer );
+    dir = new SimpleFSDirectory(TEMP_DIR);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    if( searcher != null ){
+      searcher.close();
+      searcher = null;
+    }
+    dir.close();
+    super.tearDown();
+  }
+
+  /**
+   * Rebuilds the index in dir with a single document holding one value of
+   * field F per argument, then reopens searcher on it.
+   */
+  protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
+    IndexWriterConfig config = new IndexWriterConfig(
+        TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE);
+    config.setCodecProvider(new CoreCodecProvider());
+    config.getCodecProvider().setDefaultFieldCodec("Standard");
+    IndexWriter writer = new IndexWriter(dir, config);
+    if (!writer.getConfig().getCodecProvider().getFieldCodec(F).equals("Standard")) {
+      System.out.println ("codec=" + writer.getConfig().getCodecProvider().getFieldCodec(F));
+      writer.getConfig().getCodecProvider().setFieldCodec(F, "Standard");
+    }
+
+    Document doc = new Document();
+    for( String value: values ) {
+      Field f = new Field (F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
+      doc.add (f);
+    }
+    writer.addDocument( doc );
+    writer.close();
+    if (searcher != null) searcher.close();
+    searcher = new IndexSearcher( dir, true );
+  }
+
+  public void testTerm () throws Exception {
+    make1dmfIndex(analyzer, "This is a test");
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    searcher.search(new TermQuery(new Term(F, "test")), collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, 100);
+    assertEquals ("This is a test", frags[0]);
+  }
+
+  public void testSeveralSnippets () throws Exception {
+    String input = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " +
+    "Let us see what happens to long in this case.";
+    String gold = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " +
+    "Let us see what happens to long in this case.";
+    make1dmfIndex(analyzer, input);
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    searcher.search(new TermQuery(new Term(F, "long")), collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, input.length());
+    assertEquals (gold, frags[0]);
+  }
+
+  public void testBooleanAnd () throws Exception {
+    make1dmfIndex(analyzer, "This is a test");
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "This")), Occur.MUST));
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "test")), Occur.MUST));
+    searcher.search(bq, collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, 50);
+    assertEquals ("This is a test", frags[0]);
+  }
+
+  public void testBooleanAndOtherOrder () throws Exception {
+    make1dmfIndex(analyzer, "This is a test");
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "test")), Occur.MUST));
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "This")), Occur.MUST));
+    searcher.search(bq, collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, 50);
+    // OK - this is not quite right...
+    assertEquals ("This is a test", frags[0]);
+  }
+
+  public void testBooleanOr () throws Exception {
+    // OR queries not implemented yet...
+    make1dmfIndex(analyzer, "This is a test");
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "test")), Occur.SHOULD));
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "This")), Occur.SHOULD));
+    searcher.search(bq, collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, 50);
+    assertEquals ("This is a test", frags[0]);
+  }
+
+  public void testPhrase() throws Exception {
+    make1dmfIndex(analyzer, "This is a test");
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term(F, "a"));
+    pq.add(new Term(F, "test"));
+    searcher.search(new MockSpanQuery(pq, collector.needsPayloads(), F, null), collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, 50);
+    assertEquals ("This is a test", frags[0]);
+  }
+
+  public void testWildcard () throws Exception {
+    make1dmfIndex(analyzer, "This is a test");
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    searcher.search(new MockSpanQuery(new WildcardQuery(new Term(F, "t*t")), collector.needsPayloads(), F, null), collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, 50);
+    assertEquals ("This is a test", frags[0]);
+  }
+
+  public void testLargerDocument() throws Exception {
+    // reads epistolary-novel.xml relative to the current working directory
+    InputStream in = new FileInputStream ("epistolary-novel.xml");
+    try {
+      make1dmfIndex(analyzer, IOUtils.toString(in));
+    } finally {
+      in.close();
+    }
+    PosHighlighter ph = new PosHighlighter();
+    PosCollector collector = new PosCollector (10);
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "unknown")), Occur.MUST));
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "artist")), Occur.MUST));
+    searcher.search(bq, collector);
+    String frags[] = ph.getFirstFragments(collector.docs[0], searcher.getIndexReader(), F, true, 10, 50);
+    assertEquals ("is a narration by an unknown observer.\n*[[Jean Web", frags[0]);
+    assertEquals ("fin and Sabine]]'' by artist [[Nick Bantock]] is a", frags[1]);
+  }
+
+}
Index: lucene/src/java/org/apache/lucene/search/IndexSearcher.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/IndexSearcher.java (revision 1140372)
+++ lucene/src/java/org/apache/lucene/search/IndexSearcher.java (working copy)
@@ -537,7 +537,8 @@
 
     // TODO: should we make this
     // threaded...? the Collector could be sync'd?
-    ScorerContext scorerContext = ScorerContext.def().scoreDocsInOrder(true).topScorer(true);
+    ScorerContext scorerContext = ScorerContext.def().scoreDocsInOrder(true).topScorer(true)
+      .needsPositions(collector.needsPositions()).needsPayloads(collector.needsPayloads());
     // always use single thread:
     if (filter == null) {
       for (int i = 0; i < leaves.length; i++) { // search each subreader