diff --git a/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java b/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java index b2d4afe..dc2ea00 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java @@ -17,8 +17,10 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.Term; + import java.io.IOException; -import org.apache.lucene.index.*; /** * Position of a term in a document that takes into account the term offset within the phrase. @@ -84,6 +86,10 @@ final class PhrasePositions { if (rptGroup >=0 ) { s += " rpt:"+rptGroup+",i"+rptInd; } + s += " t: [" + terms[0]; + for (int i = 1; i < terms.length; i++) + s += "," + terms[i]; + s += "]"; return s; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 1f0c1f4..db8d382 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -124,7 +124,7 @@ public class PhraseQuery extends Query { static class PostingsAndFreq implements Comparable { final TermQuery.TermDocsEnumFactory factory; - final DocsAndPositionsEnum postings; + DocsAndPositionsEnum postings; final int docFreq; final int position; final Term[] terms; @@ -132,7 +132,8 @@ public class PhraseQuery extends Query { public PostingsAndFreq(DocsAndPositionsEnum postings, TermQuery.TermDocsEnumFactory factory, int docFreq, int position, Term... terms) throws IOException { this.factory = factory; - this.postings = postings; + //this.postings = postings; + this.postings = factory.docsAndPositionsEnum(false); this.docFreq = docFreq; this.position = position; nTerms = terms==null ? 
0 : terms.length; @@ -150,6 +151,10 @@ public class PhraseQuery extends Query { } } + public void reset(boolean needsOffsets) throws IOException { + this.postings = factory.docsAndPositionsEnum(needsOffsets); + } + public int compareTo(PostingsAndFreq other) { if (docFreq != other.docFreq) { return docFreq - other.docFreq; diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java index 19644b3..8389262 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java @@ -17,10 +17,10 @@ package org.apache.lucene.search; * limitations under the License. */ -import java.io.IOException; - import org.apache.lucene.search.similarities.Similarity; +import java.io.IOException; + /** Expert: Scoring functionality for phrase queries. *
A document is considered matching if it contains the phrase-query terms * at "valid" positions. What "valid positions" are @@ -37,22 +37,29 @@ abstract class PhraseScorer extends Scorer { private float freq; //phrase frequency in current doc as computed by phraseFreq(). final Similarity.SloppySimScorer docScorer; + private PhraseQuery.PostingsAndFreq[] postings; PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity.SloppySimScorer docScorer) { + Similarity.SloppySimScorer docScorer) throws IOException { super(weight); this.docScorer = docScorer; + this.postings = postings; + reset(false); + } + void reset(boolean needsOffsets) throws IOException { // convert tps to a list of phrase positions. // note: phrase-position differs from term-position in that its position // reflects the phrase offset: pp.pos = tp.pos - offset. // this allows to easily identify a matching (exact) phrase // when all PhrasePositions have exactly the same position. if (postings.length > 0) { + postings[0].reset(needsOffsets); min = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms); max = min; max.doc = -1; for (int i = 1; i < postings.length; i++) { + postings[i].reset(needsOffsets); PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms); max.next = pp; max = pp; diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index c02007b..4a8e92e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -37,10 +37,11 @@ final class SloppyPhraseScorer extends PhraseScorer { private boolean checkedRpts; // flag to only check for repetitions in first candidate doc private boolean hasMultiTermRpts; // private PhrasePositions[][] rptGroups; // in each group are PPs that repeats each other 
(i.e. same term), sorted by (query) offset - private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps + private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps + PhrasePositionsIterator ppIt = new PhrasePositionsIterator(); SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - int slop, Similarity.SloppySimScorer docScorer) { + int slop, Similarity.SloppySimScorer docScorer) throws IOException { super(weight, postings, docScorer); this.slop = slop; this.numPostings = postings==null ? 0 : postings.length; @@ -71,32 +72,56 @@ final class SloppyPhraseScorer extends PhraseScorer { return 0.0f; } float freq = 0.0f; - PhrasePositions pp = pq.pop(); - int matchLength = end - pp.position; - int next = pq.top().position; - while (advancePP(pp)) { - if (hasRpts && !advanceRpts(pp)) { - break; // pps exhausted - } - if (pp.position > next) { // done minimizing current match-length - if (matchLength <= slop) { - freq += docScorer.computeSlopFactor(matchLength); // score match - } - pq.add(pp); - pp = pq.pop(); - next = pq.top().position; - matchLength = end - pp.position; - } else { - int matchLength2 = end - pp.position; - if (matchLength2 < matchLength) { - matchLength = matchLength2; + + while (ppIt.next()) { + freq += docScorer.computeSlopFactor(ppIt.matchlength); + } + + return freq; + } + + private class PhrasePositionsIterator { + + public int matchlength; + public int headPosition = -1; + public int headStartOffset = -1; + public int headEndOffset = -1; + private PhrasePositions currentHead; + + public boolean next() throws IOException { + if (pq.size() < numPostings) + return false; + currentHead = pq.pop(); + matchlength = end - currentHead.position; + headPosition = currentHead.position + currentHead.offset; + headStartOffset = currentHead.postings.startOffset(); + headEndOffset = currentHead.postings.endOffset(); + int next = pq.top().position; + while 
(advancePP(currentHead)) { + if (hasRpts && !advanceRpts(currentHead)) + break; + if (currentHead.position > next) { // done minimizing current match-length + pq.add(currentHead); + if (matchlength <= slop) { + return true; + } + // not a match - keep going + currentHead = pq.pop(); + next = pq.top().position; + matchlength = end - currentHead.position; + } + else { + int newmatchlength = end - currentHead.position; + if (newmatchlength < matchlength) + matchlength = newmatchlength; } + headPosition = currentHead.position + currentHead.offset; + headStartOffset = currentHead.postings.startOffset(); + headEndOffset = currentHead.postings.endOffset(); } + return matchlength <= slop; } - if (matchLength <= slop) { - freq += docScorer.computeSlopFactor(matchLength); // score match - } - return freq; + } /** advance a PhrasePosition and update 'end', return false if exhausted */ @@ -481,8 +506,80 @@ final class SloppyPhraseScorer extends PhraseScorer { @Override public PositionIntervalIterator positions(boolean needsPayloads, boolean needsOffsets) throws IOException { - // nocommit implement this (and get a beer before you do so!) - throw new UnsupportedOperationException(); + // nocommit - payloads? 
+ reset(needsOffsets); + return new SloppyPhrasePositionIntervalIterator(this); + } + + private class SloppyPhrasePositionIntervalIterator extends PositionIntervalIterator { + + private PositionInterval interval = new PositionInterval(Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); + private PositionInterval[] subintervals; + + public SloppyPhrasePositionIntervalIterator(Scorer scorer) { + super(scorer); + subintervals = new PositionInterval[numPostings]; + for (int i = 0; i < numPostings; i++) { + subintervals[i] = new PositionInterval(Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); + } + } + + @Override + public int advanceTo(int docId) throws IOException { + int currentdoc = -1; + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { + currentdoc = pp.postings.advance(docId); + } + initPhrasePositions(); + return currentdoc; + } + + @Override + public PositionInterval next() throws IOException { + if (!ppIt.next()) + return null; + fillInterval(interval, ppIt, max); + fillInterval(subintervals[0], ppIt); + int i = 1; + for (PhrasePositions pp=min.next,prev=null; prev != max; pp=(prev=pp).next, i++) { + fillInterval(subintervals[i], pp); + } + return interval; + } + + @Override + public void collect() { + collector.collectComposite(scorer, interval, currentDoc); + int i = 0; + for (PhrasePositions pp=min,prev=null; prev != max; pp=(prev=pp).next, i++) { + collector.collectLeafPosition(scorer, subintervals[i], currentDoc); + } + } + + @Override + public PositionIntervalIterator[] subs(boolean inOrder) { + return EMPTY; + } + + private void fillInterval(PositionInterval i, PhrasePositionsIterator it) { + i.begin = it.headPosition; + i.end = it.headPosition; + i.offsetBegin = it.headStartOffset; + i.offsetEnd = it.headEndOffset; + } + + private void fillInterval(PositionInterval i, PhrasePositionsIterator it, PhrasePositions end) throws IOException { + i.begin = it.headPosition; + i.end = end.position + end.offset; + i.offsetBegin = 
it.headStartOffset; + i.offsetEnd = end.postings.endOffset(); + } + + private void fillInterval(PositionInterval i, PhrasePositions pp) throws IOException { + i.end = i.begin = pp.offset + pp.position; + i.offsetBegin = pp.postings.startOffset(); + i.offsetEnd = pp.postings.endOffset(); + } } // private void printQueue(PrintStream ps, PhrasePositions ext, String title) { diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index ebc4458..0b5f4b0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -17,25 +17,14 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.index.*; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.ExactSimScorer; +import org.apache.lucene.util.*; + import java.io.IOException; import java.util.Set; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.IndexReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.similarities.Similarity.ExactSimScorer; -import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.ReaderUtil; -import org.apache.lucene.util.TermContext; -import org.apache.lucene.util.ToStringUtils; - /** * A Query that matches documents containing a term. This may be combined with * other terms with a {@link BooleanQuery}. 
@@ -252,7 +241,7 @@ public class TermQuery extends Query { private final Bits liveDocs; private final DocsEnum docs; private final DocsEnum docsAndFreqs; - private final TermState state; + //private final TermState state; private BytesRef term; TermDocsEnumFactory(TermsEnum termsEnum, DocsEnum docs, DocsEnum docsAndFreqs, Bits liveDocs) { @@ -266,7 +255,7 @@ public class TermQuery extends Query { this.liveDocs = liveDocs; this.docs = docs; this.docsAndFreqs = docsAndFreqs; - this.state = state; + //this.state = state; this.term = term; } @@ -276,9 +265,9 @@ public class TermQuery extends Query { public DocsAndPositionsEnum docsAndPositionsEnum(boolean offsets) throws IOException { - if (state != null) { + if (term != null) { assert term != null; - termsEnum.seekExact(term, state); + termsEnum.seekExact(term, false); } return termsEnum.docsAndPositions(liveDocs, null, offsets); } diff --git a/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java b/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java index 7d7545b..8f1dbae 100644 --- a/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java +++ b/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java @@ -16,8 +16,6 @@ package org.apache.lucene.search.positions; * limitations under the License. 
*/ -import java.io.IOException; - import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; @@ -27,24 +25,14 @@ import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexReaderContext; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Weight; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; +import java.io.IOException; + public class TestPositionOffsets extends LuceneTestCase { // What am I testing here? 
@@ -92,15 +80,18 @@ public class TestPositionOffsets extends LuceneTestCase { writer.addDocument(doc); } - public void testTermQueryWithOffsets() throws IOException { + private void testQuery(Query query, int[][] expectedOffsets) throws IOException { + testQuery(query, expectedOffsets, true); + } + + private void testQuery(Query query, int[][] expectedOffsets, boolean needsOffsets) throws IOException { Directory directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc); - addDocs(writer, true); + addDocs(writer, needsOffsets); IndexReader reader = writer.getReader(); IndexSearcher searcher = new IndexSearcher(reader); writer.close(); - Query query = new TermQuery(new Term("field", "porridge")); Weight weight = query.createWeight(searcher); IndexReaderContext topReaderContext = searcher.getTopReaderContext(); @@ -111,9 +102,10 @@ public class TestPositionOffsets extends LuceneTestCase { int nextDoc = scorer.nextDoc(); assertEquals(0, nextDoc); - PositionIntervalIterator positions = scorer.positions(false, true); - int[] startOffsets = new int[] { 6, 26, 47, 164, 184 }; - int[] endOffsets = new int[] { 14, 34, 55, 172, 192 }; + PositionIntervalIterator positions = scorer.positions(false, needsOffsets); + + int startOffsets[] = expectedOffsets[0]; + int endOffsets[] = expectedOffsets[1]; assertEquals(0, positions.advanceTo(nextDoc)); for (int i = 0; i < startOffsets.length; i++) { @@ -128,79 +120,58 @@ public class TestPositionOffsets extends LuceneTestCase { directory.close(); } - public void testTermQueryWithoutOffsets() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc); - addDocs(writer, false); - - IndexReader reader = writer.getReader(); - IndexSearcher searcher = new IndexSearcher(reader); - writer.close(); + public void testTermQueryWithOffsets() throws IOException { Query query = new TermQuery(new Term("field", "porridge")); + 
int[] startOffsets = new int[] { 6, 26, 47, 164, 184 }; + int[] endOffsets = new int[] { 14, 34, 55, 172, 192 }; + testQuery(query, new int[][] { startOffsets, endOffsets }); + } - Weight weight = query.createWeight(searcher); - IndexReaderContext topReaderContext = searcher.getTopReaderContext(); - AtomicReaderContext[] leaves = topReaderContext.leaves(); - assertEquals(1, leaves.length); - Scorer scorer = weight.scorer(leaves[0], - true, true, leaves[0].reader().getLiveDocs()); - - int nextDoc = scorer.nextDoc(); - assertEquals(0, nextDoc); - PositionIntervalIterator positions = scorer.positions(false, false); + public void testTermQueryWithoutOffsets() throws IOException { + Query query = new TermQuery(new Term("field", "porridge")); int[] startOffsets = new int[] { -1, -1, -1, -1, -1 }; int[] endOffsets = new int[] { -1, -1, -1, -1, -1 }; - - assertEquals(0, positions.advanceTo(nextDoc)); - for (int i = 0; i < startOffsets.length; i++) { - PositionIntervalIterator.PositionInterval interval = positions.next(); - assertEquals(startOffsets[i], interval.offsetBegin); - assertEquals(endOffsets[i], interval.offsetEnd); - } - - assertNull(positions.next()); - - reader.close(); - directory.close(); + testQuery(query, new int[][] { startOffsets, endOffsets }, false); } public void testBooleanQueryWithOffsets() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc); - addDocs(writer, true); - - IndexReader reader = writer.getReader(); - IndexSearcher searcher = new IndexSearcher(reader); - writer.close(); BooleanQuery query = new BooleanQuery(); query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), BooleanClause.Occur.MUST)); query.add(new BooleanClause(new TermQuery(new Term("field", "nine")), BooleanClause.Occur.MUST)); - - Weight weight = query.createWeight(searcher); - IndexReaderContext topReaderContext = searcher.getTopReaderContext(); - AtomicReaderContext[] 
leaves = topReaderContext.leaves(); - assertEquals(1, leaves.length); - Scorer scorer = weight.scorer(leaves[0], - true, true, leaves[0].reader().getLiveDocs()); - - int nextDoc = scorer.nextDoc(); - assertEquals(0, nextDoc); - PositionIntervalIterator positions = scorer.positions(false, true); int[] startOffsetsConj = new int[] { 6, 26, 47, 67, 143}; int[] endOffsetsConj = new int[] { 71, 71, 71, 172, 172}; - assertEquals(0, positions.advanceTo(nextDoc)); - PositionIntervalIterator.PositionInterval interval; - int i = 0; - while((interval = positions.next()) != null) { - assertEquals(startOffsetsConj[i], interval.offsetBegin); - assertEquals(endOffsetsConj[i], interval.offsetEnd); - i++; - } - assertEquals(i, startOffsetsConj.length); - assertNull(positions.next()); + testQuery(query, new int[][] { startOffsetsConj, endOffsetsConj }); + } - reader.close(); - directory.close(); + public void testExactPhraseQuery() throws IOException { + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "pease")); + query.add(new Term("field", "porridge")); + query.add(new Term("field", "hot!")); + int[] startOffsetsBlock = new int[] { 0, 158 }; + int[] endOffsetsBlock = new int[] { 19, 177 }; + testQuery(query, new int[][] { startOffsetsBlock, endOffsetsBlock }); + } + + public void testSloppyPhraseQuery() throws IOException { + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "pease")); + query.add(new Term("field", "hot!")); + query.setSlop(1); + int[] startOffsetsBlock = new int[] { 0, 158 }; + int[] endOffsetsBlock = new int[] { 19, 177 }; + testQuery(query, new int[][] { startOffsetsBlock, endOffsetsBlock }); + } + + public void testManyTermSloppyPhraseQuery() throws IOException { + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "pease")); + query.add(new Term("field", "porridge")); + query.add(new Term("field", "pot")); + query.setSlop(2); + int[] startOffsetsBlock = new int[] { 41 }; + int[] endOffsetsBlock = new 
int[] { 66 }; + testQuery(query, new int[][] { startOffsetsBlock, endOffsetsBlock }); } } \ No newline at end of file diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java index 434b0af..e22aac1 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java @@ -368,6 +368,17 @@ public class PosHighlighterTest extends LuceneTestCase { frags[0]); close(); } + + public void testSloppyPhraseQuery() throws Exception { + insertDocs(analyzer, "a b c d a b c d e f"); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term(F, "c")); + pq.add(new Term(F, "a")); + pq.setSlop(2); + String frags[] = doSearch(pq, 50); + assertEquals("a b c d a b c d e f", frags[0]); + close(); + } public static class BlockPositionIteratorFilter implements PositionIntervalFilter {