Index: src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java (revision 1166541)
+++ src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java (working copy)
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
@@ -27,6 +29,7 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.store.Directory;
public class TestSloppyPhraseQuery extends LuceneTestCase {
@@ -141,7 +144,7 @@
IndexSearcher searcher = newSearcher(reader);
TopDocs td = searcher.search(query,null,10);
- //System.out.println("slop: "+slop+" query: "+query+" doc: "+doc+" Expecting number of hits: "+expectedNumResults+" maxScore="+td.getMaxScore());
+ //System.err.println("slop: "+slop+" query: "+query+" doc: "+doc+" Expecting number of hits: "+expectedNumResults+" maxScore="+td.getMaxScore());
assertEquals("slop: "+slop+" query: "+query+" doc: "+doc+" Wrong number of hits", expectedNumResults, td.totalHits);
//QueryUtils.check(query,searcher);
@@ -171,4 +174,147 @@
return query;
}
+ /** checks that no scores or freqs are infinite */
+ private void assertSaneScoring(PhraseQuery pq, IndexSearcher searcher) throws Exception {
+ searcher.search(pq, new Collector() {
+ Scorer scorer;
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ this.scorer = scorer;
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
+ assertFalse(Float.isInfinite(scorer.freq()));
+ assertFalse(Float.isInfinite(scorer.score()));
+ }
+
+ @Override
+ public void setNextReader(AtomicReaderContext context) throws IOException {
+ // do nothing
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return false;
+ }
+ });
+ QueryUtils.check(random, pq, searcher);
+ }
+
+ // LUCENE-3215
+ public void testSlopWithHoles() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random, dir);
+ FieldType customType = new FieldType(TextField.TYPE_UNSTORED);
+ customType.setOmitNorms(true);
+ Field f = new Field("lyrics", customType, "");
+ Document doc = new Document();
+ doc.add(f);
+ f.setValue("drug drug");
+ iw.addDocument(doc);
+ f.setValue("drug druggy drug");
+ iw.addDocument(doc);
+ f.setValue("drug druggy druggy drug");
+ iw.addDocument(doc);
+ f.setValue("drug druggy drug druggy drug");
+ iw.addDocument(doc);
+ IndexReader ir = iw.getReader();
+ iw.close();
+ IndexSearcher is = newSearcher(ir);
+
+ PhraseQuery pq = new PhraseQuery();
+ // "drug the drug"~1
+ pq.add(new Term("lyrics", "drug"), 1);
+ pq.add(new Term("lyrics", "drug"), 4);
+ pq.setSlop(0);
+ assertEquals(0, is.search(pq, 4).totalHits);
+ pq.setSlop(1);
+ assertEquals(3, is.search(pq, 4).totalHits);
+ pq.setSlop(2);
+ assertEquals(4, is.search(pq, 4).totalHits);
+ is.close();
+ ir.close();
+ dir.close();
+ }
+
+ // LUCENE-3215
+ public void testInfiniteFreq1() throws Exception {
+ String document = "drug druggy drug drug drug";
+
+ Directory dir = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random, dir);
+ Document doc = new Document();
+ doc.add(newField("lyrics", document, new FieldType(TextField.TYPE_UNSTORED)));
+ iw.addDocument(doc);
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher is = newSearcher(ir);
+ PhraseQuery pq = new PhraseQuery();
+ // "drug the drug"~1
+ pq.add(new Term("lyrics", "drug"), 1);
+ pq.add(new Term("lyrics", "drug"), 3);
+ pq.setSlop(1);
+ assertSaneScoring(pq, is);
+ is.close();
+ ir.close();
+ dir.close();
+ }
+
+ // LUCENE-3215
+ public void testInfiniteFreq2() throws Exception {
+ String document =
+ "So much fun to be had in my head " +
+ "No more sunshine " +
+ "So much fun just lying in my bed " +
+ "No more sunshine " +
+ "I can't face the sunlight and the dirt outside " +
+ "Wanna stay in 666 where this darkness don't lie " +
+ "Drug drug druggy " +
+ "Got a feeling sweet like honey " +
+ "Drug drug druggy " +
+ "Need sensation like my baby " +
+ "Show me your scars you're so aware " +
+ "I'm not barbaric I just care " +
+ "Drug drug drug " +
+ "I need a reflection to prove I exist " +
+ "No more sunshine " +
+ "I am a victim of designer blitz " +
+ "No more sunshine " +
+ "Dance like a robot when you're chained at the knee " +
+ "The C.I.A say you're all they'll ever need " +
+ "Drug drug druggy " +
+ "Got a feeling sweet like honey " +
+ "Drug drug druggy " +
+ "Need sensation like my baby " +
+ "Snort your lines you're so aware " +
+ "I'm not barbaric I just care " +
+ "Drug drug druggy " +
+ "Got a feeling sweet like honey " +
+ "Drug drug druggy " +
+ "Need sensation like my baby";
+
+ Directory dir = newDirectory();
+
+ RandomIndexWriter iw = new RandomIndexWriter(random, dir);
+ Document doc = new Document();
+ doc.add(newField("lyrics", document, new FieldType(TextField.TYPE_UNSTORED)));
+ iw.addDocument(doc);
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher is = newSearcher(ir);
+
+ PhraseQuery pq = new PhraseQuery();
+ // "drug the drug"~5
+ pq.add(new Term("lyrics", "drug"), 1);
+ pq.add(new Term("lyrics", "drug"), 3);
+ pq.setSlop(5);
+ assertSaneScoring(pq, is);
+ is.close();
+ ir.close();
+ dir.close();
+ }
}
Index: src/java/org/apache/lucene/search/SloppyPhraseScorer.java
===================================================================
--- src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 1166541)
+++ src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy)
@@ -18,216 +18,257 @@
*/
import java.io.IOException;
-import java.util.LinkedHashSet;
+import java.util.ArrayList;
final class SloppyPhraseScorer extends PhraseScorer {
- private int slop;
- private PhrasePositions repeats[];
- private PhrasePositions tmpPos[]; // for flipping repeating pps.
- private boolean checkedRepeats;
+ private int slop;
+ private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats
+ private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc)
+ private PhraseQueue pq; // for advancing min position
+ private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset
+
+ SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
+ int slop, Similarity.SloppyDocScorer docScorer) {
+ super(weight, postings, docScorer);
+ this.slop = slop;
+ }
+
+ /**
+ * Score a candidate doc for all slop-valid position-combinations (matches)
+ * encountered while traversing/hopping the PhrasePositions.
+ *
+ * The score contribution of a match depends on the distance:
+ *
+ * - highest score for distance=0 (exact match).
+ *
+ * - score gets lower as distance gets higher.
+ *
+ * Example: for query "a b"~2, a document "x a b a y" can be scored twice:
+ * once for "a b" (distance=0), and once for "b a" (distance=2).
+ *
+ * Possibly not all valid combinations are encountered, because for efficiency
+ * we always propagate the least PhrasePosition. This allows to base on
+ * PriorityQueue and move forward faster.
+ * As result, for example, document "a b c b a"
+ * would score differently for queries "a b c"~4 and "c b a"~4, although
+ * they really are equivalent.
+ * Similarly, for doc "a b c b a f g", query "c b"~2
+ * would get same score as "g f"~2, although "c b"~2 could be matched twice.
+ * We may want to fix this in the future (currently not, for performance reasons).
+ */
+ @Override
+ protected float phraseFreq() throws IOException {
+ int end = initPhrasePositions();
+ //printPositions(System.err, "INIT DONE:");
+ if (end==Integer.MIN_VALUE) {
+ return 0.0f;
+ }
- SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
- int slop, Similarity.SloppyDocScorer docScorer) throws IOException {
- super(weight, postings, docScorer);
- this.slop = slop;
+ float freq = 0.0f;
+ PhrasePositions pp = pq.pop();
+ int matchLength = end - pp.position;
+ int next = pq.size()>0 ? pq.top().position : pp.position;
+ //printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength);
+ while ( pp.nextPosition() &&
+ (!hasRepeats || (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE)) {
+ if (pp.position > next) {
+ //printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength);
+ if (matchLength <= slop) {
+ freq += docScorer.computeSlopFactor(matchLength); // score match
+ }
+ pq.add(pp);
+ pp = pq.pop();
+ next = pq.size()>0 ? pq.top().position : pp.position;
+ matchLength = end - pp.position;
+ //printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength);
+ } else {
+ int matchLength2 = end - pp.position;
+ //printQueue(System.err, pp, "C: mlen2Possibly not all valid combinations are encountered, because for efficiency
- * we always propagate the least PhrasePosition. This allows to base on
- * PriorityQueue and move forward faster.
- * As result, for example, document "a b c b a"
- * would score differently for queries "a b c"~4 and "c b a"~4, although
- * they really are equivalent.
- * Similarly, for doc "a b c b a f g", query "c b"~2
- * would get same score as "g f"~2, although "c b"~2 could be matched twice.
- * We may want to fix this in the future (currently not, for performance reasons).
- */
- @Override
- protected float phraseFreq() throws IOException {
- int end = initPhrasePositions();
-
- float freq = 0.0f;
- boolean done = (end<0);
- while (!done) {
- PhrasePositions pp = pq.pop();
- int start = pp.position;
- int next = pq.top().position;
-
- boolean tpsDiffer = true;
- for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
- if (pos<=next && tpsDiffer)
- start = pos; // advance pp to min window
- if (!pp.nextPosition()) {
- done = true; // ran out of a term -- done
- break;
- }
- PhrasePositions pp2 = null;
- tpsDiffer = !pp.repeats || (pp2 = termPositionsConflict(pp))==null;
- if (pp2!=null && pp2!=pp) {
- pp = flip(pp,pp2); // flip pp to pp2
- }
- }
-
- int matchLength = end - start;
- if (matchLength <= slop)
- freq += docScorer.computeSlopFactor(matchLength); // score match
-
- if (pp.position > end)
- end = pp.position;
- pq.add(pp); // restore pq
+ /**
+ * Advance repeating pps of an input non-repeating pp.
+ * Does nothing if pp has no repeats.
+ * Return a modified 'end' in case one of the repeats exceeds original 'end'.
+ * Dirty trick: modifies pp's position to that of least repeater of pp (needed when due to holes repeaters' positions are "back").
+ */
+ private int advanceRepeats(PhrasePositions pp, int end) throws IOException {
+ int tpPos = tpPos(pp);
+ int repeatsEnd = end;
+ if (pp.position > repeatsEnd) {
+ repeatsEnd = pp.position;
+ }
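+ // walk pp's chain of repeating pps and push each one past the term position of the
+ // previous link, so that no two pps of the same term claim the same position in the doc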
+ for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) {
+ while (tpPos(pp2) <= tpPos) {
+ if (!pp2.nextPosition()) {
+ return Integer.MIN_VALUE;
}
-
- return freq;
- }
-
- // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
- // assumes: pp!=pp2, pp2 in pq, pp not in pq.
- // called only when there are repeating pps.
- private PhrasePositions flip(PhrasePositions pp, PhrasePositions pp2) {
- int n=0;
- PhrasePositions pp3;
- //pop until finding pp2
- while ((pp3=pq.pop()) != pp2) {
- tmpPos[n++] = pp3;
}
- //insert back all but pp2
- for (n--; n>=0; n--) {
- pq.insertWithOverflow(tmpPos[n]);
+ tpPos = tpPos(pp2);
+ if (pp2.position > repeatsEnd) {
+ repeatsEnd = pp2.position;
}
- //insert pp back
- pq.add(pp);
- return pp2;
+ // dirty trick: with holes, given a pp, its repeating pp2 might have smaller position.
+ // so in order to have the right "start" in matchLength computation we fake pp.position.
+ // this relies on pp.nextPosition() not using pp.position.
+ if (pp2.position < pp.position) {
+ pp.position = pp2.position;
+ }
}
+ return repeatsEnd;
+ }
- /**
- * Init PhrasePositions in place.
- * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
- *
- * - Put in repeats[] each pp that has another pp with same position in the doc.
- * This relies on that the position in PP is computed as (TP.position - offset) and
- * so by adding offset we actually compare positions and identify that the two are
- * the same term.
- * An exclusion to this is two distinct terms in the same offset in query and same
- * position in doc. This case is detected by comparing just the (query) offsets,
- * and two such PPs are not considered "repeating".
- *
- * - Also mark each such pp by pp.repeats = true.
- *
- * Later can consult with repeats[] in termPositionsConflict(pp), making that check efficient.
- * In particular, this allows to score queries with no repetitions with no overhead due to this computation.
- *
- * - Example 1 - query with no repetitions: "ho my"~2
- *
- * - Example 2 - query with repetitions: "ho my my"~2
- *
- * - Example 3 - query with repetitions: "my ho my"~2
- *
- * Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
- * @return end (max position), or -1 if any term ran out (i.e. done)
- * @throws IOException
- */
- private int initPhrasePositions() throws IOException {
- int end = 0;
-
- // no repeats at all (most common case is also the simplest one)
- if (checkedRepeats && repeats==null) {
- // build queue from list
- pq.clear();
- for (PhrasePositions pp = first; pp != null; pp = pp.next) {
- pp.firstPosition();
- if (pp.position > end)
- end = pp.position;
- pq.add(pp); // build pq from list
- }
- return end;
+ /**
+ * Initialize PhrasePositions in place.
+ * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
+ *
+ * - Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query.
+ *
+ * - For each such group:
+ *
+ * - form an inner linked list of the repeating ones.
+ *
+ * - propagate all group members but first so that they land on different tpPos().
+ *
+ * - Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation.
+ *
+ * - Insert to pq only non repeating PPs, or PPs that are the first in a repeating group.
+ *
+ * Examples:
+ *
+ * - no repetitions: "ho my"~2
+ *
- repetitions: "ho my my"~2
+ *
- repetitions: "my ho my"~2
+ *
+ * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done)
+ */
+ private int initPhrasePositions() throws IOException {
+ int end = Integer.MIN_VALUE;
+
+ // no repeats at all (most common case is also the simplest one)
+ if (checkedRepeats && !hasRepeats) {
+ // build queue from list
+ pq.clear();
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ pp.firstPosition();
+ if (pp.position > end) {
+ end = pp.position;
}
-
- // position the pp's
- for (PhrasePositions pp = first; pp != null; pp = pp.next)
- pp.firstPosition();
-
- // one time initialization for this scorer
- if (!checkedRepeats) {
- checkedRepeats = true;
- // check for repeats
- LinkedHashSet<PhrasePositions> m = null; // see comment (*) below why order is important
- for (PhrasePositions pp = first; pp != null; pp = pp.next) {
- int tpPos = pp.position + pp.offset;
- for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
- if (pp.offset == pp2.offset) {
- continue; // not a repetition: the two PPs are originally in same offset in the query!
- }
- int tpPos2 = pp2.position + pp2.offset;
- if (tpPos2 == tpPos) {
- if (m == null)
- m = new LinkedHashSet<PhrasePositions>();
- pp.repeats = true;
- pp2.repeats = true;
- m.add(pp);
- m.add(pp2);
- }
- }
- }
- if (m!=null)
- repeats = m.toArray(new PhrasePositions[0]);
+ pq.add(pp); // build pq from list
+ }
+ return end;
+ }
+
+ //printPositions(System.err, "Init: 1: Bef position");
+
+ // position the pp's
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ pp.firstPosition();
+ }
+
+ //printPositions(System.err, "Init: 2: Aft position");
+
+ // one time initialization for this scorer (done only for the first candidate doc)
+ if (!checkedRepeats) {
+ checkedRepeats = true;
+ ArrayList<PhrasePositions> ppsA = new ArrayList<PhrasePositions>();
+ PhrasePositions dummyPP = new PhrasePositions(null, -1, -1);
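+ // dummyPP temporarily marks pps already placed in a repeat group (non-null nextRepeating),
+ // so they are not picked again as group leaders; these markers are cleared below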
+ // check for repeats
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ if (pp.nextRepeating != null) {
+ continue; // a repetition of an earlier pp
}
-
- // with repeats must advance some repeating pp's so they all start with differing tp's
- // (*) It is important that pps are handled by their original order in the query,
- // because we advance the pp with larger offset, and so processing them in that order
- // allows to cover all pairs.
- if (repeats!=null) {
- for (int i = 0; i < repeats.length; i++) {
- PhrasePositions pp = repeats[i];
- PhrasePositions pp2;
- while ((pp2 = termPositionsConflict(pp)) != null) {
- if (!pp2.nextPosition()) // among pps that do not differ, advance the pp with higher offset
- return -1; // ran out of a term -- done
- }
- }
+ ppsA.add(pp);
+ int tpPos = tpPos(pp);
+ for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) {
+ if (
+ pp2.nextRepeating != null // already detected as a repetition of an earlier pp
+ || pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query!
+ || tpPos(pp2) != tpPos) { // not a repetition
+ continue;
+ }
+ // a repetition
+ hasRepeats = true;
+ prevB.nextRepeating = pp2; // add pp2 to the repeats linked list
+ pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list
+ prevB = pp2;
}
-
- // build queue from list
- pq.clear();
- for (PhrasePositions pp = first; pp != null; pp = pp.next) {
- if (pp.position > end)
- end = pp.position;
- pq.add(pp); // build pq from list
+ }
+ if (hasRepeats) {
+ // clean dummy markers
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ if (pp.nextRepeating == dummyPP) {
+ pp.nextRepeating = null;
+ }
}
-
- if (repeats!=null) {
- tmpPos = new PhrasePositions[pq.size()];
+ }
+ nrPps = ppsA.toArray(new PhrasePositions[0]);
+ pq = new PhraseQueue(nrPps.length);
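+ // the queue only ever holds the non repeating pps (group leaders included), hence its size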
+ }
+
+ //printPositions(System.err, "Init: 3: Aft check-repeats");
+
+ // with repeats must advance some repeating pp's so they all start with differing tp's
+ if (hasRepeats) {
+ for (PhrasePositions pp: nrPps) {
+ if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) {
+ return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc)
}
-
- return end;
+ }
}
+
+ //printPositions(System.err, "Init: 4: Aft advance-repeats");
+
+ // build queue from non repeating pps
+ pq.clear();
+ for (PhrasePositions pp: nrPps) {
+ if (pp.position > end) {
+ end = pp.position;
+ }
+ pq.add(pp);
+ }
+
+ return end;
+ }
+
+ /** Actual position in doc of a PhrasePosition, relies on that (position = tpPos - offset) */
+ private final int tpPos(PhrasePositions pp) {
+ return pp.position + pp.offset;
+ }
+
+// private void printPositions(PrintStream ps, String title) {
+// ps.println();
+// ps.println("---- "+title);
+// int k = 0;
+// if (nrPps!=null) {
+// for (PhrasePositions pp: nrPps) {
+// ps.println(" " + k++ + " " + pp);
+// }
+// } else {
+// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) {
+// ps.println(" " + k++ + " " + pp);
+// }
+// }
+// }
- /**
- * We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
- * in the query of the same word would go elsewhere in the matched doc.
- * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
- * out of the first two PPs found to not differ.
- */
- private PhrasePositions termPositionsConflict(PhrasePositions pp) {
- // efficiency note: a more efficient implementation could keep a map between repeating
- // pp's, so that if pp1a, pp1b, pp1c are repeats of term1, and pp2a, pp2b are repeats
- // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
- // However this would complicate code, for a rather rare case, so choice is to compromise here.
- int tpPos = pp.position + pp.offset;
- for (int i = 0; i < repeats.length; i++) {
- PhrasePositions pp2 = repeats[i];
- if (pp2 == pp) {
- continue;
- }
- if (pp.offset == pp2.offset) {
- continue; // not a repetition: the two PPs are originally in same offset in the query!
- }
- int tpPos2 = pp2.position + pp2.offset;
- if (tpPos2 == tpPos) {
- return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
- }
- }
- return null;
- }
+// private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
+// ps.println();
+// ps.println("---- "+title);
+// ps.println("EXT: "+ext);
+// PhrasePositions[] t = new PhrasePositions[pq.size()];
+// if (pq.size()>0) {
+// t[0] = pq.pop();
+// ps.println(" " + 0 + " " + t[0]);
+// for (int i=1; i<t.length; i++) {
+// t[i] = pq.pop();
+// ps.println(" " + i + " " + t[i]);
+// }
+// // add the popped entries back to the queue
+// for (int i=t.length-1; i>=0; i--) {
+// pq.add(t[i]);
+// }
+// }
+// }
}
Index: src/java/org/apache/lucene/search/PhrasePositions.java
===================================================================
--- src/java/org/apache/lucene/search/PhrasePositions.java (revision 1166541)
+++ src/java/org/apache/lucene/search/PhrasePositions.java (working copy)
@@ -31,7 +31,7 @@
final int ord; // unique across all PhrasePositions instances
final DocsAndPositionsEnum postings; // stream of docs & positions
PhrasePositions next; // used to make lists
- boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
+ PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets
PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
this.postings = postings;
@@ -41,7 +41,7 @@
final boolean next() throws IOException { // increments to next doc
doc = postings.nextDoc();
- if (doc == postings.NO_MORE_DOCS) {
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
return false;
}
return true;
@@ -49,7 +49,7 @@
final boolean skipTo(int target) throws IOException {
doc = postings.advance(target);
- if (doc == postings.NO_MORE_DOCS) {
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
return false;
}
return true;
@@ -73,4 +73,14 @@
} else
return false;
}
+
+ /** for debug purposes */
+ @Override
+ public String toString() {
+ String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count;
+ if (nextRepeating!=null) {
+ s += " rpt[ "+nextRepeating+" ]";
+ }
+ return s;
+ }
}
Index: src/java/org/apache/lucene/search/PhraseScorer.java
===================================================================
--- src/java/org/apache/lucene/search/PhraseScorer.java (revision 1166541)
+++ src/java/org/apache/lucene/search/PhraseScorer.java (working copy)
@@ -24,23 +24,20 @@
* at "valid" positions. What "valid positions" are
* depends on the type of the phrase query: for an exact phrase query terms are required
* to appear in adjacent locations, while for a sloppy phrase query some distance between
- * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes
+ * the terms is allowed. The abstract method {@link PhraseScorer#phraseFreq()} of extending classes
* is invoked for each document containing all the phrase query terms, in order to
* compute the frequency of the phrase query in that document. A non zero frequency
* means a match.
*/
abstract class PhraseScorer extends Scorer {
- private boolean firstTime = true;
- private boolean more = true;
- protected PhraseQueue pq;
- protected PhrasePositions first, last;
+ PhrasePositions min, max;
private float freq; //phrase frequency in current doc as computed by phraseFreq().
- protected final Similarity.SloppyDocScorer docScorer;
+ final Similarity.SloppyDocScorer docScorer;
PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
- Similarity.SloppyDocScorer docScorer) throws IOException {
+ Similarity.SloppyDocScorer docScorer) {
super(weight);
this.docScorer = docScorer;
@@ -49,75 +46,64 @@
// reflects the phrase offset: pp.pos = tp.pos - offset.
// this allows to easily identify a matching (exact) phrase
// when all PhrasePositions have exactly the same position.
- for (int i = 0; i < postings.length; i++) {
- PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
- if (last != null) { // add next to end of list
- last.next = pp;
- } else {
- first = pp;
+ if (postings.length > 0) {
+ min = new PhrasePositions(postings[0].postings, postings[0].position, 0);
+ max = min;
+ max.doc = -1;
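+ // doc stays at -1 until the scorer is first positioned by nextDoc()/advance()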
+ for (int i = 1; i < postings.length; i++) {
+ PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
+ max.next = pp;
+ max = pp;
+ max.doc = -1;
}
- last = pp;
+ max.next = min; // make it cyclic for easier manipulation
}
-
- pq = new PhraseQueue(postings.length); // construct empty pq
- first.doc = -1;
}
@Override
- public int docID() { return first.doc; }
+ public int docID() { return max.doc; }
@Override
public int nextDoc() throws IOException {
- if (firstTime) {
- init();
- firstTime = false;
- } else if (more) {
- more = last.next(); // trigger further scanning
- }
- if (!doNext()) {
- first.doc = NO_MORE_DOCS;
- }
- return first.doc;
+ return advance(max.doc);
}
- // next without initial increment
- private boolean doNext() throws IOException {
- while (more) {
- while (more && first.doc < last.doc) { // find doc w/ all the terms
- more = first.skipTo(last.doc); // skip first upto last
- firstToLast(); // and move it to the end
- }
-
- if (more) {
- // found a doc with all of the terms
- freq = phraseFreq(); // check for phrase
- if (freq == 0.0f) // no match
- more = last.next(); // trigger further scanning
- else
- return true; // found a match
- }
- }
- return false; // no more matches
- }
-
@Override
public float score() throws IOException {
- return docScorer.score(first.doc, freq);
+ return docScorer.score(max.doc, freq);
}
+ private boolean advanceMin(int target) throws IOException {
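+ // skip the head pp (min) to at least target, then rotate the cyclic ring:
+ // the just-advanced pp becomes the new max and its successor becomes the new min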
+ if (!min.skipTo(target)) {
+ max.doc = NO_MORE_DOCS; // for further calls to docID()
+ return false;
+ }
+ min = min.next; // cyclic
+ max = max.next; // cyclic
+ return true;
+ }
+
@Override
public int advance(int target) throws IOException {
- firstTime = false;
- for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
- more = pp.skipTo(target);
- }
- if (more) {
- sort(); // re-sort
- }
- if (!doNext()) {
- first.doc = NO_MORE_DOCS;
- }
- return first.doc;
+ freq = 0.0f;
+ if (!advanceMin(target)) {
+ return NO_MORE_DOCS;
+ }
+ boolean restart=false;
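+ // leapfrog the lagging pps until all of them sit on the same doc, then let phraseFreq()
+ // decide whether the positions line up within the slop; freq==0 means try the next doc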
+ while (freq == 0.0f) {
+ while (min.doc < max.doc || restart) {
+ restart = false;
+ if (!advanceMin(max.doc)) {
+ return NO_MORE_DOCS;
+ }
+ }
+ // found a doc with all of the terms
+ freq = phraseFreq(); // check for phrase
+ restart = true;
+ }
+
+ // found a match
+ return max.doc;
}
/**
@@ -135,45 +121,8 @@
*
* Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations.
* @return frequency of the phrase in current doc, 0 if not found.
*/
- protected abstract float phraseFreq() throws IOException;
+ abstract float phraseFreq() throws IOException;
- private void init() throws IOException {
- for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
- more = pp.next();
- }
- if (more) {
- sort();
- }
- }
-
- private void sort() {
- pq.clear();
- for (PhrasePositions pp = first; pp != null; pp = pp.next) {
- pq.add(pp);
- }
- pqToList();
- }
-
- protected final void pqToList() {
- last = first = null;
- while (pq.top() != null) {
- PhrasePositions pp = pq.pop();
- if (last != null) { // add next to end of list
- last.next = pp;
- } else
- first = pp;
- last = pp;
- pp.next = null;
- }
- }
-
- protected final void firstToLast() {
- last.next = first; // move first to end of list
- last = first;
- first = first.next;
- last.next = null;
- }
-
@Override
public String toString() { return "scorer(" + weight + ")"; }