Index: lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java (revision 1296754)
+++ lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java (working copy)
@@ -21,12 +21,10 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.util._TestUtil;
-import org.junit.Ignore;
/**
* random sloppy phrase query tests
*/
-@Ignore("Put this back when we fix LUCENE-3821")
public class TestSloppyPhraseQuery2 extends SearchEquivalenceTestBase {
/** "A B"~N ⊆ "A B"~N+1 */
public void testIncreasingSloppiness() throws Exception {
Index: lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java (revision 1296754)
+++ lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java (working copy)
@@ -31,8 +31,9 @@
final int ord; // unique across all PhrasePositions instances
TermPositions tp; // stream of positions
PhrasePositions next; // used to make lists
- PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets
-
+ int rptGroup = -1; // >=0 indicates that this is a repeating PP
+ int rptInd; // index in the rptGroup
+
PhrasePositions(TermPositions t, int o, int ord) {
tp = t;
offset = o;
@@ -85,8 +86,8 @@
@Override
public String toString() {
String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count;
- if (nextRepeating!=null) {
- s += " rpt[ "+nextRepeating+" ]";
+ if (rptGroup >=0 ) {
+ s += " rpt:"+rptGroup;
}
return s;
}
Index: lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 1296754)
+++ lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy)
@@ -19,65 +19,69 @@
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Comparator;
final class SloppyPhraseScorer extends PhraseScorer {
- private int slop;
- private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats
- private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc)
+
+ private int slop; // a match is scored only when its match-length is <= slop
+ private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
+ private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
+ private PhrasePositions[][] rptGroups; // each group holds PPs that repeat each other (i.e. same term), sorted by (query) offset
+ private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps
private PhraseQueue pq; // for advancing min position
- private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset
+ private int end; // current largest phrase position
- SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
- int slop, byte[] norms) {
- super(weight, postings, similarity, norms);
- this.slop = slop;
- }
-
- /**
- * Score a candidate doc for all slop-valid position-combinations (matches)
- * encountered while traversing/hopping the PhrasePositions.
- * <br> The score contribution of a match depends on the distance:
- * <br> - highest score for distance=0 (exact match).
- * <br> - score gets lower as distance gets higher.
- * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
- * once for "a b" (distance=0), and once for "b a" (distance=2).
- * <br>Possibly not all valid combinations are encountered, because for efficiency
- * we always propagate the least PhrasePosition. This allows to base on
- * PriorityQueue and move forward faster.
- * As result, for example, document "a b c b a"
- * would score differently for queries "a b c"~4 and "c b a"~4, although
- * they really are equivalent.
- * Similarly, for doc "a b c b a f g", query "c b"~2
- * would get same score as "g f"~2, although "c b"~2 could be matched twice.
- * We may want to fix this in the future (currently not, for performance reasons).
- */
- @Override
+  /**
+   * Creates a sloppy phrase scorer.
+   * After delegating to the base scorer, records the slop and allocates the
+   * position queue and the re-queuing scratch stack, both sized to the number
+   * of postings.
+   */
+  SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
+      int slop, byte[] norms) {
+    super(weight, postings, similarity, norms);
+    final int numPostings = postings.length;
+    this.slop = slop;
+    this.rptStack = new PhrasePositions[numPostings];
+    this.pq = new PhraseQueue(numPostings);
+  }
+
+ /**
+ * Score a candidate doc for all slop-valid position-combinations (matches)
+ * encountered while traversing/hopping the PhrasePositions.
+ * <br> The score contribution of a match depends on the distance:
+ * <br> - highest score for distance=0 (exact match).
+ * <br> - score gets lower as distance gets higher.
+ * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
+ * once for "a b" (distance=0), and once for "b a" (distance=2).
+ * <br>Possibly not all valid combinations are encountered, because for efficiency
+ * we always propagate the least PhrasePosition. This allows to base on
+ * PriorityQueue and move forward faster.
+ * As result, for example, document "a b c b a"
+ * would score differently for queries "a b c"~4 and "c b a"~4, although
+ * they really are equivalent.
+ * Similarly, for doc "a b c b a f g", query "c b"~2
+ * would get same score as "g f"~2, although "c b"~2 could be matched twice.
+ * We may want to fix this in the future (currently not, for performance reasons).
+ */
+ @Override
protected float phraseFreq() throws IOException {
- int end = initPhrasePositions();
- //printPositions(System.err, "INIT DONE:");
- if (end==Integer.MIN_VALUE) {
+ if (!initPhrasePositions()) {
return 0.0f;
}
-
float freq = 0.0f;
PhrasePositions pp = pq.pop();
int matchLength = end - pp.position;
- int next = pq.size()>0 ? pq.top().position : pp.position;
- //printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength);
- while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE) {
- if (pp.position > next) {
- //printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength);
+ int next = pq.top().position;
+ while (advancePP(pp)) {
+ if (hasRpts && !advanceRpts(pp)) {
+ break; // pps exhausted
+ }
+ if (pp.position > next) { // done minimizing current match-length
if (matchLength <= slop) {
freq += getSimilarity().sloppyFreq(matchLength); // score match
}
pq.add(pp);
pp = pq.pop();
- next = pq.size()>0 ? pq.top().position : pp.position;
+ next = pq.top().position;
matchLength = end - pp.position;
- //printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength);
} else {
int matchLength2 = end - pp.position;
- //printQueue(System.err, pp, "C: mlen2 repeatsEnd) {
- repeatsEnd = pp.position;
+ /**
+ * Advance a PhrasePosition to its next position and update 'end'.
+ * 'end' is raised to pp.position when the new position exceeds it; it is never lowered here.
+ * @return false if pp is exhausted (no next position), true otherwise
+ */
+ private boolean advancePP(PhrasePositions pp) throws IOException {
+ if (!pp.nextPosition()) {
+ return false;
}
- if (!hasRepeats) {
- return repeatsEnd;
+ if (pp.position > end) {
+ end = pp.position;
}
- int tpPos = tpPos(pp);
- for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) {
- while (tpPos(pp2) <= tpPos) {
- if (!pp2.nextPosition()) {
- return Integer.MIN_VALUE;
- }
+ return true;
+ }
+
+ /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
+ * of the two colliding pps. Note that there can only be one collision at a time, as by the
+ * initialization there were no collisions before pp was advanced; resolving it may however
+ * cause a new one, hence the loop.
+ * Once no collision remains, the queue is partially emptied and refilled so that the pps that
+ * were advanced while resolving collisions are re-queued.
+ * Returns false if a pp was exhausted while resolving, true otherwise. */
+ private boolean advanceRpts(PhrasePositions pp) throws IOException {
+ if (pp.rptGroup < 0) {
+ return true; // not a repeater
+ }
+ PhrasePositions[] rg = rptGroups[pp.rptGroup];
+ BitSet bits = new BitSet(rg.length); // for re-queuing after collisions are resolved
+ int k;
+ while((k=collide(pp)) >= 0) {
+ // NOTE(review): bit k is set unconditionally. If collide() can report the index of a pp that
+ // is not currently in pq (e.g. the pp originally passed in) the drain loop below would never
+ // clear that bit by popping -- confirm against collide() and pq.pop() on an empty queue.
+ bits.set(k); // mark rg[k] (the pp colliding with pp) as needing to be re-queued
+ pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
+ if (!advancePP(pp)) {
+ return false; // exhausted
}
- tpPos = tpPos(pp2);
- if (pp2.position > repeatsEnd) {
- repeatsEnd = pp2.position;
+ }
+ // collisions resolved, now re-queue
+ // empty (partially) the queue until seeing all pps advanced for resolving collisions
+ int n = 0;
+ while (bits.cardinality() > 0) {
+ PhrasePositions pp2 = pq.pop();
+ rptStack[n++] = pp2; // remember every popped pp so it can be added back below
+ // NOTE(review): only rptGroup >= 0 is checked, not pp2.rptGroup == pp.rptGroup, so a member
+ // of another repetition group with the same rptInd would also clear the bit -- confirm.
+ if (pp2.rptGroup >= 0 && bits.get(pp2.rptInd)) {
+ bits.clear(pp2.rptInd);
}
- // "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position.
- // so in order to have the right "start" in matchLength computation we fake pp.position.
- // this relies on pp.nextPosition() not using pp.position.
- if (pp2.position < pp.position) {
- pp.position = pp2.position;
+ }
+ // add back to queue, last popped first
+ for (int i=n-1; i>=0; i--) {
+ pq.add(rptStack[i]);
+ }
+ return true;
+ }
+
+  /**
+   * Returns whichever of the two pps orders first, looking only at position
+   * and, for equal positions, at (query) offset. On a full tie pp2 is returned.
+   */
+  private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
+    final boolean ppFirst = pp.position == pp2.position
+        ? pp.offset < pp2.offset
+        : pp.position < pp2.position;
+    return ppFirst ? pp : pp2;
+  }
+
+ /** index of a pp2 colliding with pp, or -1 if none */
+ private int collide(PhrasePositions pp) {
+ int tpPos = tpPos(pp);
+ PhrasePositions[] rg = rptGroups[pp.rptGroup];
+ for (int i=0; i
- * Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query.
- * For each such group:
- *
- * - form an inner linked list of the repeating ones.
- *
- propagate all group members but first so that they land on different tpPos().
- *
- * Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation.
- * Insert to pq only non repeating PPs, or PPs that are the first in a repeating group.
+ * Check if there are repetitions
+ * If there are, find groups of repetitions.
*
* Examples:
*
@@ -143,116 +173,152 @@
* - repetitions: "ho my my"~2
*
- repetitions: "my ho my"~2
*
- * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done)
+ * @return false if PPs are exhausted (and so current doc will not be a match)
*/
- private int initPhrasePositions() throws IOException {
- int end = Integer.MIN_VALUE;
-
- // no repeats at all (most common case is also the simplest one)
- if (checkedRepeats && !hasRepeats) {
- // build queue from list
- pq.clear();
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
- pp.firstPosition();
- if (pp.position > end) {
- end = pp.position;
- }
- pq.add(pp); // build pq from list
+  private boolean initPhrasePositions() throws IOException {
+    end = Integer.MIN_VALUE; // reset before any pp is positioned for this doc
+    if (checkedRpts) {
+      if (hasRpts) {
+        return initComplex(); // repetitions known to exist
+      }
+      initSimple(); // repetitions known not to exist
+      return true; // PPs available
+    }
+    return initFirstTime(); // repetitions not checked yet
+  }
+
+ /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
+ private void initSimple() throws IOException {
+ pq.clear();
+ // position pps and build queue from list
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ pp.firstPosition();
+ if (pp.position > end) {
+ end = pp.position;
}
- return end;
+ pq.add(pp);
}
-
- //printPositions(System.err, "Init: 1: Bef position");
-
- // position the pp's
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ }
+
+  /**
+   * Initialization for the case with repeats: place every pp on its first position,
+   * advance the repeat groups, and only if that did not exhaust the PPs fill the queue.
+   * Returns false if PPs are exhausted, true if PPs are available.
+   */
+  private boolean initComplex() throws IOException {
+    placeFirstPositions();
+    final boolean available = advanceRepeatGroups();
+    if (available) {
+      fillQueue();
+    }
+    return available;
+  }
+
+ /** move all PPs to their first position */
+ private void placeFirstPositions() throws IOException {
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
pp.firstPosition();
}
-
- //printPositions(System.err, "Init: 2: Aft position");
-
- // one time initialization for this scorer (done only for the first candidate doc)
- if (!checkedRepeats) {
- checkedRepeats = true;
- ArrayList ppsA = new ArrayList();
- PhrasePositions dummyPP = new PhrasePositions(null, -1, -1);
- // check for repeats
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
- if (pp.nextRepeating != null) {
- continue; // a repetition of an earlier pp
- }
- ppsA.add(pp);
- int tpPos = tpPos(pp);
- for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) {
- if (
- pp2.nextRepeating != null // already detected as a repetition of an earlier pp
- || pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query!
- || tpPos(pp2) != tpPos) { // not a repetition
- continue;
- }
- // a repetition
- hasRepeats = true;
- prevB.nextRepeating = pp2; // add pp2 to the repeats linked list
- pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list
- prevB = pp2;
- }
+ }
+
+ /** Fill the queue (all pps are already placed) */
+ private void fillQueue() {
+ pq.clear();
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ if (pp.position > end) {
+ end = pp.position;
}
- if (hasRepeats) {
- // clean dummy markers
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
- if (pp.nextRepeating == dummyPP) {
- pp.nextRepeating = null;
+ pq.add(pp);
+ }
+ }
+
+ /** At initialization (each doc), each repetition group is sorted by (query) offset,
+ * and then each pp in the group is advanced as many times as its (0-based) index in the group.
+ * So the lesser pp is not advanced, the 2nd one is advanced once, the 3rd one twice, etc.
+ * This provides the start condition: no collisions.
+ * Returns false if PPs are exhausted.
+ */
+ private boolean advanceRepeatGroups() throws IOException {
+ for (int i=0; i> rgs = gatherRptGroups();
+ if (rgs != null) {
+ hasRpts = true;
+ sortRptGroups(rgs);
+ if (!advanceRepeatGroups()) {
+ return false; // PPs exhausted
}
}
-
- //printPositions(System.err, "Init: 4: Aft advance-repeats");
-
- // build queue from non repeating pps
- pq.clear();
- for (PhrasePositions pp: nrPps) {
- if (pp.position > end) {
- end = pp.position;
+ fillQueue();
+ return true; // PPs available
+ }
+
+ /** sort each repetition group by (query) offset.
+ * Done only once (at first doc) and allows to initialize faster for each doc. */
+ private void sortRptGroups(ArrayList> rgs) {
+ rptGroups = new PhrasePositions[rgs.size()][];
+ Comparator cmprtr = new Comparator() {
+ @Override
+ public int compare(PhrasePositions pp1, PhrasePositions pp2) {
+ return pp1.offset - pp2.offset;
}
- pq.add(pp);
+ };
+ for (int i=0; i> gatherRptGroups() throws IOException {
+ ArrayList> res = null;
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ if (pp.rptGroup >=0) continue; // already marked as a repetition
+ int tpPos = tpPos(pp);
+ for (PhrasePositions pp2=pp.next; pp2!= min; pp2=pp2.next) {
+ if (
+ pp2.rptGroup >=0 // already marked as a repetition
+ || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query!
+ || tpPos(pp2) != tpPos) { // not a repetition
+ continue;
+ }
+ // a repetition
+ if (res == null) {
+ res = new ArrayList>();
+ }
+ int g = pp.rptGroup;
+ if (g < 0) {
+ g = res.size();
+ pp.rptGroup = g;
+ ArrayList rl = new ArrayList(2);
+ rl.add(pp);
+ res.add(rl);
+ }
+ pp2.rptGroup = g;
+ res.get(g).add(pp2);
+ }
+ }
+ return res;
+ }
+
/** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */
private final int tpPos(PhrasePositions pp) {
return pp.position + pp.offset;
}
-
-// private void printPositions(PrintStream ps, String title) {
-// ps.println();
-// ps.println("---- "+title);
-// int k = 0;
-// if (nrPps!=null) {
-// for (PhrasePositions pp: nrPps) {
-// ps.println(" " + k++ + " " + pp);
-// }
-// } else {
-// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) {
-// ps.println(" " + k++ + " " + pp);
-// }
-// }
-// }
// private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
// ps.println();
@@ -273,4 +339,5 @@
// }
// }
// }
+
}