Index: lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java =================================================================== --- lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java (revision 1296754) +++ lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java (working copy) @@ -21,12 +21,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.util._TestUtil; -import org.junit.Ignore; /** * random sloppy phrase query tests */ -@Ignore("Put this back when we fix LUCENE-3821") public class TestSloppyPhraseQuery2 extends SearchEquivalenceTestBase { /** "A B"~N ⊆ "A B"~N+1 */ public void testIncreasingSloppiness() throws Exception { Index: lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java (revision 1296754) +++ lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java (working copy) @@ -31,8 +31,9 @@ final int ord; // unique across all PhrasePositions instances TermPositions tp; // stream of positions PhrasePositions next; // used to make lists - PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets - + int rptGroup = -1; // >=0 indicates that this is a repeating PP + int rptInd; // index in the rptGroup + PhrasePositions(TermPositions t, int o, int ord) { tp = t; offset = o; @@ -85,8 +86,8 @@ @Override public String toString() { String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count; - if (nextRepeating!=null) { - s += " rpt[ "+nextRepeating+" ]"; + if (rptGroup >=0 ) { + s += " rpt:"+rptGroup; } return s; } Index: lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 1296754) +++ 
lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy) @@ -19,65 +19,69 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Comparator; final class SloppyPhraseScorer extends PhraseScorer { - private int slop; - private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats - private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc) + + private int slop; + private boolean checkedRpts; // flag to only check for repetitions in first candidate doc + private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc) + private PhrasePositions[][] rptGroups; // in each group are PPs that repeat each other (i.e. same term), sorted by (query) offset + private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps private PhraseQueue pq; // for advancing min position - private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset + private int end; // current largest phrase position - SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity, - int slop, byte[] norms) { - super(weight, postings, similarity, norms); - this.slop = slop; - } - - /** - * Score a candidate doc for all slop-valid position-combinations (matches) - * encountered while traversing/hopping the PhrasePositions. - *
The score contribution of a match depends on the distance: - *
- highest score for distance=0 (exact match). - *
- score gets lower as distance gets higher. - *
Example: for query "a b"~2, a document "x a b a y" can be scored twice: - * once for "a b" (distance=0), and once for "b a" (distance=2). - *
Possibly not all valid combinations are encountered, because for efficiency - * we always propagate the least PhrasePosition. This allows to base on - * PriorityQueue and move forward faster. - * As result, for example, document "a b c b a" - * would score differently for queries "a b c"~4 and "c b a"~4, although - * they really are equivalent. - * Similarly, for doc "a b c b a f g", query "c b"~2 - * would get same score as "g f"~2, although "c b"~2 could be matched twice. - * We may want to fix this in the future (currently not, for performance reasons). - */ - @Override + SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity, + int slop, byte[] norms) { + super(weight, postings, similarity, norms); + this.slop = slop; + pq = new PhraseQueue(postings.length); + rptStack = new PhrasePositions[postings.length]; + } + + /** + * Score a candidate doc for all slop-valid position-combinations (matches) + * encountered while traversing/hopping the PhrasePositions. + *
The score contribution of a match depends on the distance: + *
- highest score for distance=0 (exact match). + *
- score gets lower as distance gets higher. + *
Example: for query "a b"~2, a document "x a b a y" can be scored twice: + * once for "a b" (distance=0), and once for "b a" (distance=2). + *
Possibly not all valid combinations are encountered, because for efficiency + * we always propagate the least PhrasePosition. This allows to base on + * PriorityQueue and move forward faster. + * As result, for example, document "a b c b a" + * would score differently for queries "a b c"~4 and "c b a"~4, although + * they really are equivalent. + * Similarly, for doc "a b c b a f g", query "c b"~2 + * would get same score as "g f"~2, although "c b"~2 could be matched twice. + * We may want to fix this in the future (currently not, for performance reasons). + */ + @Override protected float phraseFreq() throws IOException { - int end = initPhrasePositions(); - //printPositions(System.err, "INIT DONE:"); - if (end==Integer.MIN_VALUE) { + if (!initPhrasePositions()) { return 0.0f; } - float freq = 0.0f; PhrasePositions pp = pq.pop(); int matchLength = end - pp.position; - int next = pq.size()>0 ? pq.top().position : pp.position; - //printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength); - while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE) { - if (pp.position > next) { - //printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength); + int next = pq.top().position; + while (advancePP(pp)) { + if (hasRpts && !advanceRpts(pp)) { + break; // pps exhausted + } + if (pp.position > next) { // done minimizing current match-length if (matchLength <= slop) { freq += getSimilarity().sloppyFreq(matchLength); // score match } pq.add(pp); pp = pq.pop(); - next = pq.size()>0 ? 
pq.top().position : pp.position; + next = pq.top().position; matchLength = end - pp.position; - //printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength); } else { int matchLength2 = end - pp.position; - //printQueue(System.err, pp, "C: mlen2 repeatsEnd) { - repeatsEnd = pp.position; + /** advance a PhrasePosition and update 'end', return false if exhausted */ + private boolean advancePP(PhrasePositions pp) throws IOException { + if (!pp.nextPosition()) { + return false; } - if (!hasRepeats) { - return repeatsEnd; + if (pp.position > end) { + end = pp.position; } - int tpPos = tpPos(pp); - for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) { - while (tpPos(pp2) <= tpPos) { - if (!pp2.nextPosition()) { - return Integer.MIN_VALUE; - } + return true; + } + + /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser + * of the two colliding pps. Note that there can only be one collision, as by the initialization + * there were no collisions before pp was advanced. 
*/ + private boolean advanceRpts(PhrasePositions pp) throws IOException { + if (pp.rptGroup < 0) { + return true; // not a repeater + } + PhrasePositions[] rg = rptGroups[pp.rptGroup]; + BitSet bits = new BitSet(rg.length); // for re-queuing after collisions are resolved + int k; + while((k=collide(pp)) >= 0) { + bits.set(k); // mark that this pp needs to be re-queued + pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps + if (!advancePP(pp)) { + return false; // exhausted } - tpPos = tpPos(pp2); - if (pp2.position > repeatsEnd) { - repeatsEnd = pp2.position; + } + // collisions resolved, now re-queue + // empty (partially) the queue until seeing all pps advanced for resolving collisions + int n = 0; + while (bits.cardinality() > 0) { + PhrasePositions pp2 = pq.pop(); + rptStack[n++] = pp2; + if (pp2.rptGroup >= 0 && bits.get(pp2.rptInd)) { + bits.clear(pp2.rptInd); } - // "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position. - // so in order to have the right "start" in matchLength computation we fake pp.position. - // this relies on pp.nextPosition() not using pp.position. - if (pp2.position < pp.position) { - pp.position = pp2.position; + } + // add back to queue + for (int i=n-1; i>=0; i--) { + pq.add(rptStack[i]); + } + return true; + } + + /** compare two pps, but only by position and offset */ + private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) { + if (pp.position < pp2.position || + (pp.position == pp2.position && pp.offset < pp2.offset)) { + return pp; + } + return pp2; + } + + /** index of a pp2 colliding with pp, or -1 if none */ + private int collide(PhrasePositions pp) { + int tpPos = tpPos(pp); + PhrasePositions[] rg = rptGroups[pp.rptGroup]; + for (int i=0; i - *
  • Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query. - *
  • For each such group: - *
      - *
    • form an inner linked list of the repeating ones. - *
    • propagate all group members but first so that they land on different tpPos(). - *
    - *
  • Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation. - *
  • Insert to pq only non repeating PPs, or PPs that are the first in a repeating group. + *
  • Check if there are repetitions + *
  • If there are, find groups of repetitions. * * Examples: *
      @@ -143,116 +173,152 @@ *
    1. repetitions: "ho my my"~2 *
    2. repetitions: "my ho my"~2 *
    - * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done) + * @return false if PPs are exhausted (and so current doc will not be a match) */ - private int initPhrasePositions() throws IOException { - int end = Integer.MIN_VALUE; - - // no repeats at all (most common case is also the simplest one) - if (checkedRepeats && !hasRepeats) { - // build queue from list - pq.clear(); - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max - pp.firstPosition(); - if (pp.position > end) { - end = pp.position; - } - pq.add(pp); // build pq from list + private boolean initPhrasePositions() throws IOException { + end = Integer.MIN_VALUE; + if (!checkedRpts) { + return initFirstTime(); + } + if (!hasRpts) { + initSimple(); + return true; // PPs available + } + return initComplex(); + } + + /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */ + private void initSimple() throws IOException { + pq.clear(); + // position pps and build queue from list + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + pp.firstPosition(); + if (pp.position > end) { + end = pp.position; } - return end; + pq.add(pp); } - - //printPositions(System.err, "Init: 1: Bef position"); - - // position the pp's - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + } + + /** with repeats: not so simple. 
*/ + private boolean initComplex() throws IOException { + placeFirstPositions(); + if (!advanceRepeatGroups()) + return false; // PPs exhausted + fillQueue(); + return true; // PPs available + } + + /** move all PPs to their first position */ + private void placeFirstPositions() throws IOException { + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max pp.firstPosition(); } - - //printPositions(System.err, "Init: 2: Aft position"); - - // one time initialization for this scorer (done only for the first candidate doc) - if (!checkedRepeats) { - checkedRepeats = true; - ArrayList ppsA = new ArrayList(); - PhrasePositions dummyPP = new PhrasePositions(null, -1, -1); - // check for repeats - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max - if (pp.nextRepeating != null) { - continue; // a repetition of an earlier pp - } - ppsA.add(pp); - int tpPos = tpPos(pp); - for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) { - if ( - pp2.nextRepeating != null // already detected as a repetition of an earlier pp - || pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query! 
- || tpPos(pp2) != tpPos) { // not a repetition - continue; - } - // a repetition - hasRepeats = true; - prevB.nextRepeating = pp2; // add pp2 to the repeats linked list - pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list - prevB = pp2; - } + } + + /** Fill the queue (all pps are already placed) */ + private void fillQueue() { + pq.clear(); + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + if (pp.position > end) { + end = pp.position; } - if (hasRepeats) { - // clean dummy markers - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max - if (pp.nextRepeating == dummyPP) { - pp.nextRepeating = null; + pq.add(pp); + } + } + + /** At initialization (each doc), each repetition group is sorted by (query) offset, + * and then each pp in the group is advanced one less than its group index. + * So lesser pp is not advanced, 2nd one advanced once, 3rd one advanced twice, etc. + * This provides the start condition: no collisions. + * Returns false if PPs are exhausted. + */ + private boolean advanceRepeatGroups() throws IOException { + for (int i=0; i> rgs = gatherRptGroups(); + if (rgs != null) { + hasRpts = true; + sortRptGroups(rgs); + if (!advanceRepeatGroups()) { + return false; // PPs exhausted } } - - //printPositions(System.err, "Init: 4: Aft advance-repeats"); - - // build queue from non repeating pps - pq.clear(); - for (PhrasePositions pp: nrPps) { - if (pp.position > end) { - end = pp.position; + fillQueue(); + return true; // PPs available + } + + /** sort each repetition group by (query) offset. + * Done only once (at first doc) and allows to initialize faster for each doc.
*/ + private void sortRptGroups(ArrayList> rgs) { + rptGroups = new PhrasePositions[rgs.size()][]; + Comparator cmprtr = new Comparator() { + @Override + public int compare(PhrasePositions pp1, PhrasePositions pp2) { + return pp1.offset - pp2.offset; } - pq.add(pp); + }; + for (int i=0; i> gatherRptGroups() throws IOException { + ArrayList> res = null; + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + if (pp.rptGroup >=0) continue; // already marked as a repetition + int tpPos = tpPos(pp); + for (PhrasePositions pp2=pp.next; pp2!= min; pp2=pp2.next) { + if ( + pp2.rptGroup >=0 // already marked as a repetition + || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query! + || tpPos(pp2) != tpPos) { // not a repetition + continue; + } + // a repetition + if (res == null) { + res = new ArrayList>(); + } + int g = pp.rptGroup; + if (g < 0) { + g = res.size(); + pp.rptGroup = g; + ArrayList rl = new ArrayList(2); + rl.add(pp); + res.add(rl); + } + pp2.rptGroup = g; + res.get(g).add(pp2); + } + } + return res; + } + /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */ private final int tpPos(PhrasePositions pp) { return pp.position + pp.offset; } - -// private void printPositions(PrintStream ps, String title) { -// ps.println(); -// ps.println("---- "+title); -// int k = 0; -// if (nrPps!=null) { -// for (PhrasePositions pp: nrPps) { -// ps.println(" " + k++ + " " + pp); -// } -// } else { -// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) { -// ps.println(" " + k++ + " " + pp); -// } -// } -// } // private void printQueue(PrintStream ps, PhrasePositions ext, String title) { // ps.println(); @@ -273,4 +339,5 @@ // } // } // } + }