If there are, find groups of repetitions.
*
* Examples:
*
@@ -143,118 +185,305 @@
* - repetitions: "ho my my"~2
*
* - repetitions: "my ho my"~2
*
- * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done)
+ * @return false if PPs are exhausted (and so current doc will not be a match)
*/
- private int initPhrasePositions() throws IOException {
- int end = Integer.MIN_VALUE;
-
- // no repeats at all (most common case is also the simplest one)
- if (checkedRepeats && !hasRepeats) {
- // build queue from list
- pq.clear();
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
- pp.firstPosition();
- if (pp.position > end) {
- end = pp.position;
+ private boolean initPhrasePositions() throws IOException {
+ end = Integer.MIN_VALUE;
+ if (!checkedRpts) {
+ return initFirstTime();
+ }
+ if (!hasRpts) {
+ initSimple();
+ return true; // PPs available
+ }
+ return initComplex();
+ }
+
+ /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
+ private void initSimple() throws IOException {
+ //System.err.println("initSimple: doc: "+min.doc);
+ pq.clear();
+ // position pps and build queue from list
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ pp.firstPosition();
+ if (pp.position > end) {
+ end = pp.position;
+ }
+ pq.add(pp);
+ }
+ }
+
+ /** with repeats: not so simple. */
+ private boolean initComplex() throws IOException {
+ //System.err.println("initComplex: doc: "+min.doc);
+ placeFirstPositions();
+ if (!advanceRepeatGroups()) {
+ return false; // PPs exhausted
+ }
+ fillQueue();
+ return true; // PPs available
+ }
+
+ /** move all PPs to their first position */
+ private void placeFirstPositions() throws IOException {
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ pp.firstPosition();
+ }
+ }
+
+ /** Fill the queue (all pps are already placed) */
+ private void fillQueue() {
+ pq.clear();
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ if (pp.position > end) {
+ end = pp.position;
+ }
+ pq.add(pp);
+ }
+ }
+
+ /** At initialization (each doc), each repetition group is sorted by (query) offset.
+ * This provides the start condition: no collisions.
+ * Case 1: no multi-term repeats
+ * It is sufficient to advance each pp in the group by one less than its group index.
+ * So the lesser pp is not advanced, the 2nd one is advanced once, the 3rd one twice, etc.
+ *
+ * Case 2: multi-term repeats
+ *
+ * @return false if PPs are exhausted.
+ */
+ private boolean advanceRepeatGroups() throws IOException {
+ for (PhrasePositions[] rg: rptGroups) {
+ if (hasMultiTermRpts) {
+ // more involved, some may not collide
+ int incr;
+ for (int i=0; i= 0) {
+ PhrasePositions pp2 = lesser(pp, rg[k]);
+ if (!advancePP(pp2)) { // at initialization always advance pp with higher offset
+ return false; // exhausted
+ }
+ if (pp2.rptInd < i) { // should not happen?
+ incr = 0;
+ break;
+ }
+ }
}
- pq.add(pp); // build pq from list
+ } else {
+ // simpler, we know exactly how much to advance
+ for (int j=1; j
+ * If there are repetitions, check if multi-term postings (MTP) are involved.
+ * Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.
+ * With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".
+ * For example P1 has {A,B}, P2 has {B,C}, and the first doc is: "A C B". At start, P1 would point
+ * to "A", P2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.
+ * The more complex initialization has two parts:
+ * (1) identification of repetition groups.
+ * (2) advancing repeat groups at the start of the doc.
+ * For (1), a possible solution is to just create a single repetition group,
+ * made of all repeating pps. But this would slow down the check for collisions,
+ * as all pps would need to be checked. Instead, we compute "connected regions"
+ * on the bipartite graph of postings and terms.
+ */
+ private boolean initFirstTime() throws IOException {
+ //System.err.println("initFirstTime: doc: "+min.doc);
+ checkedRpts = true;
+ placeFirstPositions();
+
+ LinkedHashMap rptTerms = repeatingTerms();
+ hasRpts = !rptTerms.isEmpty();
+
+ if (hasRpts) {
+ rptStack = new PhrasePositions[numPostings]; // needed with repetitions
+ ArrayList> rgs = gatherRptGroups(rptTerms);
+ sortRptGroups(rgs);
+ if (!advanceRepeatGroups()) {
+ return false; // PPs exhausted
+ }
+ }
- //printPositions(System.err, "Init: 1: Bef position");
-
- // position the pp's
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
- pp.firstPosition();
+ fillQueue();
+ return true; // PPs available
+ }
+
+ /** Sort each repetition group by (query) offset.
+ * Done only once (at first doc), which allows faster initialization for each doc. */
+ private void sortRptGroups(ArrayList> rgs) {
+ rptGroups = new PhrasePositions[rgs.size()][];
+ Comparator cmprtr = new Comparator() {
+ public int compare(PhrasePositions pp1, PhrasePositions pp2) {
+ return pp1.offset - pp2.offset;
+ }
+ };
+ for (int i=0; i ppsA = new ArrayList();
- PhrasePositions dummyPP = new PhrasePositions(null, -1, -1);
- // check for repeats
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
- if (pp.nextRepeating != null) {
- continue; // a repetition of an earlier pp
- }
- ppsA.add(pp);
+ }
+
+ /** Detect repetition groups. Done once - for first doc */
+ private ArrayList> gatherRptGroups(LinkedHashMap rptTerms) throws IOException {
+ PhrasePositions[] rpp = repeatingPPs(rptTerms);
+ ArrayList> res = new ArrayList>();
+ if (!hasMultiTermRpts) {
+ // simpler - no multi-terms - can base on positions in first doc
+ for (int i=0; i=0) continue; // already marked as a repetition
int tpPos = tpPos(pp);
- for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) {
+ for (int j=i+1; j=0 // already marked as a repetition
+ || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query!
|| tpPos(pp2) != tpPos) { // not a repetition
continue;
}
// a repetition
- hasRepeats = true;
- prevB.nextRepeating = pp2; // add pp2 to the repeats linked list
- pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list
- prevB = pp2;
+ int g = pp.rptGroup;
+ if (g < 0) {
+ g = res.size();
+ pp.rptGroup = g;
+ ArrayList rl = new ArrayList(2);
+ rl.add(pp);
+ res.add(rl);
+ }
+ pp2.rptGroup = g;
+ res.get(g).add(pp2);
}
}
- if (hasRepeats) {
- // clean dummy markers
- for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
- if (pp.nextRepeating == dummyPP) {
- pp.nextRepeating = null;
+ } else {
+ // more involved - has multi-terms
+ ArrayList> tmp = new ArrayList>();
+ ArrayList bb = ppTermsBitSets(rpp, rptTerms);
+ unionTermGroups(bb);
+ HashMap tg = termGroups(rptTerms, bb);
+ HashSet distinctGroupIDs = new HashSet(tg.values());
+ for (int i=0; i());
+ }
+ for (PhrasePositions pp : rpp) {
+ for (Term t: pp.terms) {
+ if (rptTerms.containsKey(t)) {
+ int g = tg.get(t);
+ tmp.get(g).add(pp);
+ assert pp.rptGroup==-1 || pp.rptGroup==g;
+ pp.rptGroup = g;
}
}
}
- nrPps = ppsA.toArray(new PhrasePositions[0]);
- pq = new PhraseQueue(nrPps.length);
+ for (HashSet hs : tmp) {
+ res.add(new ArrayList(hs));
+ }
}
-
- //printPositions(System.err, "Init: 3: Aft check-repeats");
-
- // with repeats must advance some repeating pp's so they all start with differing tp's
- if (hasRepeats) {
- for (PhrasePositions pp: nrPps) {
- if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) {
- return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc)
+ return res;
+ }
+
+ /** Actual position in doc of a PhrasePosition; relies on the fact that position = tpPos - offset. */
+ private final int tpPos(PhrasePositions pp) {
+ return pp.position + pp.offset;
+ }
+
+ /** find repeating terms and assign them ordinal values */
+ private LinkedHashMap repeatingTerms() {
+ LinkedHashMap tord = new LinkedHashMap();
+ HashMap tcnt = new HashMap();
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ for (Term t : pp.terms) {
+ Integer cnt0 = tcnt.get(t);
+ Integer cnt = cnt0==null ? new Integer(1) : new Integer(1+cnt0.intValue());
+ tcnt.put(t, cnt);
+ if (cnt==2) {
+ tord.put(t,tord.size());
}
}
}
-
- //printPositions(System.err, "Init: 4: Aft advance-repeats");
-
- // build queue from non repeating pps
- pq.clear();
- for (PhrasePositions pp: nrPps) {
- if (pp.position > end) {
- end = pp.position;
+ return tord;
+ }
+
+ /** find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts */
+ private PhrasePositions[] repeatingPPs(HashMap rptTerms) {
+ ArrayList rp = new ArrayList();
+ for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+ for (Term t : pp.terms) {
+ if (rptTerms.containsKey(t)) {
+ rp.add(pp);
+ hasMultiTermRpts |= (pp.terms.length > 1);
+ break;
+ }
}
- pq.add(pp);
}
-
- return end;
+ return rp.toArray(new PhrasePositions[0]);
}
- /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */
- private final int tpPos(PhrasePositions pp) {
- return pp.position + pp.offset;
+ /** bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal value is set */
+ private ArrayList ppTermsBitSets(PhrasePositions[] rpp, HashMap tord) {
+ ArrayList bb = new ArrayList(rpp.length);
+ for (PhrasePositions pp : rpp) {
+ OpenBitSet b = new OpenBitSet(tord.size());
+ Integer ord;
+ for (Term t: pp.terms) {
+ if ((ord=tord.get(t))!=null) {
+ b.set(ord);
+ }
+ }
+ bb.add(b);
+ }
+ return bb;
}
-// private void printPositions(PrintStream ps, String title) {
-// ps.println();
-// ps.println("---- "+title);
-// int k = 0;
-// if (nrPps!=null) {
-// for (PhrasePositions pp: nrPps) {
-// ps.println(" " + k++ + " " + pp);
-// }
-// } else {
-// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) {
-// ps.println(" " + k++ + " " + pp);
-// }
-// }
-// }
-
+ /** union (term group) bit-sets until they are disjoint (O(n^2)), so that each group has different terms */
+ private void unionTermGroups(ArrayList bb) {
+ int incr;
+ for (int i=0; i termGroups(LinkedHashMap tord, ArrayList bb) throws IOException {
+ HashMap tg = new HashMap();
+ Term[] t = tord.keySet().toArray(new Term[0]);
+ for (int i=0; i i = termArrays.iterator();
- while (i.hasNext()) {
- Term[] terms = i.next();
+ int lastPos = -1;
+ boolean first = true;
+ for (int i=0; i 1) {
buffer.append("(");
for (int j = 0; j < terms.length; j++) {
@@ -340,8 +350,7 @@
} else {
buffer.append(terms[0].text());
}
- if (i.hasNext())
- buffer.append(" ");
+ lastPos = position;
}
buffer.append("\"");
Index: lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java (revision 1297129)
+++ lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java (working copy)
@@ -49,11 +49,11 @@
// this allows to easily identify a matching (exact) phrase
// when all PhrasePositions have exactly the same position.
if (postings.length > 0) {
- min = new PhrasePositions(postings[0].postings, postings[0].position, 0);
+ min = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms);
max = min;
max.doc = -1;
for (int i = 1; i < postings.length; i++) {
- PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
+ PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
max.next = pp;
max = pp;
max.doc = -1;
Index: lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java (revision 1297129)
+++ lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java (working copy)
@@ -18,6 +18,7 @@
*/
import java.io.IOException;
+import java.util.Arrays;
import java.util.Set;
import java.util.ArrayList;
@@ -122,23 +123,46 @@
final TermPositions postings;
final int docFreq;
final int position;
- final Term term;
+ final Term[] terms;
+ final int nTerms; // for faster comparisons
- public PostingsAndFreq(TermPositions postings, int docFreq, int position, Term term) {
+ public PostingsAndFreq(TermPositions postings, int docFreq, int position, Term... terms) {
this.postings = postings;
this.docFreq = docFreq;
this.position = position;
- this.term = term;
+ nTerms = terms==null ? 0 : terms.length;
+ if (nTerms>0) {
+ if (terms.length==1) {
+ this.terms = terms;
+ } else {
+ Term[] terms2 = new Term[terms.length];
+ System.arraycopy(terms, 0, terms2, 0, terms.length);
+ Arrays.sort(terms2);
+ this.terms = terms2;
+ }
+ } else {
+ this.terms = null;
+ }
}
public int compareTo(PostingsAndFreq other) {
- if (docFreq == other.docFreq) {
- if (position == other.position) {
- return term.compareTo(other.term);
- }
+ if (docFreq != other.docFreq) {
+ return docFreq - other.docFreq;
+ }
+ if (position != other.position) {
return position - other.position;
}
- return docFreq - other.docFreq;
+ if (nTerms != other.nTerms) {
+ return nTerms - other.nTerms;
+ }
+ if (nTerms == 0) {
+ return 0;
+ }
+ for (int i=0; i