Index: src/java/org/apache/lucene/search/SloppyPhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 480218) +++ src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy) @@ -20,16 +20,32 @@ import org.apache.lucene.index.TermPositions; import java.io.IOException; +import java.util.HashMap; final class SloppyPhraseScorer extends PhraseScorer { private int slop; + private PhrasePositions repeats[]; + private boolean checkedRepeats; - SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, int slop, byte[] norms) { - super(weight, tps, positions, similarity, norms); + super(weight, tps, offsets, similarity, norms); this.slop = slop; } + /** + * Score a candidate doc for all slop-valid position-combinations (matches) + * encountered while traversing/hopping the PhrasePositions. + * The score contribution of a match depends on the distance: highest score for + * distance=0 (exact match), score gets lower as distance gets higher. + * example: for query "a b"~2, a document "x a b a y" can be scored twice: + * once for "a b" (distance=0), and once for "b a" (distance=2). + * Pssibly not all valid combinations are encountered, because for efficiency + * we always propagate the least PhrasePosition. This allows to base on + * PriorityQueue and move frward faster. As result, for example, document "a b c b a" + * would score differently for queries "a b c"~4 and "b c a"~4, although + * they really are equivalent. We may want to fix this in the future. + */ protected final float phraseFreq() throws IOException { pq.clear(); int end = 0; @@ -40,29 +56,77 @@ pq.put(pp); // build pq from list } + // possible repetitions? + if (!checkedRepeats) { + initRepeats(); + checkedRepeats = true; + } + float freq = 0.0f; boolean done = false; do { PhrasePositions pp = (PhrasePositions) pq.pop(); - int start = pp.position; - int next = ((PhrasePositions) pq.top()).position; - for (int pos = start; pos <= next; pos = pp.position) { - start = pos; // advance pp to min window - if (!pp.nextPosition()) { - done = true; // ran out of a term -- done - break; - } + if (!pp.repeats || termPositionsDiffer(pp)) { + int distance = end - pp.position; + if (distance <= slop) + freq += getSimilarity().sloppyFreq(distance); // score match } - - int matchLength = end - start; - if (matchLength <= slop) - freq += getSimilarity().sloppyFreq(matchLength); // score match - - if (pp.position > end) - end = pp.position; - pq.put(pp); // restore pq + do { + done = !pp.nextPosition(); + } while (!done && (pp.repeats && !termPositionsDiffer(pp))); + if (!done) + if (pp.position > end) + end = pp.position; + pq.put(pp); // restore pq } while (!done); return freq; } + + + /** + * One time initializatin for this scorer, once all PhrasePositions have + * been placed for the first candidate matching document. + * Put in repeats[] each pp that has another pp with same position in the doc. + * Also mark each such pp by pp.repeats = true. + * Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient. + * In particular, this allows to score queries with no repetiotions with no overhead due to this computation. + * Example 1 - query with no repetitions: "ho my"~2 + * Example 2 - query with repetitions: "ho my my"~2 + * Example 3 - query with repetitions: "my ho my"~2 + */ + private void initRepeats() { + HashMap m = null; + for (PhrasePositions pp = first; pp != null; pp = pp.next) { + int tpPos = pp.position + pp.offset; + for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) { + int tpPos2 = pp2.position + pp2.offset; + if (tpPos2 == tpPos) { + if (m == null) + m = new HashMap(); + pp.repeats = true; + pp2.repeats = true; + m.put(pp,null); + m.put(pp2,null); + } + } + } + if (m!=null) + repeats = (PhrasePositions[]) m.keySet().toArray(new PhrasePositions[0]); + } + + // disalow two pp's to have the same tp position, so that same word twice + // in query would go elswhere in the matched doc + private boolean termPositionsDiffer(PhrasePositions pp) { + int tpPos = pp.position + pp.offset; + for (int i = 0; i < repeats.length; i++) { + PhrasePositions pp2 = repeats[i]; + if (pp2 == pp) + continue; + int tpPos2 = pp2.position + pp2.offset; + if (tpPos2 == tpPos) + return false; + } + return true; + } } Index: src/java/org/apache/lucene/search/PhrasePositions.java =================================================================== --- src/java/org/apache/lucene/search/PhrasePositions.java (revision 480218) +++ src/java/org/apache/lucene/search/PhrasePositions.java (working copy) @@ -20,6 +20,9 @@ import java.io.IOException; import org.apache.lucene.index.*; +/** + * Position of a term in a document that takes into account the term offset within the phrase. + */ final class PhrasePositions { int doc; // current doc int position; // position in doc @@ -27,6 +30,7 @@ int offset; // position in phrase TermPositions tp; // stream of positions PhrasePositions next; // used to make lists + boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1) PhrasePositions(TermPositions t, int o) { tp = t; @@ -61,6 +65,12 @@ nextPosition(); } + /** + * Go to next location of this term current document, and set + * position as location - offset, so that a + * matching exact phrase is easily identified when all PhrasePositions + * have exactly the same position. + */ final boolean nextPosition() throws IOException { if (count-- > 0) { // read subsequent pos's position = tp.nextPosition() - offset; Index: src/java/org/apache/lucene/search/PhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/PhraseScorer.java (revision 480218) +++ src/java/org/apache/lucene/search/PhraseScorer.java (working copy) @@ -21,6 +21,16 @@ import org.apache.lucene.index.*; +/** Expert: Scoring functionality for phrase queries. + *
A document is considered matching if it contains the phrase-query terms + * at "valid" positons. What "valid positions" are + * depends on the type of the phrase query: for an exact phrase query terms are required + * to appear in adjacent locations, while for a sloppy phrase query some distance between + * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes + * is invoked for each doucment containing all the phrase query terms, in order to + * compute the frequency of the phrase query in that document. A non zero frequency + * means a match. + */ abstract class PhraseScorer extends Scorer { private Weight weight; protected byte[] norms; @@ -31,19 +41,23 @@ protected PhraseQueue pq; protected PhrasePositions first, last; - private float freq; + private float freq; //prhase frequency in current doc as computed by phraseFreq(). - PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) { super(similarity); this.norms = norms; this.weight = weight; this.value = weight.getValue(); - // convert tps to a list + // convert tps to a list of phrase positions. + // note: phrase-position differs from term-position in that its position + // reflects the phrase offset: pp.pos = tp.pos - offset. + // this allows to easily identify a matching (exact) phrase + // when all PhrasePositions have exactly the same position. for (int i = 0; i < tps.length; i++) { - PhrasePositions pp = new PhrasePositions(tps[i], positions[i]); + PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]); if (last != null) { // add next to end of list last.next = pp; } else @@ -94,6 +108,7 @@ } public boolean skipTo(int target) throws IOException { + firstTime = false; for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { more = pp.skipTo(target); } @@ -102,6 +117,13 @@ return doNext(); } + /** + * For a document containing all the phrase query terms, compute the + * frequency of the phrase in that document. + * A non zero frequency means a match. + *
Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations. + * @return frequency of the phrase in current doc, 0 if not found. + */ protected abstract float phraseFreq() throws IOException; private void init() throws IOException { Index: src/java/org/apache/lucene/search/ExactPhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 480218) +++ src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy) @@ -22,19 +22,22 @@ final class ExactPhraseScorer extends PhraseScorer { - ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) { - super(weight, tps, positions, similarity, norms); + super(weight, tps, offsets, similarity, norms); } protected final float phraseFreq() throws IOException { // sort list with pq + pq.clear(); for (PhrasePositions pp = first; pp != null; pp = pp.next) { pp.firstPosition(); pq.put(pp); // build pq from list } pqToList(); // rebuild list from pq + // for counting how many times the exact phrase is found in current document, + // just count how many times all PhrasePosition's have exactly the same position. int freq = 0; do { // find position w/ all terms while (first.position < last.position) { // scan forward in first Index: src/java/org/apache/lucene/search/PhraseQueue.java =================================================================== --- src/java/org/apache/lucene/search/PhraseQueue.java (revision 480218) +++ src/java/org/apache/lucene/search/PhraseQueue.java (working copy) @@ -28,7 +28,12 @@ PhrasePositions pp1 = (PhrasePositions)o1; PhrasePositions pp2 = (PhrasePositions)o2; if (pp1.doc == pp2.doc) - return pp1.position < pp2.position; + if (pp1.position == pp2.position) + // same doc and pp.position, so decide by actual term positions. + // rely on: pp.position == tp.position - offset. + return pp1.offset < pp2.offset; + else + return pp1.position < pp2.position; else return pp1.doc < pp2.doc; }