Index: src/java/org/apache/lucene/search/SloppyPhraseScorer.java
===================================================================
--- src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 480218)
+++ src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy)
@@ -20,16 +20,32 @@
import org.apache.lucene.index.TermPositions;
import java.io.IOException;
+import java.util.HashMap;
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
+ private PhrasePositions repeats[];
+ private boolean checkedRepeats;
- SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
int slop, byte[] norms) {
- super(weight, tps, positions, similarity, norms);
+ super(weight, tps, offsets, similarity, norms);
this.slop = slop;
}
+ /**
+ * Score a candidate doc for all slop-valid position-combinations (matches)
+ * encountered while traversing/hopping the PhrasePositions.
+ * The score contribution of a match depends on the distance: highest score for
+ * distance=0 (exact match), score gets lower as distance gets higher.
+ * example: for query "a b"~2, a document "x a b a y" can be scored twice:
+ * once for "a b" (distance=0), and once for "b a" (distance=2).
+ * Pssibly not all valid combinations are encountered, because for efficiency
+ * we always propagate the least PhrasePosition. This allows to base on
+ * PriorityQueue and move frward faster. As result, for example, document "a b c b a"
+ * would score differently for queries "a b c"~4 and "b c a"~4, although
+ * they really are equivalent. We may want to fix this in the future.
+ */
protected final float phraseFreq() throws IOException {
pq.clear();
int end = 0;
@@ -40,29 +56,77 @@
pq.put(pp); // build pq from list
}
+ // possible repetitions?
+ if (!checkedRepeats) {
+ initRepeats();
+ checkedRepeats = true;
+ }
+
float freq = 0.0f;
boolean done = false;
do {
PhrasePositions pp = (PhrasePositions) pq.pop();
- int start = pp.position;
- int next = ((PhrasePositions) pq.top()).position;
- for (int pos = start; pos <= next; pos = pp.position) {
- start = pos; // advance pp to min window
- if (!pp.nextPosition()) {
- done = true; // ran out of a term -- done
- break;
- }
+ if (!pp.repeats || termPositionsDiffer(pp)) {
+ int distance = end - pp.position;
+ if (distance <= slop)
+ freq += getSimilarity().sloppyFreq(distance); // score match
}
-
- int matchLength = end - start;
- if (matchLength <= slop)
- freq += getSimilarity().sloppyFreq(matchLength); // score match
-
- if (pp.position > end)
- end = pp.position;
- pq.put(pp); // restore pq
+ do {
+ done = !pp.nextPosition();
+ } while (!done && (pp.repeats && !termPositionsDiffer(pp)));
+ if (!done)
+ if (pp.position > end)
+ end = pp.position;
+ pq.put(pp); // restore pq
} while (!done);
return freq;
}
+
+
+ /**
+ * One time initializatin for this scorer, once all PhrasePositions have
+ * been placed for the first candidate matching document.
+ * Put in repeats[] each pp that has another pp with same position in the doc.
+ * Also mark each such pp by pp.repeats = true.
+ * Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
+ * In particular, this allows to score queries with no repetiotions with no overhead due to this computation.
+ * Example 1 - query with no repetitions: "ho my"~2
+ * Example 2 - query with repetitions: "ho my my"~2
+ * Example 3 - query with repetitions: "my ho my"~2
+ */
+ private void initRepeats() {
+ HashMap m = null;
+ for (PhrasePositions pp = first; pp != null; pp = pp.next) {
+ int tpPos = pp.position + pp.offset;
+ for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
+ int tpPos2 = pp2.position + pp2.offset;
+ if (tpPos2 == tpPos) {
+ if (m == null)
+ m = new HashMap();
+ pp.repeats = true;
+ pp2.repeats = true;
+ m.put(pp,null);
+ m.put(pp2,null);
+ }
+ }
+ }
+ if (m!=null)
+ repeats = (PhrasePositions[]) m.keySet().toArray(new PhrasePositions[0]);
+ }
+
+ // disalow two pp's to have the same tp position, so that same word twice
+ // in query would go elswhere in the matched doc
+ private boolean termPositionsDiffer(PhrasePositions pp) {
+ int tpPos = pp.position + pp.offset;
+ for (int i = 0; i < repeats.length; i++) {
+ PhrasePositions pp2 = repeats[i];
+ if (pp2 == pp)
+ continue;
+ int tpPos2 = pp2.position + pp2.offset;
+ if (tpPos2 == tpPos)
+ return false;
+ }
+ return true;
+ }
}
Index: src/java/org/apache/lucene/search/PhrasePositions.java
===================================================================
--- src/java/org/apache/lucene/search/PhrasePositions.java (revision 480218)
+++ src/java/org/apache/lucene/search/PhrasePositions.java (working copy)
@@ -20,6 +20,9 @@
import java.io.IOException;
import org.apache.lucene.index.*;
+/**
+ * Position of a term in a document that takes into account the term offset within the phrase.
+ */
final class PhrasePositions {
int doc; // current doc
int position; // position in doc
@@ -27,6 +30,7 @@
int offset; // position in phrase
TermPositions tp; // stream of positions
PhrasePositions next; // used to make lists
+ boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
PhrasePositions(TermPositions t, int o) {
tp = t;
@@ -61,6 +65,12 @@
nextPosition();
}
+ /**
+ * Go to next location of this term current document, and set
+ * position as location - offset, so that a
+ * matching exact phrase is easily identified when all PhrasePositions
+ * have exactly the same position.
+ */
final boolean nextPosition() throws IOException {
if (count-- > 0) { // read subsequent pos's
position = tp.nextPosition() - offset;
Index: src/java/org/apache/lucene/search/PhraseScorer.java
===================================================================
--- src/java/org/apache/lucene/search/PhraseScorer.java (revision 480218)
+++ src/java/org/apache/lucene/search/PhraseScorer.java (working copy)
@@ -21,6 +21,16 @@
import org.apache.lucene.index.*;
+/** Expert: Scoring functionality for phrase queries.
+ *
A document is considered matching if it contains the phrase-query terms
+ * at "valid" positons. What "valid positions" are
+ * depends on the type of the phrase query: for an exact phrase query terms are required
+ * to appear in adjacent locations, while for a sloppy phrase query some distance between
+ * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes
+ * is invoked for each doucment containing all the phrase query terms, in order to
+ * compute the frequency of the phrase query in that document. A non zero frequency
+ * means a match.
+ */
abstract class PhraseScorer extends Scorer {
private Weight weight;
protected byte[] norms;
@@ -31,19 +41,23 @@
protected PhraseQueue pq;
protected PhrasePositions first, last;
- private float freq;
+ private float freq; //prhase frequency in current doc as computed by phraseFreq().
- PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
byte[] norms) {
super(similarity);
this.norms = norms;
this.weight = weight;
this.value = weight.getValue();
- // convert tps to a list
+ // convert tps to a list of phrase positions.
+ // note: phrase-position differs from term-position in that its position
+ // reflects the phrase offset: pp.pos = tp.pos - offset.
+ // this allows to easily identify a matching (exact) phrase
+ // when all PhrasePositions have exactly the same position.
for (int i = 0; i < tps.length; i++) {
- PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
+ PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]);
if (last != null) { // add next to end of list
last.next = pp;
} else
@@ -94,6 +108,7 @@
}
public boolean skipTo(int target) throws IOException {
+ firstTime = false;
for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
more = pp.skipTo(target);
}
@@ -102,6 +117,13 @@
return doNext();
}
+ /**
+ * For a document containing all the phrase query terms, compute the
+ * frequency of the phrase in that document.
+ * A non zero frequency means a match.
+ *
Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations.
+ * @return frequency of the phrase in current doc, 0 if not found.
+ */
protected abstract float phraseFreq() throws IOException;
private void init() throws IOException {
Index: src/java/org/apache/lucene/search/ExactPhraseScorer.java
===================================================================
--- src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 480218)
+++ src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy)
@@ -22,19 +22,22 @@
final class ExactPhraseScorer extends PhraseScorer {
- ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
byte[] norms) {
- super(weight, tps, positions, similarity, norms);
+ super(weight, tps, offsets, similarity, norms);
}
protected final float phraseFreq() throws IOException {
// sort list with pq
+ pq.clear();
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
pp.firstPosition();
pq.put(pp); // build pq from list
}
pqToList(); // rebuild list from pq
+ // for counting how many times the exact phrase is found in current document,
+ // just count how many times all PhrasePosition's have exactly the same position.
int freq = 0;
do { // find position w/ all terms
while (first.position < last.position) { // scan forward in first
Index: src/java/org/apache/lucene/search/PhraseQueue.java
===================================================================
--- src/java/org/apache/lucene/search/PhraseQueue.java (revision 480218)
+++ src/java/org/apache/lucene/search/PhraseQueue.java (working copy)
@@ -28,7 +28,12 @@
PhrasePositions pp1 = (PhrasePositions)o1;
PhrasePositions pp2 = (PhrasePositions)o2;
if (pp1.doc == pp2.doc)
- return pp1.position < pp2.position;
+ if (pp1.position == pp2.position)
+ // same doc and pp.position, so decide by actual term positions.
+ // rely on: pp.position == tp.position - offset.
+ return pp1.offset < pp2.offset;
+ else
+ return pp1.position < pp2.position;
else
return pp1.doc < pp2.doc;
}