Index: src/java/org/apache/lucene/search/SloppyPhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 480218) +++ src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy) @@ -20,38 +20,58 @@ import org.apache.lucene.index.TermPositions; import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; final class SloppyPhraseScorer extends PhraseScorer { private int slop; + private PhrasePositions repeats[]; + private boolean checkedRepeats; - SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, int slop, byte[] norms) { - super(weight, tps, positions, similarity, norms); + super(weight, tps, offsets, similarity, norms); this.slop = slop; } + /** + * Score a candidate doc for all slop-valid position-combinations (matches) + * encountered while traversing/hopping the PhrasePositions. + *
The score contribution of a match depends on the distance: + *
- highest score for distance=0 (exact match). + *
- score gets lower as distance gets higher. + *
Example: for query "a b"~2, a document "x a b a y" can be scored twice: + * once for "a b" (distance=0), and once for "b a" (distance=2). + *
Pssibly not all valid combinations are encountered, because for efficiency + * we always propagate the least PhrasePosition. This allows to base on + * PriorityQueue and move forward faster. + * As result, for example, document "a b c b a" + * would score differently for queries "a b c"~4 and "b c a"~4, although + * they really are equivalent. + * Similarly, for doc "a b c b a f g", query "c b"~2 + * would get same score as "g f"~2, although "c b"~2 could be matched twice. + * We may want to fix this in the future (currently not, for performance reasons). + */ protected final float phraseFreq() throws IOException { - pq.clear(); - int end = 0; - for (PhrasePositions pp = first; pp != null; pp = pp.next) { - pp.firstPosition(); - if (pp.position > end) - end = pp.position; - pq.put(pp); // build pq from list - } - + int end = initPhrasePositions(); + float freq = 0.0f; - boolean done = false; - do { + boolean done = (end<0); + while (!done) { PhrasePositions pp = (PhrasePositions) pq.pop(); int start = pp.position; int next = ((PhrasePositions) pq.top()).position; - for (int pos = start; pos <= next; pos = pp.position) { - start = pos; // advance pp to min window + + boolean tpsDiffer = true; + for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) { + if (pos<=next && tpsDiffer) + start = pos; // advance pp to min window if (!pp.nextPosition()) { - done = true; // ran out of a term -- done + done = true; // ran out of a term -- done break; } + tpsDiffer = !pp.repeats || termPositionsDiffer(pp); } int matchLength = end - start; @@ -61,8 +81,109 @@ if (pp.position > end) end = pp.position; pq.put(pp); // restore pq - } while (!done); + } return freq; } + + + /** + * Init PhrasePositions in place. + * There is a one time initializatin for this scorer: + *
- Put in repeats[] each pp that has another pp with same position in the doc. + *
- Also mark each such pp by pp.repeats = true. + *
Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient. + * In particular, this allows to score queries with no repetiotions with no overhead due to this computation. + *
- Example 1 - query with no repetitions: "ho my"~2 + *
- Example 2 - query with repetitions: "ho my my"~2 + *
- Example 3 - query with repetitions: "my ho my"~2 + *
Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection. + * @return end (max position), or -1 if any term ran out (i.e. done) + * @throws IOException + */ + private int initPhrasePositions() throws IOException { + int end = 0; + + // no repeats at all (most common case is also the simplest one) + if (checkedRepeats && repeats==null) { + // build queue from list + pq.clear(); + for (PhrasePositions pp = first; pp != null; pp = pp.next) { + pp.firstPosition(); + if (pp.position > end) + end = pp.position; + pq.put(pp); // build pq from list + } + return end; + } + + // position the pp's + for (PhrasePositions pp = first; pp != null; pp = pp.next) + pp.firstPosition(); + + // one time initializatin for this scorer + if (!checkedRepeats) { + checkedRepeats = true; + // check for repeats + HashMap m = null; + for (PhrasePositions pp = first; pp != null; pp = pp.next) { + int tpPos = pp.position + pp.offset; + for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) { + int tpPos2 = pp2.position + pp2.offset; + if (tpPos2 == tpPos) { + if (m == null) + m = new HashMap(); + pp.repeats = true; + pp2.repeats = true; + m.put(pp,null); + m.put(pp2,null); + } + } + } + if (m!=null) + repeats = (PhrasePositions[]) m.keySet().toArray(new PhrasePositions[0]); + } + + // with repeats must advance some repeating pp's so they all start with differing tp's + if (repeats!=null) { + // must propagate higher offsets first. + Arrays.sort(repeats, new Comparator() { + public int compare(Object x, Object y) { + return ((PhrasePositions) y).offset - ((PhrasePositions) x).offset; + }}); + // now advance them + for (int i = 0; i < repeats.length; i++) { + PhrasePositions pp = repeats[i]; + while (!termPositionsDiffer(pp)) { + if (!pp.nextPosition()) + return -1; // ran out of a term -- done + } + } + } + + // build queue from list + pq.clear(); + for (PhrasePositions pp = first; pp != null; pp = pp.next) { + if (pp.position > end) + end = pp.position; + pq.put(pp); // build pq from list + } + + return end; + } + + // disalow two pp's to have the same tp position, so that same word twice + // in query would go elswhere in the matched doc + private boolean termPositionsDiffer(PhrasePositions pp) { + int tpPos = pp.position + pp.offset; + for (int i = 0; i < repeats.length; i++) { + PhrasePositions pp2 = repeats[i]; + if (pp2 == pp) + continue; + int tpPos2 = pp2.position + pp2.offset; + if (tpPos2 == tpPos) + return false; + } + return true; + } } Index: src/java/org/apache/lucene/search/PhrasePositions.java =================================================================== --- src/java/org/apache/lucene/search/PhrasePositions.java (revision 480218) +++ src/java/org/apache/lucene/search/PhrasePositions.java (working copy) @@ -20,6 +20,9 @@ import java.io.IOException; import org.apache.lucene.index.*; +/** + * Position of a term in a document that takes into account the term offset within the phrase. + */ final class PhrasePositions { int doc; // current doc int position; // position in doc @@ -27,6 +30,7 @@ int offset; // position in phrase TermPositions tp; // stream of positions PhrasePositions next; // used to make lists + boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1) PhrasePositions(TermPositions t, int o) { tp = t; @@ -61,6 +65,12 @@ nextPosition(); } + /** + * Go to next location of this term current document, and set + * position as location - offset, so that a + * matching exact phrase is easily identified when all PhrasePositions + * have exactly the same position. + */ final boolean nextPosition() throws IOException { if (count-- > 0) { // read subsequent pos's position = tp.nextPosition() - offset; Index: src/java/org/apache/lucene/search/PhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/PhraseScorer.java (revision 480218) +++ src/java/org/apache/lucene/search/PhraseScorer.java (working copy) @@ -21,6 +21,16 @@ import org.apache.lucene.index.*; +/** Expert: Scoring functionality for phrase queries. + *
A document is considered matching if it contains the phrase-query terms + * at "valid" positons. What "valid positions" are + * depends on the type of the phrase query: for an exact phrase query terms are required + * to appear in adjacent locations, while for a sloppy phrase query some distance between + * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes + * is invoked for each doucment containing all the phrase query terms, in order to + * compute the frequency of the phrase query in that document. A non zero frequency + * means a match. + */ abstract class PhraseScorer extends Scorer { private Weight weight; protected byte[] norms; @@ -31,19 +41,23 @@ protected PhraseQueue pq; protected PhrasePositions first, last; - private float freq; + private float freq; //prhase frequency in current doc as computed by phraseFreq(). - PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) { super(similarity); this.norms = norms; this.weight = weight; this.value = weight.getValue(); - // convert tps to a list + // convert tps to a list of phrase positions. + // note: phrase-position differs from term-position in that its position + // reflects the phrase offset: pp.pos = tp.pos - offset. + // this allows to easily identify a matching (exact) phrase + // when all PhrasePositions have exactly the same position. for (int i = 0; i < tps.length; i++) { - PhrasePositions pp = new PhrasePositions(tps[i], positions[i]); + PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]); if (last != null) { // add next to end of list last.next = pp; } else @@ -94,6 +108,7 @@ } public boolean skipTo(int target) throws IOException { + firstTime = false; for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { more = pp.skipTo(target); } @@ -102,6 +117,13 @@ return doNext(); } + /** + * For a document containing all the phrase query terms, compute the + * frequency of the phrase in that document. + * A non zero frequency means a match. + *
Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations. + * @return frequency of the phrase in current doc, 0 if not found. + */ protected abstract float phraseFreq() throws IOException; private void init() throws IOException { Index: src/java/org/apache/lucene/search/ExactPhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 480218) +++ src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy) @@ -22,19 +22,22 @@ final class ExactPhraseScorer extends PhraseScorer { - ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) { - super(weight, tps, positions, similarity, norms); + super(weight, tps, offsets, similarity, norms); } protected final float phraseFreq() throws IOException { // sort list with pq + pq.clear(); for (PhrasePositions pp = first; pp != null; pp = pp.next) { pp.firstPosition(); pq.put(pp); // build pq from list } pqToList(); // rebuild list from pq + // for counting how many times the exact phrase is found in current document, + // just count how many times all PhrasePosition's have exactly the same position. int freq = 0; do { // find position w/ all terms while (first.position < last.position) { // scan forward in first Index: src/java/org/apache/lucene/search/PhraseQueue.java =================================================================== --- src/java/org/apache/lucene/search/PhraseQueue.java (revision 480218) +++ src/java/org/apache/lucene/search/PhraseQueue.java (working copy) @@ -28,7 +28,12 @@ PhrasePositions pp1 = (PhrasePositions)o1; PhrasePositions pp2 = (PhrasePositions)o2; if (pp1.doc == pp2.doc) - return pp1.position < pp2.position; + if (pp1.position == pp2.position) + // same doc and pp.position, so decide by actual term positions. + // rely on: pp.position == tp.position - offset. + return pp1.offset < pp2.offset; + else + return pp1.position < pp2.position; else return pp1.doc < pp2.doc; } Index: src/test/org/apache/lucene/search/TestPhraseQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestPhraseQuery.java (revision 480218) +++ src/test/org/apache/lucene/search/TestPhraseQuery.java (working copy) @@ -35,6 +35,10 @@ * @author Erik Hatcher */ public class TestPhraseQuery extends TestCase { + + /** threshold for comparing floats */ + public static final float SCORE_COMP_THRESH = 1e-6f; + private IndexSearcher searcher; private PhraseQuery query; private RAMDirectory directory; @@ -57,8 +61,17 @@ doc.add(new Field("repeated", "this is a repeated field - first part", Field.Store.YES, Field.Index.TOKENIZED)); Fieldable repeatedField = new Field("repeated", "second part of a repeated field", Field.Store.YES, Field.Index.TOKENIZED); doc.add(repeatedField); + doc.add(new Field("palindrome", "one two three two one", Field.Store.YES, Field.Index.TOKENIZED)); writer.addDocument(doc); + doc = new Document(); + doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED)); + writer.addDocument(doc); + writer.optimize(); writer.close(); @@ -341,12 +354,169 @@ query.add(new Term("repeated", "part")); query.add(new Term("repeated", "second")); query.add(new Term("repeated", "part")); + query.setSlop(100); + + Hits hits = searcher.search(query); + assertEquals("slop of 100 just right", 1, hits.length()); + QueryUtils.check(query,searcher); + query.setSlop(99); + hits = searcher.search(query); + assertEquals("slop of 99 not enough", 0, hits.length()); + QueryUtils.check(query,searcher); + } + + // work on two docs like this: "phrase exist notexist exist found" + public void testNonExistingWrappedPhrase() throws IOException { + // non wrapped phrase that exists in 2 docs + query.add(new Term("nonexist", "phrase")); + query.add(new Term("nonexist", "notexist")); + query.add(new Term("nonexist", "found")); + query.setSlop(2); // would be found this way + Hits hits = searcher.search(query); - assertEquals(0, hits.length()); + assertEquals("nonwrapped phrase exists in 2 docs", 2, hits.length()); QueryUtils.check(query,searcher); + // wrapped phrase that exists in 2 docs + query = new PhraseQuery(); + query.add(new Term("nonexist", "phrase")); + query.add(new Term("nonexist", "exist")); + query.add(new Term("nonexist", "exist")); + query.setSlop(1); // would be found + + hits = searcher.search(query); + assertEquals("wrapped phrase exists in two docs", 2, hits.length()); + QueryUtils.check(query,searcher); + + // wrapped phrase that does not exist in any doc + query = new PhraseQuery(); + query.add(new Term("nonexist", "phrase")); + query.add(new Term("nonexist", "notexist")); + query.add(new Term("nonexist", "phrase")); + query.setSlop(1000); // would not be found no matter how high the slop is + + hits = searcher.search(query); + assertEquals("unexisting wrapped phrase does not exist in any doc", 0, hits.length()); + QueryUtils.check(query,searcher); } + /** + * Phrase of size 2 occuriong twice, once in order and once in reverse, + * because doc is a palyndrome, is counted twice. + * Also, in this case order in query does not matter. + * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same. + */ + public void testPalyndrome2() throws Exception { + + // search on non palyndrome, find phrase with no slop, using exact phrase scorer + query.setSlop(0); // to use exact phrase scorer + query.add(new Term("field", "two")); + query.add(new Term("field", "three")); + Hits hits = searcher.search(query); + assertEquals("phrase found with exact phrase scorer", 1, hits.length()); + float score0 = hits.score(0); + //System.out.println("(exact) field: two three: "+score0); + QueryUtils.check(query,searcher); + + // search on non palyndrome, find phrase with slop 2, though no slop required here. + query.setSlop(2); // to use sloppy scorer + hits = searcher.search(query); + assertEquals("just sloppy enough", 1, hits.length()); + float score1 = hits.score(0); + //System.out.println("(sloppy) field: two three: "+score1); + assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH); + QueryUtils.check(query,searcher); + + // search ordered in palyndrome, find it twice + query = new PhraseQuery(); + query.setSlop(2); // must be at least two for both ordered and reversed to match + query.add(new Term("palindrome", "two")); + query.add(new Term("palindrome", "three")); + hits = searcher.search(query); + assertEquals("just sloppy enough", 1, hits.length()); + float score2 = hits.score(0); + //System.out.println("palindrome: two three: "+score2); + QueryUtils.check(query,searcher); + + //TODO uncomment this if sloppy-phrase scoring implementation is fixed to cover this cases. (issue 736) + //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESHmaxDiff) { - throw new RuntimeException("ERROR matching docs:" - +"\n\tscorer.more=" + more + " doc="+sdoc[0] + " score="+scorerScore - +"\n\thitCollector.doc=" + doc + " score="+score - +"\n\t Scorer=" + scorer - +"\n\t Query=" + q - +"\n\t Searcher=" + s - ); + final int skip_op = 0; + final int next_op = 1; + final int orders [][] = { + {skip_op}, + {next_op}, + {skip_op, next_op}, + {next_op, skip_op}, + {skip_op, skip_op, next_op, next_op}, + {next_op, next_op, skip_op, skip_op}, + {skip_op, skip_op, skip_op, next_op, next_op}, + }; + for (int k = 0; k < orders.length; k++) { + final int order[] = orders[k]; + //System.out.print("Order:");for (int i = 0; i < order.length; i++) System.out.print(order[i]==skip_op ? " skip()":" next()"); System.out.println(); + final int opidx[] = {0}; + + final Weight w = q.weight(s); + final Scorer scorer = w.scorer(s.getIndexReader()); + + // FUTURE: ensure scorer.doc()==-1 + + final int[] sdoc = new int[] {-1}; + final float maxDiff = 1e-5f; + s.search(q,new HitCollector() { + public void collect(int doc, float score) { + try { + int op = order[(opidx[0]++)%order.length]; + //System.out.println(op==skip_op ? "skip("+(sdoc[0]+1)+")":"next()"); + boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next(); + sdoc[0] = scorer.doc(); + float scorerScore = scorer.score(); + float scoreDiff = Math.abs(score-scorerScore); + if (more==false || doc != sdoc[0] || scoreDiff>maxDiff) { + StringBuffer sbord = new StringBuffer(); + for (int i = 0; i < order.length; i++) + sbord.append(order[i]==skip_op ? " skip()":" next()"); + throw new RuntimeException("ERROR matching docs:" + +"\n\tscorer.more=" + more + " doc="+sdoc[0] + " score="+scorerScore + +"\n\thitCollector.doc=" + doc + " score="+score + +"\n\t Scorer=" + scorer + +"\n\t Query=" + q + +"\n\t Searcher=" + s + +"\n\t Order=" + sbord + ); + } + } catch (IOException e) { + throw new RuntimeException(e); } - } catch (IOException e) { - throw new RuntimeException(e); } - } - }); - - // make sure next call to scorer is false. - TestCase.assertFalse((which[0]++&0x02)==0 ? scorer.skipTo(sdoc[0]+1) : scorer.next()); + }); + + // make sure next call to scorer is false. + int op = order[(opidx[0]++)%order.length]; + //System.out.println(op==skip_op ? "last: skip()":"last: next()"); + boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next(); + TestCase.assertFalse(more); + } } }