Index: lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (revision 1099461) +++ lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (working copy) @@ -17,17 +17,18 @@ * limitations under the License. */ +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.store.Directory; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; @@ -412,7 +413,7 @@ mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0); } TopDocs hits = s.search(mpq, 2); - assert hits.totalHits == 2; + assertEquals(2, hits.totalHits); assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5); /* for(int hit=0;hit- Put in repeats[] each pp that has another pp with same position in the doc. + * This relies on that the position in PP is computed as (TP.position - offset) and + * so by adding offset we actually compare positions and identify that the two are + * the same term. + * An exclusion to this is two distinct terms in the same offset in query and same + * position in doc. This case is detected by comparing just the (query) offsets, + * and two such PPs are not considered "repeating". *
- Also mark each such pp by pp.repeats = true. *
Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient. * In particular, this allows to score queries with no repetitions with no overhead due to this computation. @@ -145,23 +151,26 @@ if (!checkedRepeats) { checkedRepeats = true; // check for repeats - HashMap m = null; + HashSet m = null; for (PhrasePositions pp = first; pp != null; pp = pp.next) { int tpPos = pp.position + pp.offset; for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) { + if (pp.offset == pp2.offset) { + continue; // not a repetition: the two PPs are originally in same offset in the query! + } int tpPos2 = pp2.position + pp2.offset; if (tpPos2 == tpPos) { if (m == null) - m = new HashMap(); + m = new HashSet(); pp.repeats = true; pp2.repeats = true; - m.put(pp,null); - m.put(pp2,null); + m.add(pp); + m.add(pp2); } } } if (m!=null) - repeats = m.keySet().toArray(new PhrasePositions[0]); + repeats = m.toArray(new PhrasePositions[0]); } // with repeats must advance some repeating pp's so they all start with differing tp's @@ -204,11 +213,16 @@ int tpPos = pp.position + pp.offset; for (int i = 0; i < repeats.length; i++) { PhrasePositions pp2 = repeats[i]; - if (pp2 == pp) + if (pp2 == pp) { continue; + } + if (pp.offset == pp2.offset) { + continue; // not a repetition: the two PPs are originally in same offset in the query! + } int tpPos2 = pp2.position + pp2.offset; - if (tpPos2 == tpPos) + if (tpPos2 == tpPos) { return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset. + } } return null; }