Index: lucene/core/src/java/org/apache/lucene/index/TermContext.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/TermContext.java (revision 1678900) +++ lucene/core/src/java/org/apache/lucene/index/TermContext.java (working copy) @@ -117,16 +117,31 @@ * should be derived from a {@link IndexReaderContext}'s leaf ord. */ public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) { + register(state, ord); + accumulateStatistics(docFreq, totalTermFreq); + } + + /** + * Expert: Registers and associates a {@link TermState} with a leaf ordinal. The + * leaf ordinal should be derived from a {@link IndexReaderContext}'s leaf ord. + * In contrast to {@link #register(TermState, int, int, long)}, this method + * does NOT update term statistics. + */ + public void register(TermState state, final int ord) { assert state != null : "state must not be null"; assert ord >= 0 && ord < states.length; assert states[ord] == null : "state for ord: " + ord + " already registered"; + states[ord] = state; + } + + /** Expert: Accumulate term statistics. totalTermFreq becomes -1 if either the stored or the supplied value is negative. 
*/ + public void accumulateStatistics(final int docFreq, final long totalTermFreq) { this.docFreq += docFreq; if (this.totalTermFreq >= 0 && totalTermFreq >= 0) this.totalTermFreq += totalTermFreq; else this.totalTermFreq = -1; - states[ord] = state; } /** Index: lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java (revision 1678900) +++ lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java (working copy) @@ -98,7 +98,7 @@ this.prefixLength = prefixLength; this.transpositions = transpositions; this.maxExpansions = maxExpansions; - setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions)); + setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions)); } /** Index: lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java (revision 1678900) +++ lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -113,7 +113,7 @@ * * @see #setRewriteMethod */ public final static RewriteMethod SCORING_BOOLEAN_REWRITE = ScoringRewrite.SCORING_BOOLEAN_REWRITE; - + /** Like {@link #SCORING_BOOLEAN_REWRITE} except * scores are not computed. Instead, each matching * document receives a constant score equal to the @@ -171,6 +171,85 @@ /** * A rewrite method that first translates each term into + * {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but adjusts + * the frequencies used for scoring to be blended across the terms, otherwise + * the rarest term typically ranks highest (often not useful e.g. in the set of + * expanded terms in a FuzzyQuery). + * + * <p>

+ * This rewrite method only uses the top scoring terms so it will not overflow + * the boolean max clause count. + * + * @see #setRewriteMethod + */ + public static final class TopTermsBlendedFreqScoringRewrite extends + TopTermsRewrite<BooleanQuery> { + + /** + * Create a TopTermsBlendedFreqScoringRewrite for at most + * <code>size</code> terms. + * <p>

+ * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than + * <code>size</code>, then it will be used instead. + */ + public TopTermsBlendedFreqScoringRewrite(int size) { + super(size); + } + + @Override + protected int getMaxSize() { + return BooleanQuery.getMaxClauseCount(); + } + + @Override + protected BooleanQuery getTopLevelQuery() { + return new BooleanQuery(true); + } + + @Override + protected void addClause(BooleanQuery topLevel, Term term, int docCount, + float boost, TermContext states) { + final TermQuery tq = new TermQuery(term, states); + tq.setBoost(boost); + topLevel.add(tq, BooleanClause.Occur.SHOULD); + } + + @Override + void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) { + if (scoreTerms.length <= 1) { + return; + } + int maxDoc = reader.maxDoc(); + int maxDf = 0; + long maxTtf = 0; + for (ScoreTerm scoreTerm : scoreTerms) { + TermContext ctx = scoreTerm.termState; + int df = ctx.docFreq(); + maxDf = Math.max(df, maxDf); + long ttf = ctx.totalTermFreq(); + maxTtf = Math.max(ttf, maxTtf); + } + + if (maxDf == 0) { + return; // we are done that term doesn't exist at all + } + int artificialDf = Math.min(maxDoc, maxDf); + assert artificialDf >= 0 : "DF must be >= 0"; + + // Use the largest ttf (never less than the doc freq) so no increment below is negative; a negative one would reset ttf to -1 + long artificialTtf = Math.max(maxTtf, maxDf); + + for (int i = 0; i < scoreTerms.length; i++) { + TermContext ctx = scoreTerms[i].termState; + //Add the appropriate increment to increase df and ttf to the chosen artificial values + ctx.accumulateStatistics(artificialDf - ctx.docFreq(), artificialTtf + - ctx.totalTermFreq()); + } + } + } + + /** + * A rewrite method that first translates each term into * {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but the scores * are only computed as the boost. * <p>

Index: lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java (revision 1678900) +++ lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java (working copy) @@ -18,10 +18,10 @@ */ import java.io.IOException; +import java.util.Comparator; import java.util.HashMap; import java.util.Map; import java.util.PriorityQueue; -import java.util.Comparator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; @@ -158,14 +158,19 @@ final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp); + adjustScoreTerms(reader, scoreTerms); + for (final ScoreTerm st : scoreTerms) { final Term term = new Term(query.field, st.bytes.toBytesRef()); - assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term; addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query } return q; } + void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) { + // No-op by default; subclasses may override this to adjust the score terms used in ranking, e.g. to balance IDF. + } + @Override public int hashCode() { return 31 * size; Index: lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java =================================================================== --- lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java (revision 1678900) +++ lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java (working copy) @@ -17,9 +17,9 @@ * limitations under the License. 
*/ +import java.io.IOException; +import java.util.Arrays; import java.util.List; -import java.util.Arrays; -import java.io.IOException; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -28,7 +28,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.StoredDocument; import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.automaton.LevenshteinAutomata; @@ -241,6 +244,89 @@ directory.close(); } + public void testSingleQueryExactMatchScoresHighest() throws Exception { + //See issue LUCENE-329 - IDF shouldn't wreck similarity ranking + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("smith", writer); + addDoc("smith", writer); + addDoc("smith", writer); + addDoc("smith", writer); + addDoc("smith", writer); + addDoc("smith", writer); + addDoc("smythe", writer); + addDoc("smdssasd", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework + writer.close(); + String[] searchTerms = { "smith", "smythe", "smdssasd" }; + for (String searchTerm : searchTerms) { + FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1); + ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; + assertTrue(hits.length > 0); + StoredDocument bestDoc = searcher.doc(hits[0].doc); + String topMatch = bestDoc.get("field"); + assertEquals(searchTerm, topMatch); + if (hits.length > 1) { + StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc); + String worstMatch = 
worstDoc.get("field"); + assertFalse(searchTerm.equals(worstMatch)); + } + } + reader.close(); + directory.close(); + } + + public void testMultipleQueriesIdfWorks() throws Exception { + // With issue LUCENE-329 - it could be argued a MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite + // is the solution as it disables IDF. + // However - IDF is still useful as in this case where there are multiple FuzzyQueries. + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("john smith", writer); + addDoc("john lucero", writer); + addDoc("doug cutting", writer); + addDoc("doug cuttin", writer); + addDoc("john wardle", writer); + addDoc("john vegas", writer); + addDoc("john lydon", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework + + writer.close(); + + BooleanQuery query = new BooleanQuery(); + String commonSearchTerm = "john"; + FuzzyQuery commonQuery = new FuzzyQuery(new Term("field", commonSearchTerm), 2, 1); + query.add(commonQuery, Occur.SHOULD); + + String rareSearchTerm = "cutting"; + FuzzyQuery rareQuery = new FuzzyQuery(new Term("field", rareSearchTerm), 2, 1); + query.add(rareQuery, Occur.SHOULD); + ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; + + // Matches on the rare surname should be worth more than matches on the common forename + assertEquals(7, hits.length); + StoredDocument bestDoc = searcher.doc(hits[0].doc); + String topMatch = bestDoc.get("field"); + assertTrue(topMatch.contains(rareSearchTerm)); + + StoredDocument runnerUpDoc = searcher.doc(hits[1].doc); + String runnerUpMatch = runnerUpDoc.get("field"); + assertTrue(runnerUpMatch.contains("cuttin")); + + StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc); + String worstMatch = worstDoc.get("field"); + 
assertTrue(worstMatch.contains(commonSearchTerm)); + + reader.close(); + directory.close(); + } + /** * MultiTermQuery provides (via attribute) information about which values * must be competitive to enter the priority queue.