Index: src/java/org/apache/lucene/search/FuzzyQuery.java =================================================================== --- src/java/org/apache/lucene/search/FuzzyQuery.java (revision 888622) +++ src/java/org/apache/lucene/search/FuzzyQuery.java (working copy) @@ -22,15 +22,17 @@ import org.apache.lucene.util.ToStringUtils; import java.io.IOException; -import java.util.PriorityQueue; /** Implements the fuzzy search query. The similarity measurement * is based on the Levenshtein (edit distance) algorithm. * - * Warning: this query is not very scalable with its default prefix + *

Warning: this query is not very scalable with its default prefix * length of 0 - in this case, *every* term will be enumerated and * cause an edit score calculation. * + *

This query uses {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE) + * as default. So terms will be collected and scored according to their + * edit distance. Only the top terms are used for building the {@link BooleanQuery}. */ public class FuzzyQuery extends MultiTermQuery { @@ -61,6 +63,7 @@ */ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException { this.term = term; + setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE); if (minimumSimilarity >= 1.0f) throw new IllegalArgumentException("minimumSimilarity >= 1"); @@ -75,14 +78,13 @@ this.minimumSimilarity = minimumSimilarity; this.prefixLength = prefixLength; - rewriteMethod = SCORING_BOOLEAN_QUERY_REWRITE; } /** * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}. */ public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException { - this(term, minimumSimilarity, defaultPrefixLength); + this(term, minimumSimilarity, defaultPrefixLength); } /** @@ -111,6 +113,9 @@ @Override protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + if (!termLongEnough) { // can only match if it's exact + return new SingleTermEnum(reader, term); + } return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength); } @@ -120,60 +125,12 @@ public Term getTerm() { return term; } - - @Override - public void setRewriteMethod(RewriteMethod method) { - throw new UnsupportedOperationException("FuzzyQuery cannot change rewrite method"); - } - @Override - public Query rewrite(IndexReader reader) throws IOException { - if(!termLongEnough) { // can only match if it's exact - return new TermQuery(term); - } - - int maxSize = BooleanQuery.getMaxClauseCount(); - PriorityQueue stQueue = new PriorityQueue(1024); - FilteredTermEnum enumerator = getEnum(reader); - try { - ScoreTerm bottomSt = null; - do { - final Term t = enumerator.term(); - if (t == null) break; - ScoreTerm st = new ScoreTerm(t, enumerator.difference()); - if (stQueue.size() < maxSize) { - // record the current bottom item - if (bottomSt == null || st.compareTo(bottomSt) > 0) { - bottomSt = st; - } - // add to PQ, as it is not yet filled up - stQueue.offer(st); - } else { - assert bottomSt != null; - // only add to PQ, if the ScoreTerm is greater than the current bottom, - // as all entries will be enqueued after the current bottom and will never be visible - if (st.compareTo(bottomSt) < 0) { - stQueue.offer(st); - } - } - //System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")"); - } while (enumerator.next()); - } finally { - enumerator.close(); - } - - BooleanQuery query = new BooleanQuery(true); - int size = Math.min(stQueue.size(), maxSize); - for(int i = 0; i < size; i++){ - ScoreTerm st = stQueue.poll(); - TermQuery tq = new TermQuery(st.term); // found a match - tq.setBoost(getBoost() * st.score); // set the boost - query.add(tq, BooleanClause.Occur.SHOULD); // add to query - } - - return query; - } - + /** + * @deprecated This class was used in previous FuzzyQuery implementations, but is now replaced by + * a new rewrite mode {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}. + */ + @Deprecated protected static class ScoreTerm implements Comparable { public Term term; public float score; Index: src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- src/java/org/apache/lucene/search/MultiTermQuery.java (revision 888622) +++ src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -21,8 +21,8 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; +import java.util.PriorityQueue; - import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; @@ -66,7 +66,7 @@ public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException; } - private static final class ConstantScoreFilterRewrite extends RewriteMethod implements Serializable { + private static final class ConstantScoreFilterRewrite extends RewriteMethod { @Override public Query rewrite(IndexReader reader, MultiTermQuery query) { Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); @@ -94,27 +94,47 @@ * @see #setRewriteMethod */ public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite(); - private static class ScoringBooleanQueryRewrite extends RewriteMethod implements Serializable { - @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { - - FilteredTermEnum enumerator = query.getEnum(reader); - BooleanQuery result = new BooleanQuery(true); + private abstract static class BooleanQueryRewrite extends RewriteMethod { + + protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { + final FilteredTermEnum enumerator = query.getEnum(reader); int count = 0; try { do { Term t = enumerator.term(); if (t != null) { - TermQuery tq = new TermQuery(t); // found a match - tq.setBoost(query.getBoost() * enumerator.difference()); // set the boost - result.add(tq, BooleanClause.Occur.SHOULD); // add to query - count++; + if (collector.collect(t, enumerator.difference())) { + count++; + } else { + break; + } } } while (enumerator.next()); } finally { enumerator.close(); } - query.incTotalNumberOfTerms(count); + return count; + } + + protected interface TermCollector { + /** return false to stop collecting */ + boolean collect(Term t, float boost) throws IOException; + } + + } + + private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite { + @Override + public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { + final BooleanQuery result = new BooleanQuery(true); + query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { + public boolean collect(Term t, float boost) { + TermQuery tq = new TermQuery(t); // found a match + tq.setBoost(query.getBoost() * boost); // set the boost + result.add(tq, BooleanClause.Occur.SHOULD); // add to query + return true; + } + })); return result; } @@ -139,12 +159,83 @@ * @see #setRewriteMethod */ public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite(); + private static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite { + @Override + public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + final int maxSize = BooleanQuery.getMaxClauseCount(); + final PriorityQueue stQueue = new PriorityQueue(); + //System.out.println("Query: "+query); + collectTerms(reader, query, new TermCollector() { + public boolean collect(Term t, float boost) { + st.term = t; + st.boost = boost; + stQueue.offer(st); + if (stQueue.size() > maxSize) { + st = stQueue.poll(); + } else { + st = new ScoreTerm(); + } + //System.out.println("current: "+st.term+"("+st.boost+")"); + return true; + } + + // reusable instance + private ScoreTerm st = new ScoreTerm(); + }); + + BooleanQuery bq = new BooleanQuery(true); + for (final ScoreTerm st : stQueue) { + TermQuery tq = new TermQuery(st.term); // found a match + tq.setBoost(query.getBoost() * st.boost); // set the boost + bq.add(tq, BooleanClause.Occur.SHOULD); // add to query + } + query.incTotalNumberOfTerms(bq.clauses().size()); + return bq; + } + + // Make sure we are still a singleton even after deserializing + protected Object readResolve() { + return TOP_TERMS_SCORING_BOOLEAN_REWRITE; + } + + private static class ScoreTerm implements Comparable { + public Term term; + public float boost; + + public int compareTo(ScoreTerm other) { + if (this.boost == other.boost) + return other.term.compareTo(this.term); + else + return Float.compare(this.boost, other.boost); + } + } + } + + /** A rewrite method that first translates each term into + * {@link BooleanClause.Occur#SHOULD} clause in a + * BooleanQuery, and keeps the scores as computed by the + * query. Note that typically such scores are + * meaningless to the user, and require non-trivial CPU + * to compute, so it's almost always better to use {@link + * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead. + * + *

This rewrite mode only uses the top scoring terms + * so it will not overflow the boolean max clause count. + * It is the default rewrite mode for {@link FuzzyQuery}. + * + * @see #setRewriteMethod */ + public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite(); + private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable { @Override public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { - // strip the scores off - Query result = new ConstantScoreQuery(new QueryWrapperFilter(super.rewrite(reader, query))); - result.setBoost(query.getBoost()); + Query result = super.rewrite(reader, query); + assert result instanceof BooleanQuery; + if (!((BooleanQuery) result).clauses().isEmpty()) { + // strip the scores off + result = new ConstantScoreQuery(new QueryWrapperFilter(result)); + result.setBoost(query.getBoost()); + } return result; } @@ -176,7 +267,7 @@ * Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is * used. */ - public static class ConstantScoreAutoRewrite extends RewriteMethod implements Serializable { + public static class ConstantScoreAutoRewrite extends BooleanQueryRewrite { // Defaults derived from rough tests with a 20.0 million // doc Wikipedia index. With more than 350 terms in the @@ -217,55 +308,68 @@ } @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { + // Get the enum and start visiting terms. If we // exhaust the enum before hitting either of the // cutoffs, we use ConstantBooleanQueryRewrite; else, // ConstantFilterRewrite: - final Collection pendingTerms = new ArrayList(); final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - int docVisitCount = 0; - FilteredTermEnum enumerator = query.getEnum(reader); - try { - while(true) { - Term t = enumerator.term(); - if (t != null) { - pendingTerms.add(t); - // Loading the TermInfo from the terms dict here - // should not be costly, because 1) the - // query/filter will load the TermInfo when it - // runs, and 2) the terms dict has a cache: - docVisitCount += reader.docFreq(t); + final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit); + collectTerms(reader, query, col); + + if (col.hasCutOff) { + return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); + } else { + final Query result; + if (col.pendingTerms.isEmpty()) { + result = new BooleanQuery(true); + } else { + BooleanQuery bq = new BooleanQuery(true); + for(Term term : col.pendingTerms) { + TermQuery tq = new TermQuery(term); + bq.add(tq, BooleanClause.Occur.SHOULD); } - - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { - // Too many terms -- make a filter. - Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); - result.setBoost(query.getBoost()); - return result; - } else if (!enumerator.next()) { - // Enumeration is done, and we hit a small - // enough number of terms & docs -- just make a - // BooleanQuery, now - BooleanQuery bq = new BooleanQuery(true); - for (final Term term: pendingTerms) { - TermQuery tq = new TermQuery(term); - bq.add(tq, BooleanClause.Occur.SHOULD); - } - // Strip scores - Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); - result.setBoost(query.getBoost()); - query.incTotalNumberOfTerms(pendingTerms.size()); - return result; - } + // Strip scores + result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + result.setBoost(query.getBoost()); } - } finally { - enumerator.close(); + query.incTotalNumberOfTerms(col.pendingTerms.size()); + return result; } } + private static final class CutOffTermCollector implements TermCollector { + CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) { + this.reader = reader; + this.docCountCutoff = docCountCutoff; + this.termCountLimit = termCountLimit; + } + + public boolean collect(Term t, float boost) throws IOException { + pendingTerms.add(t); + if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { + hasCutOff = true; + return false; + } + // Loading the TermInfo from the terms dict here + // should not be costly, because 1) the + // query/filter will load the TermInfo when it + // runs, and 2) the terms dict has a cache: + docVisitCount += reader.docFreq(t); + return true; + } + + int docVisitCount = 0; + boolean hasCutOff = false; + + final IndexReader reader; + final int docCountCutoff, termCountLimit; + final ArrayList pendingTerms = new ArrayList(); + } + @Override public int hashCode() { final int prime = 1279;