Index: src/java/org/apache/lucene/search/FuzzyQuery.java =================================================================== --- src/java/org/apache/lucene/search/FuzzyQuery.java (revision 888603) +++ src/java/org/apache/lucene/search/FuzzyQuery.java (working copy) @@ -24,8 +24,6 @@ import org.apache.lucene.util.ToStringUtils; import java.io.IOException; -import java.io.Serializable; -import java.util.PriorityQueue; /** Implements the fuzzy search query. The similarity measurement * is based on the Levenshtein (edit distance) algorithm. @@ -37,63 +35,6 @@ */ public class FuzzyQuery extends MultiTermQuery { - private static class FuzzyRewrite extends RewriteMethod implements Serializable { - @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { - int maxSize = BooleanQuery.getMaxClauseCount(); - PriorityQueue stQueue = new PriorityQueue(1024); - - TermsEnum termsEnum = query.getTermsEnum(reader); - assert termsEnum != null; - final String field = query.field; - if (field == null) - throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); - final MultiTermQuery.BoostAttribute boostAtt = - termsEnum.attributes().addAttribute(MultiTermQuery.BoostAttribute.class); - ScoreTerm bottomSt = null; - TermRef t; - final Term placeholderTerm = new Term(field); - while ((t = termsEnum.next()) != null) { - if (t == null) break; - ScoreTerm st = new ScoreTerm(placeholderTerm.createTerm(t.toString()), boostAtt.getBoost()); - if (stQueue.size() < maxSize) { - // record the current bottom item - if (bottomSt == null || st.compareTo(bottomSt) > 0) { - bottomSt = st; - } - // add to PQ, as it is not yet filled up - stQueue.offer(st); - } else { - assert bottomSt != null; - // only add to PQ, if the ScoreTerm is greater than the current bottom, - // as all entries will be enqueued after the current bottom and will never be visible - if (st.compareTo(bottomSt) < 0) 
{ - stQueue.offer(st); - } - } - //System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")"); - } - - BooleanQuery bq = new BooleanQuery(true); - int size = Math.min(stQueue.size(), maxSize); - for(int i = 0; i < size; i++){ - ScoreTerm st = stQueue.poll(); - TermQuery tq = new TermQuery(st.term); // found a match - tq.setBoost(query.getBoost() * st.score); // set the boost - bq.add(tq, BooleanClause.Occur.SHOULD); // add to query - } - query.incTotalNumberOfTerms(bq.clauses().size()); - return bq; - } - - // Make sure we are still a singleton even after deserializing - protected Object readResolve() { - return FUZZY_REWRITE; - } - } - - private final static RewriteMethod FUZZY_REWRITE = new FuzzyRewrite(); - public final static float defaultMinSimilarity = 0.5f; public final static int defaultPrefixLength = 0; @@ -122,6 +63,7 @@ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException { super(term.field()); this.term = term; + setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE); if (minimumSimilarity >= 1.0f) throw new IllegalArgumentException("minimumSimilarity >= 1"); @@ -136,7 +78,6 @@ this.minimumSimilarity = minimumSimilarity; this.prefixLength = prefixLength; - rewriteMethod = FUZZY_REWRITE; } /** @@ -192,28 +133,16 @@ public Term getTerm() { return term; } - - @Override - public void setRewriteMethod(RewriteMethod method) { - throw new UnsupportedOperationException("FuzzyQuery cannot change rewrite method"); - } - protected static class ScoreTerm implements Comparable { - public Term term; - public float score; - + /** + * @deprecated This class was used in previous FuzzyQuery implementations, but is now replaced by + * a new rewrite mode {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}. 
+ */ + @Deprecated + protected static class ScoreTerm extends MultiTermQuery.TopTermsScoringBooleanQueryRewrite.ScoreTerm { public ScoreTerm(Term term, float score){ - this.term = term; - this.score = score; + super(term,score); } - - public int compareTo(ScoreTerm other) { - if (this.score == other.score) - return this.term.compareTo(other.term); - else - // inverse ordering!!! - return Float.compare(other.score, this.score); - } } @Override Index: src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- src/java/org/apache/lucene/search/MultiTermQuery.java (revision 888603) +++ src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -21,8 +21,8 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; +import java.util.PriorityQueue; - import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermRef; @@ -119,7 +119,7 @@ public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException; } - private static final class ConstantScoreFilterRewrite extends RewriteMethod implements Serializable { + private static final class ConstantScoreFilterRewrite extends RewriteMethod { @Override public Query rewrite(IndexReader reader, MultiTermQuery query) { Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); @@ -147,53 +147,69 @@ * @see #setRewriteMethod */ public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite(); - private static class ScoringBooleanQueryRewrite extends RewriteMethod implements Serializable { - @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { - + private abstract static class BooleanQueryRewrite extends RewriteMethod { + + protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { final 
TermsEnum termsEnum = query.getTermsEnum(reader); if (termsEnum != null) { final BoostAttribute boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); - - // nocommit -- if no terms we'd want to return NullQuery - BooleanQuery result = new BooleanQuery(true); - final String field = query.field; - if (field == null) + if (query.field == null) throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); int count = 0; TermRef term; - final Term placeholderTerm = new Term(field); + final Term placeholderTerm = new Term(query.field); while ((term = termsEnum.next()) != null) { - TermQuery tq = new TermQuery(placeholderTerm.createTerm(term.toString())); // found a match - tq.setBoost(query.getBoost() * boostAtt.getBoost()); // set the boost - result.add(tq, BooleanClause.Occur.SHOULD); // add to query - count++; + if (collector.collect(placeholderTerm.createTerm(term.toString()), boostAtt.getBoost())) { + count++; + } else { + break; + } } - query.incTotalNumberOfTerms(count); - return result; + return count; } else { // deprecated case final FilteredTermEnum enumerator = query.getEnum(reader); - BooleanQuery result = new BooleanQuery(true); int count = 0; try { do { Term t = enumerator.term(); if (t != null) { - TermQuery tq = new TermQuery(t); // found a match - tq.setBoost(query.getBoost() * enumerator.difference()); // set the boost - result.add(tq, BooleanClause.Occur.SHOULD); // add to query - count++; + if (collector.collect(t, enumerator.difference())) { + count++; + } else { + break; + } } } while (enumerator.next()); } finally { enumerator.close(); } - query.incTotalNumberOfTerms(count); - return result; + return count; } } + + protected interface TermCollector { + /** return false to stop collecting */ + boolean collect(Term t, float boost) throws IOException; + } + + } + + private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite { + @Override + 
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { + final BooleanQuery result = new BooleanQuery(true); + query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { + public boolean collect(Term t, float boost) { + TermQuery tq = new TermQuery(t); // found a match + tq.setBoost(query.getBoost() * boost); // set the boost + result.add(tq, BooleanClause.Occur.SHOULD); // add to query + return true; + } + })); + return result; + } // Make sure we are still a singleton even after deserializing protected Object readResolve() { @@ -216,6 +232,90 @@ * @see #setRewriteMethod */ public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite(); + // make this private in 4.0 when FuzzyQuery no longer subclasses the ScoreTerm: + static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite { + @Override + public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + final int maxSize = BooleanQuery.getMaxClauseCount(); + final PriorityQueue stQueue = new PriorityQueue(1024); + + collectTerms(reader, query, new TermCollector() { + public boolean collect(Term t, float boost) { + final ScoreTerm st = new ScoreTerm(t, boost); + if (stQueue.size() < maxSize) { + // record the current bottom item + if (bottomSt == null || st.compareTo(bottomSt) > 0) { + bottomSt = st; + } + // add to PQ, as it is not yet filled up + stQueue.offer(st); + } else { + assert bottomSt != null; + // only add to PQ, if the ScoreTerm is greater than the current bottom, + // as all entries will be enqueued after the current bottom and will never be visible + if (st.compareTo(bottomSt) < 0) { + stQueue.offer(st); + } + } + //System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")"); + return true; + } + + private ScoreTerm bottomSt = null; + }); + + BooleanQuery bq = new BooleanQuery(true); + int size = 
Math.min(stQueue.size(), maxSize); + for(int i = 0; i < size; i++){ + ScoreTerm st = stQueue.poll(); + TermQuery tq = new TermQuery(st.term); // found a match + tq.setBoost(query.getBoost() * st.score); // set the boost + bq.add(tq, BooleanClause.Occur.SHOULD); // add to query + } + query.incTotalNumberOfTerms(bq.clauses().size()); + return bq; + } + + // Make sure we are still a singleton even after deserializing + protected Object readResolve() { + return TOP_TERMS_SCORING_BOOLEAN_REWRITE; + } + + // make this private in 4.0 when FuzzyQuery no longer subclasses this: + static class ScoreTerm implements Comparable { + public Term term; + public float score; + + public ScoreTerm(Term term, float score){ + this.term = term; + this.score = score; + } + + public int compareTo(ScoreTerm other) { + if (this.score == other.score) + return this.term.compareTo(other.term); + else + // inverse ordering!!! + return Float.compare(other.score, this.score); + } + } + } + + /** A rewrite method that first translates each term into + * {@link BooleanClause.Occur#SHOULD} clause in a + * BooleanQuery, and keeps the scores as computed by the + * query. Note that typically such scores are + * meaningless to the user, and require non-trivial CPU + * to compute, so it's almost always better to use {@link + * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead. + * + *

<p>This rewrite mode only uses the top scoring terms + * so it will not overflow the boolean max clause count. + * It is the default rewrite mode for {@link FuzzyQuery}. + * + * @see #setRewriteMethod */ + public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite(); + private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable { @Override public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { @@ -258,7 +358,7 @@ * Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is * used. */ - public static class ConstantScoreAutoRewrite extends RewriteMethod implements Serializable { + public static class ConstantScoreAutoRewrite extends BooleanQueryRewrite { // Defaults derived from rough tests with a 20.0 million // doc Wikipedia index. With more than 350 terms in the @@ -299,7 +399,7 @@ } @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { // Get the enum and start visiting terms. If we // exhaust the enum before hitting either of the @@ -307,100 +407,61 @@ // ConstantFilterRewrite: final int docCountCutoff = (int) ((docCountPercent / 100.) 
* reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - int docVisitCount = 0; - TermsEnum termsEnum = query.getTermsEnum(reader); - if (termsEnum != null) { - final Collection pendingTerms = new ArrayList(); - final String field = query.field; - if (field == null) - throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); - TermRef term; - while ((term = termsEnum.next()) != null) { - pendingTerms.add((TermRef) term.clone()); - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { - // Too many terms -- cut our losses now and make a filter. - Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); - result.setBoost(query.getBoost()); - return result; - } - // Loading the TermInfo from the terms dict here - // should not be costly, because 1) the - // query/filter will load the TermInfo when it - // runs, and 2) the terms dict has a cache: - docVisitCount += reader.docFreq(field, term); - } + final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit); + collectTerms(reader, query, col); - // Enumeration is done, and we hit a small - // enough number of terms & docs -- just make a - // BooleanQuery, now - - // nocommit: if pendingTerms.size()==0 return NullQuery + if (col.hasCutOff) { + return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); + } else { final Query result; - if (pendingTerms.isEmpty()) { + if (col.pendingTerms.isEmpty()) { result = new BooleanQuery(true); } else { BooleanQuery bq = new BooleanQuery(true); - final Term placeholderTerm = new Term(field); - for(TermRef termRef : pendingTerms) { - TermQuery tq = new TermQuery(placeholderTerm.createTerm(termRef.toString())); + for(Term term : col.pendingTerms) { + TermQuery tq = new TermQuery(term); bq.add(tq, BooleanClause.Occur.SHOULD); } // Strip scores result = new 
ConstantScoreQuery(new QueryWrapperFilter(bq)); result.setBoost(query.getBoost()); } - query.incTotalNumberOfTerms(pendingTerms.size()); + query.incTotalNumberOfTerms(col.pendingTerms.size()); return result; - } else { - final Collection pendingTerms = new ArrayList(); - - // deprecated case - FilteredTermEnum enumerator = query.getEnum(reader); - try { - while(true) { - Term t = enumerator.term(); - if (t != null) { - pendingTerms.add(t); - // Loading the TermInfo from the terms dict here - // should not be costly, because 1) the - // query/filter will load the TermInfo when it - // runs, and 2) the terms dict has a cache: - docVisitCount += reader.docFreq(t); - } - - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { - // Too many terms -- make a filter. - Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); - result.setBoost(query.getBoost()); - return result; - } else if (!enumerator.next()) { - // Enumeration is done, and we hit a small - // enough number of terms & docs -- just make a - // BooleanQuery, now - final Query result; - if (pendingTerms.isEmpty()) { - result = new BooleanQuery(true); - } else { - BooleanQuery bq = new BooleanQuery(true); - for(Term term : pendingTerms) { - TermQuery tq = new TermQuery(term); - bq.add(tq, BooleanClause.Occur.SHOULD); - } - // Strip scores - result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); - result.setBoost(query.getBoost()); - } - query.incTotalNumberOfTerms(pendingTerms.size()); - return result; - } - } - } finally { - enumerator.close(); + } + } + + private static final class CutOffTermCollector implements TermCollector { + CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) { + this.reader = reader; + this.docCountCutoff = docCountCutoff; + this.termCountLimit = termCountLimit; + } + + public boolean collect(Term t, float boost) throws IOException { + pendingTerms.add(t); + if (pendingTerms.size() >= termCountLimit || 
docVisitCount >= docCountCutoff) { + hasCutOff = true; + return false; } + // Loading the TermInfo from the terms dict here + // should not be costly, because 1) the + // query/filter will load the TermInfo when it + // runs, and 2) the terms dict has a cache: + // @deprecated: in 4.0 use TermRef for collectTerms() + docVisitCount += reader.docFreq(t); + return true; } + + int docVisitCount = 0; + boolean hasCutOff = false; + + final IndexReader reader; + final int docCountCutoff, termCountLimit; + final ArrayList pendingTerms = new ArrayList(); } - + @Override public int hashCode() { final int prime = 1279;