Index: src/java/org/apache/lucene/search/FuzzyQuery.java
===================================================================
--- src/java/org/apache/lucene/search/FuzzyQuery.java (revision 888603)
+++ src/java/org/apache/lucene/search/FuzzyQuery.java (working copy)
@@ -24,76 +24,20 @@
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
-import java.io.Serializable;
-import java.util.PriorityQueue;
/** Implements the fuzzy search query. The similarity measurement
* is based on the Levenshtein (edit distance) algorithm.
*
- * Warning: this query is not very scalable with its default prefix
+ *
Warning: this query is not very scalable with its default prefix
* length of 0 - in this case, *every* term will be enumerated and
* cause an edit score calculation.
*
+ *
This query uses {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}
+ * as default. So terms will be collected and scored according to their
+ * edit distance. Only the top terms are used for building the {@link BooleanQuery}.
*/
public class FuzzyQuery extends MultiTermQuery {
- private static class FuzzyRewrite extends RewriteMethod implements Serializable {
- @Override
- public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
- int maxSize = BooleanQuery.getMaxClauseCount();
- PriorityQueue stQueue = new PriorityQueue(1024);
-
- TermsEnum termsEnum = query.getTermsEnum(reader);
- assert termsEnum != null;
- final String field = query.field;
- if (field == null)
- throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery.");
- final MultiTermQuery.BoostAttribute boostAtt =
- termsEnum.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
- ScoreTerm bottomSt = null;
- TermRef t;
- final Term placeholderTerm = new Term(field);
- while ((t = termsEnum.next()) != null) {
- if (t == null) break;
- ScoreTerm st = new ScoreTerm(placeholderTerm.createTerm(t.toString()), boostAtt.getBoost());
- if (stQueue.size() < maxSize) {
- // record the current bottom item
- if (bottomSt == null || st.compareTo(bottomSt) > 0) {
- bottomSt = st;
- }
- // add to PQ, as it is not yet filled up
- stQueue.offer(st);
- } else {
- assert bottomSt != null;
- // only add to PQ, if the ScoreTerm is greater than the current bottom,
- // as all entries will be enqueued after the current bottom and will never be visible
- if (st.compareTo(bottomSt) < 0) {
- stQueue.offer(st);
- }
- }
- //System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")");
- }
-
- BooleanQuery bq = new BooleanQuery(true);
- int size = Math.min(stQueue.size(), maxSize);
- for(int i = 0; i < size; i++){
- ScoreTerm st = stQueue.poll();
- TermQuery tq = new TermQuery(st.term); // found a match
- tq.setBoost(query.getBoost() * st.score); // set the boost
- bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
- }
- query.incTotalNumberOfTerms(bq.clauses().size());
- return bq;
- }
-
- // Make sure we are still a singleton even after deserializing
- protected Object readResolve() {
- return FUZZY_REWRITE;
- }
- }
-
- private final static RewriteMethod FUZZY_REWRITE = new FuzzyRewrite();
-
public final static float defaultMinSimilarity = 0.5f;
public final static int defaultPrefixLength = 0;
@@ -122,6 +66,7 @@
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
super(term.field());
this.term = term;
+ setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE);
if (minimumSimilarity >= 1.0f)
throw new IllegalArgumentException("minimumSimilarity >= 1");
@@ -136,7 +81,6 @@
this.minimumSimilarity = minimumSimilarity;
this.prefixLength = prefixLength;
- rewriteMethod = FUZZY_REWRITE;
}
/**
@@ -192,28 +136,16 @@
public Term getTerm() {
return term;
}
-
- @Override
- public void setRewriteMethod(RewriteMethod method) {
- throw new UnsupportedOperationException("FuzzyQuery cannot change rewrite method");
- }
- protected static class ScoreTerm implements Comparable {
- public Term term;
- public float score;
-
+ /**
+ * @deprecated This class was used in previous FuzzyQuery implementations, but is now replaced by
+ * a new rewrite mode {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}.
+ */
+ @Deprecated
+ protected static class ScoreTerm extends MultiTermQuery.TopTermsScoringBooleanQueryRewrite.ScoreTerm {
public ScoreTerm(Term term, float score){
- this.term = term;
- this.score = score;
+ super(term,score);
}
-
- public int compareTo(ScoreTerm other) {
- if (this.score == other.score)
- return this.term.compareTo(other.term);
- else
- // inverse ordering!!!
- return Float.compare(other.score, this.score);
- }
}
@Override
Index: src/java/org/apache/lucene/search/MultiTermQuery.java
===================================================================
--- src/java/org/apache/lucene/search/MultiTermQuery.java (revision 888603)
+++ src/java/org/apache/lucene/search/MultiTermQuery.java (working copy)
@@ -21,8 +21,8 @@
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.PriorityQueue;
-
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermRef;
@@ -119,7 +119,7 @@
public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException;
}
- private static final class ConstantScoreFilterRewrite extends RewriteMethod implements Serializable {
+ private static final class ConstantScoreFilterRewrite extends RewriteMethod {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) {
Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
@@ -147,53 +147,69 @@
* @see #setRewriteMethod */
public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite();
- private static class ScoringBooleanQueryRewrite extends RewriteMethod implements Serializable {
- @Override
- public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
-
+ private abstract static class BooleanQueryRewrite extends RewriteMethod {
+
+ protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
final TermsEnum termsEnum = query.getTermsEnum(reader);
if (termsEnum != null) {
final BoostAttribute boostAtt =
termsEnum.attributes().addAttribute(BoostAttribute.class);
-
- // nocommit -- if no terms we'd want to return NullQuery
- BooleanQuery result = new BooleanQuery(true);
- final String field = query.field;
- if (field == null)
+ if (query.field == null)
throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery.");
int count = 0;
TermRef term;
- final Term placeholderTerm = new Term(field);
+ final Term placeholderTerm = new Term(query.field);
while ((term = termsEnum.next()) != null) {
- TermQuery tq = new TermQuery(placeholderTerm.createTerm(term.toString())); // found a match
- tq.setBoost(query.getBoost() * boostAtt.getBoost()); // set the boost
- result.add(tq, BooleanClause.Occur.SHOULD); // add to query
- count++;
+ if (collector.collect(placeholderTerm.createTerm(term.toString()), boostAtt.getBoost())) {
+ count++;
+ } else {
+ break;
+ }
}
- query.incTotalNumberOfTerms(count);
- return result;
+ return count;
} else {
// deprecated case
final FilteredTermEnum enumerator = query.getEnum(reader);
- BooleanQuery result = new BooleanQuery(true);
int count = 0;
try {
do {
Term t = enumerator.term();
if (t != null) {
- TermQuery tq = new TermQuery(t); // found a match
- tq.setBoost(query.getBoost() * enumerator.difference()); // set the boost
- result.add(tq, BooleanClause.Occur.SHOULD); // add to query
- count++;
+ if (collector.collect(t, enumerator.difference())) {
+ count++;
+ } else {
+ break;
+ }
}
} while (enumerator.next());
} finally {
enumerator.close();
}
- query.incTotalNumberOfTerms(count);
- return result;
+ return count;
}
}
+
+ protected interface TermCollector {
+ /** return false to stop collecting */
+ boolean collect(Term t, float boost) throws IOException;
+ }
+
+ }
+
+ private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+ @Override
+ public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
+ final BooleanQuery result = new BooleanQuery(true);
+ query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
+ public boolean collect(Term t, float boost) {
+ TermQuery tq = new TermQuery(t); // found a match
+ tq.setBoost(query.getBoost() * boost); // set the boost
+ result.add(tq, BooleanClause.Occur.SHOULD); // add to query
+ return true;
+ }
+ }));
+ return result;
+ }
// Make sure we are still a singleton even after deserializing
protected Object readResolve() {
@@ -216,6 +232,90 @@
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
+ // make this private in 4.0 when FuzzyQuery no longer subclasses the ScoreTerm:
+ static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+ @Override
+ public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
+ final int maxSize = BooleanQuery.getMaxClauseCount();
+ final PriorityQueue stQueue = new PriorityQueue(1024);
+
+ collectTerms(reader, query, new TermCollector() {
+ public boolean collect(Term t, float boost) {
+ final ScoreTerm st = new ScoreTerm(t, boost);
+ if (stQueue.size() < maxSize) {
+ // record the current bottom item
+ if (bottomSt == null || st.compareTo(bottomSt) > 0) {
+ bottomSt = st;
+ }
+ // add to PQ, as it is not yet filled up
+ stQueue.offer(st);
+ } else {
+ assert bottomSt != null;
+ // only add to PQ, if the ScoreTerm is greater than the current bottom,
+ // as all entries will be enqueued after the current bottom and will never be visible
+ if (st.compareTo(bottomSt) < 0) {
+ stQueue.offer(st);
+ }
+ }
+ //System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")");
+ return true;
+ }
+
+ private ScoreTerm bottomSt = null;
+ });
+
+ BooleanQuery bq = new BooleanQuery(true);
+ int size = Math.min(stQueue.size(), maxSize);
+ for(int i = 0; i < size; i++){
+ ScoreTerm st = stQueue.poll();
+ TermQuery tq = new TermQuery(st.term); // found a match
+ tq.setBoost(query.getBoost() * st.score); // set the boost
+ bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
+ }
+ query.incTotalNumberOfTerms(bq.clauses().size());
+ return bq;
+ }
+
+ // Make sure we are still a singleton even after deserializing
+ protected Object readResolve() {
+ return TOP_TERMS_SCORING_BOOLEAN_REWRITE;
+ }
+
+ // make this private in 4.0 when FuzzyQuery no longer subclasses this:
+ static class ScoreTerm implements Comparable {
+ public Term term;
+ public float score;
+
+ public ScoreTerm(Term term, float score){
+ this.term = term;
+ this.score = score;
+ }
+
+ public int compareTo(ScoreTerm other) {
+ if (this.score == other.score)
+ return this.term.compareTo(other.term);
+ else
+ // inverse ordering!!!
+ return Float.compare(other.score, this.score);
+ }
+ }
+ }
+
+ /** A rewrite method that first translates each term into
+ * {@link BooleanClause.Occur#SHOULD} clause in a
+ * BooleanQuery, and keeps the scores as computed by the
+ * query. Note that typically such scores are
+ * meaningless to the user, and require non-trivial CPU
+ * to compute, so it's almost always better to use {@link
+ * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead.
+ *
+ * This rewrite mode only uses the top scoring terms
+ * so it will not overflow the boolean max clause count.
+ * It is the default rewrite mode for {@link FuzzyQuery}.
+ *
+ * @see #setRewriteMethod */
+ public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite();
+
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
@@ -258,7 +358,7 @@
* Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is
* used.
*/
- public static class ConstantScoreAutoRewrite extends RewriteMethod implements Serializable {
+ public static class ConstantScoreAutoRewrite extends BooleanQueryRewrite {
// Defaults derived from rough tests with a 20.0 million
// doc Wikipedia index. With more than 350 terms in the
@@ -299,7 +399,7 @@
}
@Override
- public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
+ public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
// Get the enum and start visiting terms. If we
// exhaust the enum before hitting either of the
@@ -307,100 +407,61 @@
// ConstantFilterRewrite:
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
- int docVisitCount = 0;
- TermsEnum termsEnum = query.getTermsEnum(reader);
- if (termsEnum != null) {
- final Collection pendingTerms = new ArrayList();
- final String field = query.field;
- if (field == null)
- throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery.");
- TermRef term;
- while ((term = termsEnum.next()) != null) {
- pendingTerms.add((TermRef) term.clone());
- if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
- // Too many terms -- cut our losses now and make a filter.
- Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
- result.setBoost(query.getBoost());
- return result;
- }
- // Loading the TermInfo from the terms dict here
- // should not be costly, because 1) the
- // query/filter will load the TermInfo when it
- // runs, and 2) the terms dict has a cache:
- docVisitCount += reader.docFreq(field, term);
- }
+ final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit);
+ collectTerms(reader, query, col);
- // Enumeration is done, and we hit a small
- // enough number of terms & docs -- just make a
- // BooleanQuery, now
-
- // nocommit: if pendingTerms.size()==0 return NullQuery
+ if (col.hasCutOff) {
+ return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
+ } else {
final Query result;
- if (pendingTerms.isEmpty()) {
+ if (col.pendingTerms.isEmpty()) {
result = new BooleanQuery(true);
} else {
BooleanQuery bq = new BooleanQuery(true);
- final Term placeholderTerm = new Term(field);
- for(TermRef termRef : pendingTerms) {
- TermQuery tq = new TermQuery(placeholderTerm.createTerm(termRef.toString()));
+ for(Term term : col.pendingTerms) {
+ TermQuery tq = new TermQuery(term);
bq.add(tq, BooleanClause.Occur.SHOULD);
}
// Strip scores
result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
result.setBoost(query.getBoost());
}
- query.incTotalNumberOfTerms(pendingTerms.size());
+ query.incTotalNumberOfTerms(col.pendingTerms.size());
return result;
- } else {
- final Collection pendingTerms = new ArrayList();
-
- // deprecated case
- FilteredTermEnum enumerator = query.getEnum(reader);
- try {
- while(true) {
- Term t = enumerator.term();
- if (t != null) {
- pendingTerms.add(t);
- // Loading the TermInfo from the terms dict here
- // should not be costly, because 1) the
- // query/filter will load the TermInfo when it
- // runs, and 2) the terms dict has a cache:
- docVisitCount += reader.docFreq(t);
- }
-
- if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
- // Too many terms -- make a filter.
- Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
- result.setBoost(query.getBoost());
- return result;
- } else if (!enumerator.next()) {
- // Enumeration is done, and we hit a small
- // enough number of terms & docs -- just make a
- // BooleanQuery, now
- final Query result;
- if (pendingTerms.isEmpty()) {
- result = new BooleanQuery(true);
- } else {
- BooleanQuery bq = new BooleanQuery(true);
- for(Term term : pendingTerms) {
- TermQuery tq = new TermQuery(term);
- bq.add(tq, BooleanClause.Occur.SHOULD);
- }
- // Strip scores
- result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
- result.setBoost(query.getBoost());
- }
- query.incTotalNumberOfTerms(pendingTerms.size());
- return result;
- }
- }
- } finally {
- enumerator.close();
+ }
+ }
+
+ private static final class CutOffTermCollector implements TermCollector {
+ CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) {
+ this.reader = reader;
+ this.docCountCutoff = docCountCutoff;
+ this.termCountLimit = termCountLimit;
+ }
+
+ public boolean collect(Term t, float boost) throws IOException {
+ pendingTerms.add(t);
+ if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
+ hasCutOff = true;
+ return false;
}
+ // Loading the TermInfo from the terms dict here
+ // should not be costly, because 1) the
+ // query/filter will load the TermInfo when it
+ // runs, and 2) the terms dict has a cache:
+ // TODO: in 4.0, have collectTerms() pass a TermRef here instead of a Term
+ docVisitCount += reader.docFreq(t);
+ return true;
}
+
+ int docVisitCount = 0;
+ boolean hasCutOff = false;
+
+ final IndexReader reader;
+ final int docCountCutoff, termCountLimit;
+ final ArrayList pendingTerms = new ArrayList();
}
-
+
@Override
public int hashCode() {
final int prime = 1279;