Index: lucene/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java =================================================================== --- lucene/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (revision 1002153) +++ lucene/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (working copy) @@ -87,7 +87,7 @@ assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString()); q = mfqp.parse("one~ two"); - assertEquals("(b:one~0.5 t:one~0.5) (b:two t:two)", q.toString()); + assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString()); q = mfqp.parse("one~0.8 two^2"); assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString()); @@ -274,7 +274,7 @@ q = parser.parse("bla*"); assertEquals("f1:bla* f2:bla* f3:bla*", q.toString()); q = parser.parse("bla~"); - assertEquals("f1:bla~0.5 f2:bla~0.5 f3:bla~0.5", q.toString()); + assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString()); q = parser.parse("[a TO c]"); assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString()); } Index: lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java =================================================================== --- lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 1002153) +++ lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy) @@ -431,10 +431,10 @@ public void testWildcard() throws Exception { assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*^2", null, "term*^2.0"); - assertQueryEquals("term~", null, "term~0.5"); + assertQueryEquals("term~", null, "term~2.0"); assertQueryEquals("term~0.7", null, "term~0.7"); - assertQueryEquals("term~^2", null, "term~0.5^2.0"); - assertQueryEquals("term^2~", null, "term~0.5^2.0"); + assertQueryEquals("term~^3", null, "term~2.0^3.0"); + assertQueryEquals("term^3~", null, "term~2.0^3.0"); assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ^3", null, 
"term*germ^3.0"); @@ -446,7 +446,7 @@ assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); fq = (FuzzyQuery)getQuery("term~", null); - assertEquals(0.5f, fq.getMinSimilarity(), 0.1f); + assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertParseException("term~1.1"); // value > 1, throws exception @@ -481,9 +481,9 @@ assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); // Fuzzy queries: - assertWildcardQueryEquals("Term~", "term~0.5"); - assertWildcardQueryEquals("Term~", true, "term~0.5"); - assertWildcardQueryEquals("Term~", false, "Term~0.5"); + assertWildcardQueryEquals("Term~", "term~2.0"); + assertWildcardQueryEquals("Term~", true, "term~2.0"); + assertWildcardQueryEquals("Term~", false, "Term~2.0"); // Range queries: assertWildcardQueryEquals("[A TO C]", "[a TO c]"); assertWildcardQueryEquals("[A TO C]", true, "[a TO c]"); @@ -761,10 +761,10 @@ assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); - assertQueryEquals("a:b\\-c~", a, "a:b-c~0.5"); - assertQueryEquals("a:b\\+c~", a, "a:b+c~0.5"); - assertQueryEquals("a:b\\:c~", a, "a:b:c~0.5"); - assertQueryEquals("a:b\\\\c~", a, "a:b\\c~0.5"); + assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); + assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); + assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); + assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]"); Index: lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java (revision 1002153) +++ lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java (working copy) @@ -27,9 +27,12 @@ import org.apache.lucene.index.IndexReader; import 
org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.junit.Ignore; /** * Tests {@link FuzzyQuery}. @@ -202,58 +205,58 @@ FuzzyQuery query; // not similar enough: - query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // edit distance to "aaaaaaa" = 3, this matches because the string is longer than // in testDefaultFuzziness so a bigger difference is allowed: - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); // now with prefix - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4); + query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5); + query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // 
no match, more than half of the characters is wrong: - query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // now with prefix - query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // "student" and "stellent" are indeed similar to "segment" by default: - query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "student"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); // now with prefix - query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "student"), 0.5f, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "student"), 0.5f, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "stellent"), 
0.5f, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); @@ -328,7 +331,7 @@ IndexSearcher searcher = new IndexSearcher(reader); writer.close(); - FuzzyQuery query = new FuzzyQuery(new Term("field", "Lucene")); + FuzzyQuery query = new FuzzyQuery(new Term("field", "lucene")); query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite()); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); @@ -378,6 +381,54 @@ r.close(); index.close(); } + + public void testDistanceAsEditsParsing() throws Exception { + QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer()); + FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2"); + assertEquals(2f, q.getMinSimilarity(), 0.0001f); + } + + public void testDistanceAsEditsSearching() throws Exception { + Directory index = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, index); + addDoc("foobar", w); + addDoc("test", w); + addDoc("working", w); + IndexReader reader = w.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + w.close(); + QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer()); + + FuzzyQuery q = (FuzzyQuery) qp.parse("fouba~2"); + ScoreDoc[] hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); + + q = (FuzzyQuery) qp.parse("foubara~2"); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); + + q = (FuzzyQuery) qp.parse("t~3"); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + + q = new FuzzyQuery(new Term("field", "a"), 4f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + + q = new 
FuzzyQuery(new Term("field", "a"), 6f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(2, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); + + searcher.close(); + reader.close(); + index.close(); + } private void addDoc(String text, RandomIndexWriter writer) throws IOException { Document doc = new Document(); Index: lucene/src/java/org/apache/lucene/queryParser/Token.java =================================================================== --- lucene/src/java/org/apache/lucene/queryParser/Token.java (revision 1002153) +++ lucene/src/java/org/apache/lucene/queryParser/Token.java (working copy) @@ -121,4 +121,4 @@ } } -/* JavaCC - OriginalChecksum=37b1923f964a5a434f5ea3d6952ff200 (do not edit this line) */ +/* JavaCC - OriginalChecksum=c147cc166a7cf8812c7c39bc8c5eb868 (do not edit this line) */ Index: lucene/src/java/org/apache/lucene/queryParser/TokenMgrError.java =================================================================== --- lucene/src/java/org/apache/lucene/queryParser/TokenMgrError.java (revision 1002153) +++ lucene/src/java/org/apache/lucene/queryParser/TokenMgrError.java (working copy) @@ -138,4 +138,4 @@ this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); } } -/* JavaCC - OriginalChecksum=334e679cf1a88b3070bb8e3d80ee3f5e (do not edit this line) */ +/* JavaCC - OriginalChecksum=1c94e13236c7e0121e49427992341ee3 (do not edit this line) */ Index: lucene/src/java/org/apache/lucene/queryParser/QueryParser.java =================================================================== --- lucene/src/java/org/apache/lucene/queryParser/QueryParser.java (revision 1002153) +++ lucene/src/java/org/apache/lucene/queryParser/QueryParser.java (working copy) @@ -1446,8 +1446,10 @@ try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 
0.0f){ {if (true) throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");} + } else if (fms >= 1.0f && fms != (int) fms) { + {if (true) throw new ParseException("Fractional edit distances are not allowed!");} } q = getFuzzyQuery(field, termImage,fms); } else { Index: lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj =================================================================== --- lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj (revision 1002153) +++ lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj (working copy) @@ -1412,8 +1412,10 @@ try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 0.0f){ throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !"); + } else if (fms >= 1.0f && fms != (int) fms) { + throw new ParseException("Fractional edit distances are not allowed!"); } q = getFuzzyQuery(field, termImage,fms); } else { Index: lucene/src/java/org/apache/lucene/queryParser/CharStream.java =================================================================== --- lucene/src/java/org/apache/lucene/queryParser/CharStream.java (revision 1002153) +++ lucene/src/java/org/apache/lucene/queryParser/CharStream.java (working copy) @@ -109,4 +109,4 @@ void Done(); } -/* JavaCC - OriginalChecksum=a83909a2403f969f94d18375f9f143e4 (do not edit this line) */ +/* JavaCC - OriginalChecksum=32a89423891f765dde472f7ef0e3ef7b (do not edit this line) */ Index: lucene/src/java/org/apache/lucene/queryParser/ParseException.java =================================================================== --- lucene/src/java/org/apache/lucene/queryParser/ParseException.java (revision 1002153) +++ lucene/src/java/org/apache/lucene/queryParser/ParseException.java (working copy) @@ -195,4 +195,4 @@ } } -/* JavaCC - OriginalChecksum=c63b396885c4ff44d7aa48d3feae60cd (do not edit this line) */ 
+/* JavaCC - OriginalChecksum=c7631a240f7446940695eac31d9483ca (do not edit this line) */ Index: lucene/src/java/org/apache/lucene/search/FuzzyQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyQuery.java (revision 1002153) +++ lucene/src/java/org/apache/lucene/search/FuzzyQuery.java (working copy) @@ -21,16 +21,13 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.LevenshteinAutomata; import java.io.IOException; /** Implements the fuzzy search query. The similarity measurement * is based on the Levenshtein (edit distance) algorithm. * - *
Warning: this query is not very scalable with its default prefix - * length of 0 - in this case, *every* term will be enumerated and - * cause an edit score calculation. - * *
This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
* as default. So terms will be collected and scored according to their
* edit distance. Only the top terms are used for building the {@link BooleanQuery}.
@@ -38,9 +35,9 @@
*/
public class FuzzyQuery extends MultiTermQuery {
- public final static float defaultMinSimilarity = 0.5f;
+ public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
public final static int defaultPrefixLength = 0;
- public final static int defaultMaxExpansions = Integer.MAX_VALUE;
+ public final static int defaultMaxExpansions = 50;
private float minimumSimilarity;
private int prefixLength;
@@ -60,6 +57,12 @@
* <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
* as the query term is considered similar to the query term if the edit distance
* between both terms is less than <code>length(term)*0.5</code>
+ * <p>
+ * Alternatively, if <code>minimumSimilarity</code> is &gt;= 1f, it is interpreted
+ * as a pure Levenshtein edit distance. For example, a value of <code>2f</code>
+ * will match all terms within an edit distance of <code>2</code> from the
+ * query term. Edit distances specified in this way may not be fractional.
+ *
* @param prefixLength length of common (non-fuzzy) prefix
* @param maxExpansions the maximum number of terms to match. If this number is
* greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
@@ -72,9 +75,9 @@
super(term.field());
this.term = term;
- if (minimumSimilarity >= 1.0f)
- throw new IllegalArgumentException("minimumSimilarity >= 1");
- else if (minimumSimilarity < 0.0f)
+ if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
+ throw new IllegalArgumentException("fractional edit distances are not allowed");
+ if (minimumSimilarity < 0.0f)
throw new IllegalArgumentException("minimumSimilarity < 0");
if (prefixLength < 0)
throw new IllegalArgumentException("prefixLength < 0");
@@ -84,7 +87,8 @@
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
String text = term.text();
- if (text.codePointCount(0, text.length()) > 1.0f / (1.0f - minimumSimilarity)) {
+ int len = text.codePointCount(0, text.length());
+ if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) {
this.termLongEnough = true;
}
Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 1002153)
+++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy)
@@ -58,7 +58,8 @@
private final int termLength;
- private int maxEdits;
+ int maxEdits;
+ private final boolean raw;
private List