Index: lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java =================================================================== --- lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java (revision 1490012) +++ lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java (working copy) @@ -43,6 +43,9 @@ public class TestSlowFuzzyQuery extends LuceneTestCase { public void testFuzziness() throws Exception { + //every test with SlowFuzzyQuery.defaultMinSimilarity + //is exercising the Automaton, not the brute force linear method + Directory directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), directory); addDoc("aaaaa", writer); @@ -194,6 +197,30 @@ directory.close(); } + public void testFuzzinessLong2() throws Exception { + //Lucene-5033 + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("abcdef", writer); + addDoc("segment", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.close(); + + SlowFuzzyQuery query; + + query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 3f, 0); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 4f, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + reader.close(); + directory.close(); + } + public void testFuzzinessLong() throws Exception { Directory directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), directory); @@ -440,25 +467,21 @@ assertEquals(1, hits.length); assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); - // TODO: cannot really be supported given the legacy scoring - // system which scores negative, if the distance > min term len, - // so such matches were always impossible with lucene 3.x, 
etc - // - //q = new SlowFuzzyQuery(new Term("field", "t"), 3); - //hits = searcher.search(q, 10).scoreDocs; - //assertEquals(1, hits.length); - //assertEquals("test", searcher.doc(hits[0].doc).get("field")); + q = new SlowFuzzyQuery(new Term("field", "t"), 3); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); - // q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50); - // hits = searcher.search(q, 10).scoreDocs; - // assertEquals(1, hits.length); - // assertEquals("test", searcher.doc(hits[0].doc).get("field")); + q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); - // q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50); - // hits = searcher.search(q, 10).scoreDocs; - // assertEquals(2, hits.length); - // assertEquals("test", searcher.doc(hits[0].doc).get("field")); - // assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); + q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(2, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); reader.close(); index.close(); Index: lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java =================================================================== --- lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java (revision 1490012) +++ lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java (working copy) @@ -31,9 +31,12 @@ import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; -/** Classic fuzzy TermsEnum for enumerating all terms that are similar +/** Potentially slow fuzzy TermsEnum for enumerating all terms 
that are similar * to the specified filter term. - * + *

If the minSimilarity or maxEdits is greater than the Automaton's + * allowable range, this backs off to the classic (brute force) + * fuzzy terms enum method by calling FuzzyTermsEnum's getAutomatonEnum. + *

*

Term enumerations are always ordered by * {@link #getComparator}. Each term in the enumeration is * greater than all that precede it.

@@ -103,18 +106,43 @@ private final IntsRef utf32 = new IntsRef(20); /** - * The termCompare method in FuzzyTermEnum uses Levenshtein distance to + *

The termCompare method in FuzzyTermEnum uses Levenshtein distance to * calculate the distance between the given term and the comparing term. + *

+ *

If the minSimilarity is >= 1.0, a term is accepted only when its edit distance does not exceed maxEdits. + * Otherwise, a term is accepted only when its similarity, calculated as follows, is greater than minSimilarity. + *

+     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+     *   
+ * where distance is the Levenshtein distance for the two words. + *

+ * */ @Override protected final AcceptStatus accept(BytesRef term) { if (StringHelper.startsWith(term, prefixBytesRef)) { UnicodeUtil.UTF8toUTF32(term, utf32); - final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); - if (similarity > minSimilarity) { + final int distance = calcDistance(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); + + //Integer.MIN_VALUE is the sentinel that Levenshtein stopped early + if (distance == Integer.MIN_VALUE){ + return AcceptStatus.NO; + } + //no need to calc similarity, if raw is true and distance > maxEdits + if (raw == true && distance > maxEdits){ + return AcceptStatus.NO; + } + final float similarity = calcSimilarity(distance, (utf32.length - realPrefixLength), text.length); + + //if raw is true, then distance must also be <= maxEdits by now + //given the previous if statement + if (raw == true || + (raw == false && similarity > minSimilarity)) { boostAtt.setBoost((similarity - minSimilarity) * scale_factor); return AcceptStatus.YES; - } else return AcceptStatus.NO; + } else { + return AcceptStatus.NO; + } } else { return AcceptStatus.END; } @@ -125,52 +153,34 @@ ******************************/ /** - *

Similarity returns a number that is 1.0f or less (including negative numbers) - * based on how similar the Term is compared to a target term. It returns - * exactly 0.0f when - *

-     *    editDistance > maximumEditDistance
- * Otherwise it returns: - *
-     *    1 - (editDistance / length)
- * where length is the length of the shortest term (text or target) including a - * prefix that are identical and editDistance is the Levenshtein distance for - * the two words.

- * + *

calcDistance returns the Levenshtein distance between the query term + * and the target term.

+ * *

Embedded within this algorithm is a fail-fast Levenshtein distance * algorithm. The fail-fast algorithm differs from the standard Levenshtein * distance algorithm in that it is aborted if it is discovered that the * minimum distance between the words is greater than some threshold. - * - *

To calculate the maximum distance threshold we use the following formula: - *

-     *     (1 - minimumSimilarity) * length
- * where length is the shortest term including any prefix that is not part of the - * similarity comparison. This formula was derived by solving for what maximum value - * of distance returns false for the following statements: - *
-     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
-     *   return (similarity > minimumSimilarity);
- * where distance is the Levenshtein distance for the two words. - *

+ *

Levenshtein distance (also known as edit distance) is a measure of similarity * between two strings where the distance is measured as the number of character * deletions, insertions or substitutions required to transform one string to * the other string. * @param target the target word or phrase - * @return the similarity, 0.0 or less indicates that it matches less than the required - * threshold and 1.0 indicates that the text and target are identical + * @param offset the offset at which to start the comparison + * @param length the length of what's left of the string to compare + * @return the number of edits or Integer.MIN_VALUE if the edit distance is + * greater than maxDistance. */ - private final float similarity(final int[] target, int offset, int length) { + private final int calcDistance(final int[] target, int offset, int length) { final int m = length; final int n = text.length; if (n == 0) { //we don't have anything to compare. That means if we just add //the letters for m we get the new word - return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength); + return m; } if (m == 0) { - return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength); + return n; } final int maxDistance = calculateMaxDistance(m); @@ -183,7 +193,7 @@ //which is 8-3 or more precisely Math.abs(3-8). //if our maximum edit distance is 4, then we can discard this word //without looking at it. - return Float.NEGATIVE_INFINITY; + return Integer.MIN_VALUE; } // init matrix d @@ -214,7 +224,7 @@ if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater //the closest the target can be to the text is just too far away. //this target is leaving the party early. 
- return Float.NEGATIVE_INFINITY; + return Integer.MIN_VALUE; } // copy current distance counts to 'previous row' distance counts: swap p and d @@ -226,14 +236,19 @@ // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts - // this will return less than 0.0 when the edit distance is - // greater than the number of characters in the shorter word. - // but this was the formula that was previously used in FuzzyTermEnum, - // so it has not been changed (even though minimumSimilarity must be - // greater than 0.0) - return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m))); + return p[n]; } + private float calcSimilarity(int edits, int m, int n){ + // this will return less than 0.0 when the edit distance is + // greater than the number of characters in the shorter word. + // but this was the formula that was previously used in FuzzyTermEnum, + // so it has not been changed (even though minimumSimilarity must be + // greater than 0.0) + + return 1.0f - ((float)edits / (float) (realPrefixLength + Math.min(n, m))); + } + /** * The max Distance is the maximum Levenshtein distance for the text * compared to some other value that results in score that is