Index: lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java =================================================================== --- lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java (revision 1490012) +++ lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java (working copy) @@ -194,6 +194,30 @@ directory.close(); } + public void testFuzzinessLong2() throws Exception { + // LUCENE-5033: regression test; the second SlowFuzzyQuery argument (3f / 4f) is exercised here as a raw edit-count limit (presumably maxEdits -- confirm against SlowFuzzyQuery) + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("abcdef", writer); + addDoc("segment", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.close(); + + SlowFuzzyQuery query; + // "abcxxxx" is 4 edits away from "abcdef" (3 substitutions + 1 insertion), so a limit of 3 must match nothing + query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 3f, 0); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + // with a limit of 4 only "abcdef" may match; "segment" has no character in common with the query term + query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 4f, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + reader.close(); + directory.close(); + } + public void testFuzzinessLong() throws Exception { Directory directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), directory); Index: lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java =================================================================== --- lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java (revision 1490012) +++ lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java (working copy) @@ -31,9 +31,12 @@ import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; -/** Classic fuzzy TermsEnum for enumerating all terms that are similar +/** Potentially slow fuzzy TermsEnum for enumerating all terms that are similar * to the specified filter term. - * + *
If the minSimilarity or maxEdits is greater than the Automaton's + * allowable range, this backs off to the classic (brute force) + * fuzzy terms enum method by calling FuzzyTermsEnum's getAutomatonEnum. + *
*Term enumerations are always ordered by * {@link #getComparator}. Each term in the enumeration is * greater than all that precede it.
@@ -103,15 +106,27 @@ private final IntsRef utf32 = new IntsRef(20); /** - * The termCompare method in FuzzyTermEnum uses Levenshtein distance to + *The termCompare method in FuzzyTermEnum uses Levenshtein distance to * calculate the distance between the given term and the comparing term. + *
+ * <p>If minSimilarity is >= 1.0, a term is accepted when its edit distance is at most maxEdits. + * Otherwise, this method uses the following formula to calculate similarity. + *
+ * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen))); + *+ * where distance is the Levenshtein distance for the two words. + * + * */ @Override protected final AcceptStatus accept(BytesRef term) { if (StringHelper.startsWith(term, prefixBytesRef)) { UnicodeUtil.UTF8toUTF32(term, utf32); - final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); - if (similarity > minSimilarity) { + final int distance = calcDistance(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); + final float similarity = calcSimilarity(distance, (utf32.length - realPrefixLength), text.length); + // normalize by text.length (the query-text length calcDistance compares against), not term.length, which is the length of the candidate BytesRef including its prefix + if ((raw && distance <= maxEdits) || + (raw == false && similarity > minSimilarity)) { boostAtt.setBoost((similarity - minSimilarity) * scale_factor); return AcceptStatus.YES; } else return AcceptStatus.NO; @@ -125,52 +140,34 @@ ******************************/ /** - *
Similarity returns a number that is 1.0f or less (including negative numbers) - * based on how similar the Term is compared to a target term. It returns - * exactly 0.0f when - *
- * editDistance > maximumEditDistance- * Otherwise it returns: - *
- * 1 - (editDistance / length)- * where length is the length of the shortest term (text or target) including a - * prefix that are identical and editDistance is the Levenshtein distance for - * the two words. - * + *
calcDistance returns the Levenshtein distance between the query term + * and the target term.
+ * *Embedded within this algorithm is a fail-fast Levenshtein distance * algorithm. The fail-fast algorithm differs from the standard Levenshtein * distance algorithm in that it is aborted if it is discovered that the * minimum distance between the words is greater than some threshold. - * - *
To calculate the maximum distance threshold we use the following formula: - *
- * (1 - minimumSimilarity) * length- * where length is the shortest term including any prefix that is not part of the - * similarity comparison. This formula was derived by solving for what maximum value - * of distance returns false for the following statements: - *
- * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen))); - * return (similarity > minimumSimilarity);- * where distance is the Levenshtein distance for the two words. - * + *
Levenshtein distance (also known as edit distance) is a measure of similarity * between two strings where the distance is measured as the number of character * deletions, insertions or substitutions required to transform one string to * the other string. * @param target the target word or phrase - * @return the similarity, 0.0 or less indicates that it matches less than the required - * threshold and 1.0 indicates that the text and target are identical + * @param offset the offset at which to start the comparison + * @param length the length of what's left of the string to compare + * @return the number of edits or Integer.MAX_VALUE if the edit distance is + * greater than maxDistance. */ - private final float similarity(final int[] target, int offset, int length) { + private final int calcDistance(final int[] target, int offset, int length) { final int m = length; final int n = text.length; if (n == 0) { //we don't have anything to compare. That means if we just add //the letters for m we get the new word - return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength); + return m; } if (m == 0) { - return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength); + return n; } final int maxDistance = calculateMaxDistance(m); @@ -183,7 +180,7 @@ //which is 8-3 or more precisely Math.abs(3-8). //if our maximum edit distance is 4, then we can discard this word //without looking at it. - return Float.NEGATIVE_INFINITY; + return Integer.MAX_VALUE; } // init matrix d @@ -214,7 +211,7 @@ if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater //the closest the target can be to the text is just too far away. //this target is leaving the party early. 
- return Float.NEGATIVE_INFINITY; + return Integer.MAX_VALUE; } // copy current distance counts to 'previous row' distance counts: swap p and d @@ -226,14 +223,18 @@ // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts - // this will return less than 0.0 when the edit distance is - // greater than the number of characters in the shorter word. - // but this was the formula that was previously used in FuzzyTermEnum, - // so it has not been changed (even though minimumSimilarity must be - // greater than 0.0) - return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m))); + return p[n]; } + private float calcSimilarity(int edits, int m, int n){ + // edits is the edit distance; m and n are the lengths of the two compared + // strings (their order is irrelevant, only the shorter length is used). + // The result is negative when edits exceeds realPrefixLength plus that + // shorter length; the formula is kept as previously used in FuzzyTermEnum + // (even though minimumSimilarity must be greater than 0.0) + return 1.0f - ((float)edits / (float) (realPrefixLength + Math.min(n, m))); + } + /** * The max Distance is the maximum Levenshtein distance for the text * compared to some other value that results in score that is