From a43f8aad317154c90e5a393f39333d8e5fabf9da Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Tue, 21 Jan 2014 21:47:33 -0700 Subject: [PATCH] Add support for fuzziness to SimpleQueryParser --- .../queryparser/simple/SimpleQueryParser.java | 138 ++++++++++++++++++--- .../queryparser/simple/TestSimpleQueryParser.java | 88 +++++++++++++- 2 files changed, 207 insertions(+), 19 deletions(-) diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java index 908c1ba..f3d3b10 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java @@ -21,10 +21,12 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.QueryBuilder; +import org.apache.lucene.util.automaton.LevenshteinAutomata; import java.util.Collections; import java.util.Map; @@ -111,6 +113,11 @@ public class SimpleQueryParser extends QueryBuilder { public static final int ESCAPE_OPERATOR = 1<<6; /** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */ public static final int WHITESPACE_OPERATOR = 1<<7; + /** Enables {@code FUZZINESS} operators: (~) on single terms */ + public static final int FUZZINESS_OPERATOR = 1<<8; + /** Enables {@code SLOP} operators: (~) on phrases */ + public static final int SLOP_OPERATOR = 1<<9; + private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD; @@ -266,6 +273,7 @@ public class SimpleQueryParser extends QueryBuilder { int start = ++state.index; int copied = 0; boolean 
escaped = false; + boolean hasSlop = false; while (state.index < state.length) { if (!escaped) { @@ -279,10 +287,23 @@ public class SimpleQueryParser extends QueryBuilder { continue; } else if (state.data[state.index] == '"') { - // this should be the end of the phrase - // all characters found will used for - // creating the phrase query - break; + // if there are still characters after the closing ", check for a + // tilde + if (state.length > (state.index + 1) && + state.data[state.index+1] == '~' && + (flags & SLOP_OPERATOR) != 0) { + state.index++; + // check for characters after the tilde + if (state.length > (state.index + 1)) { + hasSlop = true; + } + break; + } else { + // this should be the end of the phrase + // all characters found will be used for + // creating the phrase query + break; + } } } @@ -305,7 +326,12 @@ public class SimpleQueryParser extends QueryBuilder { // a complete phrase has been found and is parsed through // through the analyzer from the given field String phrase = new String(state.buffer, 0, copied); - Query branch = newPhraseQuery(phrase); + Query branch; + if (hasSlop) { + branch = newPhraseQuery(phrase, parseFuzziness(state)); + } else { + branch = newPhraseQuery(phrase, 0); + } buildQueryTree(state, branch); ++state.index; @@ -316,6 +342,7 @@ public class SimpleQueryParser extends QueryBuilder { int copied = 0; boolean escaped = false; boolean prefix = false; + boolean fuzzy = false; while (state.index < state.length) { if (!escaped) { @@ -329,19 +356,14 @@ public class SimpleQueryParser extends QueryBuilder { ++state.index; continue; - } else if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0) - || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0) - || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0) - || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0) - || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0) - || ((state.data[state.index] 
== ' ' - || state.data[state.index] == '\t' - || state.data[state.index] == '\n' - || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) { + } else if (tokenFinished(state)) { // this should be the end of the term // all characters found will used for // creating the term query break; + } else if (copied > 0 && state.data[state.index] == '~' && (flags & FUZZINESS_OPERATOR) != 0) { + fuzzy = true; + break; } // wildcard tracks whether or not the last character @@ -358,7 +380,13 @@ public class SimpleQueryParser extends QueryBuilder { if (copied > 0) { final Query branch; - if (prefix) { + if (fuzzy && (flags & FUZZINESS_OPERATOR) != 0) { + String token = new String(state.buffer, 0, copied); + int fuzziness = parseFuzziness(state); + // edit distance has a maximum, limit to the maximum supported + fuzziness = Math.min(fuzziness, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); + branch = newFuzzyQuery(token, fuzziness); + } else if (prefix) { // if a term is found with a closing '*' it is considered to be a prefix query // and will have prefix added as an option String token = new String(state.buffer, 0, copied - 1); @@ -421,6 +449,60 @@ public class SimpleQueryParser extends QueryBuilder { } /** + * Helper parsing fuzziness from parsing state + * @return slop/edit distance, 0 in the case of a non-parsing or negative slop/edit string + */ + private int parseFuzziness(State state) { + char slopText[] = new char[state.length]; + int slopLength = 0; + + if (state.data[state.index] == '~') { + while (state.index < state.length) { + state.index++; + // it's possible that the ~ was at the end, so check after incrementing + // to make sure we don't go out of bounds + if (state.index < state.length) { + if (tokenFinished(state)) { + break; + } + slopText[slopLength] = state.data[state.index]; + slopLength++; + } + } + int fuzziness = 0; + try { + fuzziness = Integer.parseInt(new String(slopText, 0, slopLength)); + } catch (NumberFormatException e) { + // swallow 
number format exceptions parsing fuzziness + } + // a negative value is not a meaningful slop/edit distance; treat it as 0 + if (fuzziness < 0) { + fuzziness = 0; + } + return fuzziness; + } + return 0; + } + + /** + * Helper returning true if the state has reached the end of token. + */ + private boolean tokenFinished(State state) { + if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0) + || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0) + || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0) + || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0) + || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0) + || ((state.data[state.index] == ' ' + || state.data[state.index] == '\t' + || state.data[state.index] == '\n' + || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) { + return true; + } + return false; + } + + /** * Factory method to generate a standard query (no phrase or prefix operators). */ protected Query newDefaultQuery(String text) { @@ -436,12 +518,32 @@ public class SimpleQueryParser extends QueryBuilder { } /** - * Factory method to generate a phrase query. + * Factory method to generate a fuzzy query. + */ + protected Query newFuzzyQuery(String text, int fuzziness) { + BooleanQuery bq = new BooleanQuery(true); + for (Map.Entry<String,Float> entry : weights.entrySet()) { + Query q; + if (fuzziness <= 0) { + q = createBooleanQuery(entry.getKey(), text, defaultOperator); + } else { + q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness); + } + if (q != null) { + q.setBoost(entry.getValue()); + bq.add(q, BooleanClause.Occur.SHOULD); + } + } + return simplify(bq); + } + + /** + * Factory method to generate a phrase query with slop. 
*/ - protected Query newPhraseQuery(String text) { + protected Query newPhraseQuery(String text, int slop) { BooleanQuery bq = new BooleanQuery(true); for (Map.Entry<String,Float> entry : weights.entrySet()) { - Query q = createPhraseQuery(entry.getKey(), text); + Query q = createPhraseQuery(entry.getKey(), text, slop); if (q != null) { q.setBoost(entry.getValue()); bq.add(q, BooleanClause.Occur.SHOULD); diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java index 078defb..ef353a8 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; @@ -34,14 +35,17 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.LevenshteinAutomata; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.AND_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.ESCAPE_OPERATOR; +import static org.apache.lucene.queryparser.simple.SimpleQueryParser.FUZZINESS_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.NOT_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.OR_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PHRASE_OPERATOR; import static 
org.apache.lucene.queryparser.simple.SimpleQueryParser.PRECEDENCE_OPERATORS; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PREFIX_OPERATOR; +import static org.apache.lucene.queryparser.simple.SimpleQueryParser.SLOP_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.WHITESPACE_OPERATOR; /** Tests for {@link SimpleQueryParser} */ @@ -58,6 +62,18 @@ public class TestSimpleQueryParser extends LuceneTestCase { return parser.parse(text); } + /** + * helper to parse a query with whitespace+lowercase analyzer across "field", + * with default operator of MUST + */ + private Query parse(String text, int flags) { + Analyzer analyzer = new MockAnalyzer(random()); + SimpleQueryParser parser = new SimpleQueryParser(analyzer, + Collections.singletonMap("field", 1f), flags); + parser.setDefaultOperator(Occur.MUST); + return parser.parse(text); + } + /** test a simple term */ public void testTerm() throws Exception { Query expected = new TermQuery(new Term("field", "foobar")); @@ -65,6 +81,24 @@ public class TestSimpleQueryParser extends LuceneTestCase { assertEquals(expected, parse("foobar")); } + /** test a fuzzy query */ + public void testFuzzy() throws Exception { + Query regular = new TermQuery(new Term("field", "foobar")); + Query expected = new FuzzyQuery(new Term("field", "foobar"), 2); + + assertEquals(expected, parse("foobar~2")); + assertEquals(regular, parse("foobar~")); + assertEquals(regular, parse("foobar~a")); + assertEquals(regular, parse("foobar~1a")); + + BooleanQuery bool = new BooleanQuery(); + FuzzyQuery fuzzy = new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); + bool.add(fuzzy, Occur.MUST); + bool.add(new TermQuery(new Term("field", "bar")), Occur.MUST); + + assertEquals(bool, parse("foo~" + (LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1) + " bar")); + } + /** test a simple phrase */ public void testPhrase() throws Exception { PhraseQuery expected = new 
PhraseQuery(); @@ -74,6 +108,42 @@ public class TestSimpleQueryParser extends LuceneTestCase { assertEquals(expected, parse("\"foo bar\"")); } + /** test a simple phrase with various slop settings */ + public void testPhraseWithSlop() throws Exception { + PhraseQuery expectedWithSlop = new PhraseQuery(); + expectedWithSlop.add(new Term("field", "foo")); + expectedWithSlop.add(new Term("field", "bar")); + expectedWithSlop.setSlop(2); + + assertEquals(expectedWithSlop, parse("\"foo bar\"~2")); + + PhraseQuery expectedWithMultiDigitSlop = new PhraseQuery(); + expectedWithMultiDigitSlop.add(new Term("field", "foo")); + expectedWithMultiDigitSlop.add(new Term("field", "bar")); + expectedWithMultiDigitSlop.setSlop(10); + + assertEquals(expectedWithMultiDigitSlop, parse("\"foo bar\"~10")); + + PhraseQuery expectedNoSlop = new PhraseQuery(); + expectedNoSlop.add(new Term("field", "foo")); + expectedNoSlop.add(new Term("field", "bar")); + + assertEquals("Ignore trailing tilde with no slop", expectedNoSlop, parse("\"foo bar\"~")); + assertEquals("Ignore non-numeric trailing slop", expectedNoSlop, parse("\"foo bar\"~a")); + assertEquals("Ignore non-numeric trailing slop", expectedNoSlop, parse("\"foo bar\"~1a")); + + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("field", "foo")); + pq.add(new Term("field", "bar")); + pq.setSlop(12); + + BooleanQuery expectedBoolean = new BooleanQuery(); + expectedBoolean.add(pq, Occur.MUST); + expectedBoolean.add(new TermQuery(new Term("field", "baz")), Occur.MUST); + + assertEquals(expectedBoolean, parse("\"foo bar\"~12 baz")); + } + /** test a simple prefix */ public void testPrefix() throws Exception { PrefixQuery expected = new PrefixQuery(new Term("field", "foobar")); @@ -533,6 +603,22 @@ public class TestSimpleQueryParser extends LuceneTestCase { assertEquals(expected, parseKeyword("\t\tfoo foo foo", ~WHITESPACE_OPERATOR)); } + public void testDisableFuzziness() { + Query expected = new TermQuery(new Term("field", "foo~1")); + 
assertEquals(expected, parseKeyword("foo~1", ~FUZZINESS_OPERATOR)); + } + + public void testDisableSlop() { + PhraseQuery expectedPhrase = new PhraseQuery(); + expectedPhrase.add(new Term("field", "foo")); + expectedPhrase.add(new Term("field", "bar")); + + BooleanQuery expected = new BooleanQuery(); + expected.add(expectedPhrase, Occur.MUST); + expected.add(new TermQuery(new Term("field", "~2")), Occur.MUST); + assertEquals(expected, parse("\"foo bar\"~2", ~SLOP_OPERATOR)); + } + // we aren't supposed to barf on any input... public void testRandomQueries() throws Exception { for (int i = 0; i < 1000; i++) { @@ -543,7 +629,7 @@ public class TestSimpleQueryParser extends LuceneTestCase { } public void testRandomQueries2() throws Exception { - char chars[] = new char[] { 'a', '1', '|', '&', ' ', '(', ')', '"', '-' }; + char chars[] = new char[] { 'a', '1', '|', '&', ' ', '(', ')', '"', '-', '~'}; StringBuilder sb = new StringBuilder(); for (int i = 0; i < 1000; i++) { sb.setLength(0); -- 1.8.5.3