From 3f782e2c54073df3fe3c2228f91bbc4d6e748978 Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Tue, 21 Jan 2014 21:47:33 -0700 Subject: [PATCH] Add support for fuzziness to SimpleQueryParser --- .../java/org/apache/lucene/util/QueryBuilder.java | 65 +++++++++++-- .../queryparser/simple/SimpleQueryParser.java | 107 +++++++++++++++++++-- .../queryparser/simple/TestSimpleQueryParser.java | 66 +++++++++++++ 3 files changed, 225 insertions(+), 13 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index aedf14a..45b8c16 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; @@ -86,6 +87,13 @@ public class QueryBuilder { } return createFieldQuery(analyzer, operator, field, queryText, false, 0); } + + public Query createFuzzyBooleanQuery(String field, String queryText, BooleanClause.Occur operator, int fuzziness) { + if (operator != BooleanClause.Occur.SHOULD && operator != BooleanClause.Occur.MUST) { + throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed"); + } + return createFieldQuery(analyzer, operator, field, queryText, false, 0, fuzziness); + } /** * Creates a phrase query from the query text. @@ -180,7 +188,7 @@ public class QueryBuilder { /** * Creates a query from the analysis chain. *
<p>
- * Expert: this is more useful for subclasses such as queryparsers. + * Expert: this is more useful for subclasses such as queryparsers. * If using this class directly, just use {@link #createBooleanQuery(String, String)} * and {@link #createPhraseQuery(String, String)} * @param analyzer analyzer used for this query @@ -191,6 +199,24 @@ public class QueryBuilder { * @param phraseSlop slop factor for phrase/multiphrase queries */ protected final Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field, String queryText, boolean quoted, int phraseSlop) { + return createFieldQuery(analyzer, operator, field, queryText, quoted, phraseSlop, 0); + } + + /** + * Creates a query from the analysis chain. + *
<p>
+ * Expert: this is more useful for subclasses such as queryparsers. + * If using this class directly, just use {@link #createBooleanQuery(String, String)} + * and {@link #createPhraseQuery(String, String)} + * @param analyzer analyzer used for this query + * @param operator default boolean operator used for this query + * @param field field to create queries against + * @param queryText text to be passed to the analysis chain + * @param quoted true if phrases should be generated when terms occur at more than one position + * @param phraseSlop slop factor for phrase/multiphrase queries + * @param fuzziness edit distance of TermQueries + */ + protected final Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field, String queryText, boolean quoted, int phraseSlop, int fuzziness) { assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST; // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count @@ -250,7 +276,7 @@ public class QueryBuilder { } catch (IOException e) { // safe to ignore, because we know the number of tokens } - return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))); + return newTermOrFuzzyQuery(new Term(field, BytesRef.deepCopyOf(bytes)), fuzziness); } else { if (severalTokensAtSamePosition || (!quoted)) { if (positionCount == 1 || (!quoted)) { @@ -267,8 +293,8 @@ public class QueryBuilder { } catch (IOException e) { // safe to ignore, because we know the number of tokens } - Query currentQuery = newTermQuery( - new Term(field, BytesRef.deepCopyOf(bytes))); + Query currentQuery = newTermOrFuzzyQuery( + new Term(field, BytesRef.deepCopyOf(bytes)), fuzziness); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; @@ -290,12 +316,12 @@ public class QueryBuilder { currentQuery = newBooleanQuery(true); ((BooleanQuery)currentQuery).add(t, BooleanClause.Occur.SHOULD); } - ((BooleanQuery)currentQuery).add(newTermQuery(new 
Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD); + ((BooleanQuery)currentQuery).add(newTermOrFuzzyQuery(new Term(field, BytesRef.deepCopyOf(bytes)), fuzziness), BooleanClause.Occur.SHOULD); } else { if (currentQuery != null) { q.add(currentQuery, operator); } - currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))); + currentQuery = newTermOrFuzzyQuery(new Term(field, BytesRef.deepCopyOf(bytes)), fuzziness); } } q.add(currentQuery, operator); @@ -368,6 +394,19 @@ public class QueryBuilder { } } } + + /** + * Builds either a TermQuery or FuzzyQuery, depending on the fuzziness + * @param term term + * @return new Query instance + */ + private Query newTermOrFuzzyQuery(Term term, int fuzziness) { + if (fuzziness >= 1) { + return newFuzzyQuery(term, fuzziness); + } else { + return newTermQuery(term); + } + } /** * Builds a new BooleanQuery instance. @@ -379,7 +418,7 @@ public class QueryBuilder { protected BooleanQuery newBooleanQuery(boolean disableCoord) { return new BooleanQuery(disableCoord); } - + /** * Builds a new TermQuery instance. *
<p>
@@ -390,6 +429,18 @@ public class QueryBuilder { protected Query newTermQuery(Term term) { return new TermQuery(term); } + + /** + * Builds a new FuzzyQuery instance. + *
<p>
+ * This is intended for subclasses that wish to customize the generated queries. + * @param term term + * @param fuzziness edit distance + * @return new FuzzyQuery instance + */ + protected Query newFuzzyQuery(Term term, int fuzziness) { + return new FuzzyQuery(term, fuzziness); + } /** * Builds a new PhraseQuery instance. diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java index 908c1ba..97ed1e9 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java @@ -19,12 +19,14 @@ package org.apache.lucene.queryparser.simple; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.flexible.standard.config.NumberDateFormat; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.QueryBuilder; +import org.apache.lucene.util.automaton.LevenshteinAutomata; import java.util.Collections; import java.util.Map; @@ -111,6 +113,8 @@ public class SimpleQueryParser extends QueryBuilder { public static final int ESCAPE_OPERATOR = 1<<6; /** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */ public static final int WHITESPACE_OPERATOR = 1<<7; + /** Enables {@code FUZZINESS} operators: (~) */ + public static final int FUZZINESS_OPERATOR = 1<<8; private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD; @@ -265,7 +269,10 @@ public class SimpleQueryParser extends QueryBuilder { assert (flags & PHRASE_OPERATOR) != 0; int start = ++state.index; int copied = 0; + int slopStart = 0; + int slopEnd; boolean escaped = false; + boolean hasSlop 
= false; while (state.index < state.length) { if (!escaped) { @@ -279,10 +286,24 @@ public class SimpleQueryParser extends QueryBuilder { continue; } else if (state.data[state.index] == '"') { - // this should be the end of the phrase - // all characters found will used for - // creating the phrase query - break; + // if there are still characters after the closing ", check for a + // tilde + if (state.length > (state.index + 1) && + state.data[state.index+1] == '~' && + (flags & FUZZINESS_OPERATOR) != 0) { + state.index++; + // check for characters after the tilde + if (state.length > (state.index + 1)) { + slopStart = state.index + 1; + hasSlop = true; + } + break; + } else { + // this should be the end of the phrase + // all characters found will be used for + // creating the phrase query + break; + } } } @@ -305,7 +326,23 @@ public class SimpleQueryParser extends QueryBuilder { // a complete phrase has been found and is parsed through // through the analyzer from the given field String phrase = new String(state.buffer, 0, copied); - Query branch = newPhraseQuery(phrase); + Query branch; + if (hasSlop) { + slopEnd = state.length; + String slopString = new String(state.data, slopStart, slopEnd - slopStart); + // Make sure to increment state.index so the slop doesn't get + // perceived as an additional field + state.index += slopEnd - slopStart; + int slop = 0; + try { + slop = Integer.parseInt(slopString); + } catch (NumberFormatException e) { + // swallow number format exceptions parsing slop + } + branch = newPhraseQuery(phrase, slop); + } else { + branch = newPhraseQuery(phrase); + } buildQueryTree(state, branch); ++state.index; @@ -316,6 +353,10 @@ public class SimpleQueryParser extends QueryBuilder { int copied = 0; boolean escaped = false; boolean prefix = false; + boolean fuzzy = false; + int editDistance = 0; + int editStart = 0; + int editEnd = Integer.MAX_VALUE; while (state.index < state.length) { if (!escaped) { @@ -341,9 +382,13 @@ public class 
SimpleQueryParser extends QueryBuilder { // this should be the end of the term // all characters found will used for // creating the term query + editEnd = state.index; break; + } else if (state.data[state.index] == '~' && (flags & FUZZINESS_OPERATOR) != 0) { + editStart = state.index + 1; } + fuzzy = fuzzy || (copied > 0 && state.data[state.index] == '~'); // wildcard tracks whether or not the last character // was a '*' operator that hasn't been escaped // there must be at least one valid character before @@ -358,7 +403,27 @@ public class SimpleQueryParser extends QueryBuilder { if (copied > 0) { final Query branch; - if (prefix) { + if (fuzzy && (flags & FUZZINESS_OPERATOR) != 0) { + editEnd = Math.min(state.length, editEnd); + int editStringSize = editEnd - editStart; + // ignore an edit string larger than the copied bytes, fall back to a + // regular prefix query + if (editStringSize >= copied) { + String token = new String(state.buffer, 0, copied - 1); + branch = newPrefixQuery(token); + } else { + String token = new String(state.buffer, 0, copied - (1 + editStringSize)); + String fuzziness = new String(state.buffer, editStart, editStringSize); + try { + editDistance = Integer.parseInt(fuzziness); + // edit distance has a maximum, limit to the maximum supported + editDistance = Math.min(editDistance, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); + } catch (NumberFormatException e) { + // swallow number format exceptions parsing fuzziness + } + branch = newFuzzyQuery(token, editDistance); + } + } else if (prefix) { // if a term is found with a closing '*' it is considered to be a prefix query // and will have prefix added as an option String token = new String(state.buffer, 0, copied - 1); @@ -436,6 +501,21 @@ public class SimpleQueryParser extends QueryBuilder { } /** + * Factory method to generate a fuzzy query. 
+ */ + protected Query newFuzzyQuery(String text, int fuzziness) { + BooleanQuery bq = new BooleanQuery(true); + for (Map.Entry<String,Float> entry : weights.entrySet()) { + Query q = createFuzzyBooleanQuery(entry.getKey(), text, defaultOperator, fuzziness); + if (q != null) { + q.setBoost(entry.getValue()); + bq.add(q, BooleanClause.Occur.SHOULD); + } + } + return simplify(bq); + } + + /** * Factory method to generate a phrase query. */ protected Query newPhraseQuery(String text) { @@ -451,6 +531,21 @@ public class SimpleQueryParser extends QueryBuilder { } /** + * Factory method to generate a phrase query with slop. + */ + protected Query newPhraseQuery(String text, int slop) { + BooleanQuery bq = new BooleanQuery(true); + for (Map.Entry<String,Float> entry : weights.entrySet()) { + Query q = createPhraseQuery(entry.getKey(), text, slop); + if (q != null) { + q.setBoost(entry.getValue()); + bq.add(q, BooleanClause.Occur.SHOULD); + } + } + return simplify(bq); + } + + /** * Factory method to generate a prefix query. 
*/ protected Query newPrefixQuery(String text) { diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java index 078defb..817bf6a 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/simple/TestSimpleQueryParser.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; @@ -37,6 +38,7 @@ import org.apache.lucene.util._TestUtil; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.AND_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.ESCAPE_OPERATOR; +import static org.apache.lucene.queryparser.simple.SimpleQueryParser.FUZZINESS_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.NOT_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.OR_OPERATOR; import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PHRASE_OPERATOR; @@ -58,6 +60,18 @@ public class TestSimpleQueryParser extends LuceneTestCase { return parser.parse(text); } + /** + * helper to parse a query with whitespace+lowercase analyzer across "field", + * with default operator of MUST + */ + private Query parse(String text, int flags) { + Analyzer analyzer = new MockAnalyzer(random()); + SimpleQueryParser parser = new SimpleQueryParser(analyzer, + Collections.singletonMap("field", 1f), flags); + parser.setDefaultOperator(Occur.MUST); + return parser.parse(text); + } + /** test a simple term */ public void testTerm() 
throws Exception { Query expected = new TermQuery(new Term("field", "foobar")); @@ -65,6 +79,17 @@ public class TestSimpleQueryParser extends LuceneTestCase { assertEquals(expected, parse("foobar")); } + /** test a fuzzy query */ + public void testFuzzy() throws Exception { + Query regular = new TermQuery(new Term("field", "foobar")); + Query expected = new FuzzyQuery(new Term("field", "foobar"), 2); + + assertEquals(expected, parse("foobar~2")); + assertEquals(regular, parse("foobar~")); + assertEquals(regular, parse("foobar~a")); + assertEquals(regular, parse("foobar~1a")); + } + /** test a simple phrase */ public void testPhrase() throws Exception { PhraseQuery expected = new PhraseQuery(); @@ -74,6 +99,31 @@ public class TestSimpleQueryParser extends LuceneTestCase { assertEquals(expected, parse("\"foo bar\"")); } + /** test a simple phrase with various slop settings */ + public void testPhraseWithSlop() throws Exception { + PhraseQuery expectedWithSlop = new PhraseQuery(); + expectedWithSlop.add(new Term("field", "foo")); + expectedWithSlop.add(new Term("field", "bar")); + expectedWithSlop.setSlop(2); + + assertEquals(expectedWithSlop, parse("\"foo bar\"~2")); + + PhraseQuery expectedWithMultiDigitSlop = new PhraseQuery(); + expectedWithMultiDigitSlop.add(new Term("field", "foo")); + expectedWithMultiDigitSlop.add(new Term("field", "bar")); + expectedWithMultiDigitSlop.setSlop(10); + + assertEquals(expectedWithMultiDigitSlop, parse("\"foo bar\"~10")); + + PhraseQuery expectedNoSlop = new PhraseQuery(); + expectedNoSlop.add(new Term("field", "foo")); + expectedNoSlop.add(new Term("field", "bar")); + + assertEquals("Ignore trailing tilde with no slop", expectedNoSlop, parse("\"foo bar\"~")); + assertEquals("Ignore non-numeric trailing slop", expectedNoSlop, parse("\"foo bar\"~a")); + assertEquals("Ignore non-numeric trailing slop", expectedNoSlop, parse("\"foo bar\"~1a")); + } + /** test a simple prefix */ public void testPrefix() throws Exception { PrefixQuery 
expected = new PrefixQuery(new Term("field", "foobar")); @@ -533,6 +583,22 @@ public class TestSimpleQueryParser extends LuceneTestCase { assertEquals(expected, parseKeyword("\t\tfoo foo foo", ~WHITESPACE_OPERATOR)); } + public void testDisableFuzziness() { + Query expected = new TermQuery(new Term("field", "foo~1")); + assertEquals(expected, parseKeyword("foo~1", ~FUZZINESS_OPERATOR)); + } + + public void testDisableSlop() { + PhraseQuery expectedPhrase = new PhraseQuery(); + expectedPhrase.add(new Term("field", "foo")); + expectedPhrase.add(new Term("field", "bar")); + + BooleanQuery expected = new BooleanQuery(); + expected.add(expectedPhrase, Occur.MUST); + expected.add(new TermQuery(new Term("field", "~2")), Occur.MUST); + assertEquals(expected, parse("\"foo bar\"~2", ~FUZZINESS_OPERATOR)); + } + // we aren't supposed to barf on any input... public void testRandomQueries() throws Exception { for (int i = 0; i < 1000; i++) { -- 1.8.5.3