diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java index c224682..3e26965 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java @@ -22,7 +22,6 @@ import java.util.ArrayList; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -137,40 +136,4 @@ public class TestStopFilter extends BaseTokenStreamTestCase { System.out.println(s); } } - - // stupid filter that inserts synonym of 'hte' for 'the' - private class MockSynonymFilter extends TokenFilter { - State bufferedState; - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - - MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public boolean incrementToken() throws IOException { - if (bufferedState != null) { - restoreState(bufferedState); - posIncAtt.setPositionIncrement(0); - termAtt.setEmpty().append("hte"); - bufferedState = null; - return true; - } else if (input.incrementToken()) { - if (termAtt.toString().equals("the")) { - bufferedState = captureState(); - } - return true; - } else { - return false; - } - } - - @Override - public void reset() throws IOException { - super.reset(); - bufferedState = null; - } - } - } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java index 205fbab..d3019e3 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockSynonymFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -121,7 +122,7 @@ public class TestQueryBuilder extends LuceneTestCase { assertNull(builder.createBooleanQuery("field", "")); } - /** adds synonym of "dog" for "dogs". */ + /** adds synonym of "dog" for "dogs", and synonym of "cavy" for "guinea pig". */ static class MockSynonymAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { @@ -130,37 +131,6 @@ public class TestQueryBuilder extends LuceneTestCase { } } - /** - * adds synonym of "dog" for "dogs". 
- */ - protected static class MockSynonymFilter extends TokenFilter { - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - boolean addSynonym = false; - - public MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public final boolean incrementToken() throws IOException { - if (addSynonym) { // inject our synonym - clearAttributes(); - termAtt.setEmpty().append("dog"); - posIncAtt.setPositionIncrement(0); - addSynonym = false; - return true; - } - - if (input.incrementToken()) { - addSynonym = termAtt.toString().equals("dogs"); - return true; - } else { - return false; - } - } - } - /** simple synonyms test */ public void testSynonyms() throws Exception { SynonymQuery expected = new SynonymQuery(new Term("field", "dogs"), new Term("field", "dog")); @@ -180,6 +150,15 @@ public class TestQueryBuilder extends LuceneTestCase { assertEquals(expectedBuilder.build(), builder.createPhraseQuery("field", "old dogs")); } + /** forms multiphrase query */ + public void testMultiWordSynonymsPhrase() throws Exception { + MultiPhraseQuery.Builder expectedBuilder = new MultiPhraseQuery.Builder(); + expectedBuilder.add(new Term[] { new Term("field", "guinea"), new Term("field", "cavy") }); + expectedBuilder.add(new Term("field", "pig")); + QueryBuilder queryBuilder = new QueryBuilder(new MockSynonymAnalyzer()); + assertEquals(expectedBuilder.build(), queryBuilder.createPhraseQuery("field", "guinea pig")); + } + protected static class SimpleCJKTokenizer extends Tokenizer { private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java index b9963ec..69a7559 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java @@ -27,6 +27,7 @@ import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; /** * A QueryParser which constructs queries to search multiple fields. 
@@ -148,18 +149,54 @@ public class MultiFieldQueryParser extends QueryParser protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { if (field == null) { List clauses = new ArrayList<>(); + Query[] fieldQueries = new Query[fields.length]; + int maxTerms = 0; for (int i = 0; i < fields.length; i++) { Query q = super.getFieldQuery(fields[i], queryText, quoted); if (q != null) { - //If the user passes a map of boosts - if (boosts != null) { - //Get the boost from the map and apply them - Float boost = boosts.get(fields[i]); - if (boost != null) { - q = new BoostQuery(q, boost.floatValue()); + if (q instanceof TermQuery) { + maxTerms = Math.max(1, maxTerms); + } else if (q instanceof BooleanQuery) { + maxTerms = Math.max(maxTerms, ((BooleanQuery)q).clauses().size()); + } + fieldQueries[i] = q; + } + } + for (int termNum = 0; termNum < maxTerms; termNum++) { + List termClauses = new ArrayList<>(); + for (int i = 0; i < fields.length; i++) { + if (fieldQueries[i] != null) { + Query q = null; + if (fieldQueries[i] instanceof BooleanQuery) { + List nestedClauses = ((BooleanQuery)fieldQueries[i]).clauses(); + if (termNum < nestedClauses.size()) { + q = nestedClauses.get(termNum).getQuery(); + } + } else if (termNum == 0) { // e.g. TermQuery-s + q = fieldQueries[i]; + } + if (q != null) { + if (boosts != null) { + //Get the boost from the map and apply them + Float boost = boosts.get(fields[i]); + if (boost != null) { + q = new BoostQuery(q, boost); + } + } + termClauses.add(q); } } - clauses.add(q); + } + if (maxTerms > 1) { + if (termClauses.size() > 0) { + BooleanQuery.Builder builder = newBooleanQuery(); + for (Query termClause : termClauses) { + builder.add(termClause, BooleanClause.Occur.SHOULD); + } + clauses.add(builder.build()); + } + } else { + clauses.addAll(termClauses); } } if (clauses.size() == 0) // happens for stopwords diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj index 9bf154d..72766aa 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj @@ -19,6 +19,7 @@ options { STATIC=false; JAVA_UNICODE_ESCAPE=true; USER_CHAR_STREAM=true; + LOOKAHEAD=2; } PARSER_BEGIN(QueryParser) @@ -27,8 +28,11 @@ package org.apache.lucene.queryparser.classic; import java.io.StringReader; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; @@ -106,6 +110,9 @@ public class QueryParser extends QueryParserBase { */ static public enum Operator { OR, AND } + /** default split on whitespace behavior */ + public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true; + /** Create a query parser. * @param f the default field for query terms. * @param a used to find terms in the query text. @@ -114,6 +121,28 @@ public class QueryParser extends QueryParserBase { this(new FastCharStream(new StringReader(""))); init(f, a); } + + /** + * @see #setSplitOnWhitespace(boolean) + */ + public boolean getSplitOnWhitespace() { + return splitOnWhitespace; + } + + /** + * Whether query text should be split on whitespace prior to analysis. + * Default is {@value #DEFAULT_SPLIT_ON_WHITESPACE}. 
+ */ + public void setSplitOnWhitespace(boolean splitOnWhitespace) { + this.splitOnWhitespace = splitOnWhitespace; + } + + private boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE; + private static Set disallowedPostMultiTerm + = new HashSet(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR)); + private static boolean allowedPostMultiTerm(int tokenKind) { + return disallowedPostMultiTerm.contains(tokenKind) == false; + } } PARSER_END(QueryParser) @@ -123,53 +152,56 @@ PARSER_END(QueryParser) /* ***************** */ <*> TOKEN : { - <#_NUM_CHAR: ["0"-"9"] > -// every character that follows a backslash is considered as an escaped character -| <#_ESCAPED_CHAR: "\\" ~[] > + <#_NUM_CHAR: ["0"-"9"] > +| <#_ESCAPED_CHAR: "\\" ~[] > // every character that follows a backslash is considered as an escaped character | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ] - | <_ESCAPED_CHAR> ) > -| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) > -| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > -| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > -} - - SKIP : { - < <_WHITESPACE>> + | <_ESCAPED_CHAR> ) > +| <#_TERM_CHAR: ( <_TERM_START_CHAR> | "-" | "+" ) > +| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > +| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > } TOKEN : { - -| -| -| -| -| > -| -| -| -| -| : Boost -| )* "\""> -| (<_TERM_CHAR>)* > -| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > -| (<_TERM_CHAR>)* "*" ) > -| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > -| + +| +| +| +| +| > +| +| +| +| +| : Boost +| )* "\""> +| (<_TERM_CHAR>)* > +| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > +| (<_TERM_CHAR>)* "*" ) > +| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > +| | : Range | : Range } +// Whitespace handling strategy: tokenize whitespace sequences and accept 0 or 1 whitespace +// token at the beginning of every rule, inbetween tokens in rules, and - only once, +// in the TopLevelQuery rule! - just before . Note that no whitespace is allowed +// between the caret and the following boost value. + TOKEN : { + )+ > +} + TOKEN : { -)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT + )+ ( "." (<_NUM_CHAR>)+ )? 
> : DEFAULT } TOKEN : { - -| : DEFAULT -| : DEFAULT + +| : DEFAULT +| : DEFAULT | -| +| } // * Query ::= ( Clause )* @@ -180,8 +212,9 @@ int Conjunction() : { } { [ - { ret = CONJ_AND; } - | { ret = CONJ_OR; } + [ ] + ( { ret = CONJ_AND; } + | { ret = CONJ_OR; } ) ] { return ret; } } @@ -191,23 +224,21 @@ int Modifiers() : { } { [ - { ret = MOD_REQ; } - | { ret = MOD_NOT; } - | { ret = MOD_NOT; } + [ ] + ( { ret = MOD_REQ; } + | { ret = MOD_NOT; } + | { ret = MOD_NOT; } ) ] { return ret; } } // This makes sure that there is no garbage after the query string -Query TopLevelQuery(String field) : -{ +Query TopLevelQuery(String field) : { Query q; } { - q=Query(field) - { - return q; - } + q=Query(field) [ ] + { return q; } } Query Query(String field) : @@ -217,23 +248,30 @@ Query Query(String field) : int conj, mods; } { - mods=Modifiers() q=Clause(field) - { - addClause(clauses, CONJ_NONE, mods, q); - if (mods == MOD_NONE) - firstQuery=q; - } ( - conj=Conjunction() mods=Modifiers() q=Clause(field) - { addClause(clauses, conj, mods, q); } - )* - { - if (clauses.size() == 1 && firstQuery != null) - return firstQuery; - else { - return getBooleanQuery(clauses); + LOOKAHEAD(3) + firstQuery=MultiTerm(field, clauses) + | mods=Modifiers() q=Clause(field) + { + addClause(clauses, CONJ_NONE, mods, q); + if (mods == MOD_NONE) { + firstQuery = q; + } } + ) + ( + LOOKAHEAD(3) + MultiTerm(field, clauses) + | conj=Conjunction() mods=Modifiers() q=Clause(field) + { addClause(clauses, conj, mods, q); } + )* + { + if (clauses.size() == 1 && firstQuery != null) { + return firstQuery; + } else { + return getBooleanQuery(clauses); } + } } Query Clause(String field) : { @@ -242,22 +280,22 @@ Query Clause(String field) : { } { [ - LOOKAHEAD(2) + LOOKAHEAD(4) ( - fieldToken= {field=discardEscapeChar(fieldToken.image);} - | {field="*";} + [ ] + fieldToken= [ ] { field = discardEscapeChar(fieldToken.image); } + | [ ] { field = "*"; } ) ] - ( - q=Term(field) - | q=Query(field) ( boost=)? - + q=Term(field) + | [ ] q=Query(field) + [ ] + [ [ ] [ ] boost= ] ) - { return handleBoost(q, boost); } + { return handleBoost(q, boost); } } - Query Term(String field) : { Token term, boost=null, fuzzySlop=null, goop1, goop2; boolean prefix = false; @@ -269,46 +307,104 @@ Query Term(String field) : { Query q; } { + [ ] ( - ( - term= - | term= { wildcard=true; } - | term= { prefix=true; } - | term= { wildcard=true; } - | term= { regexp=true; } - | term= - | term= { term.image = term.image.substring(0,1); } - ) - [ fuzzySlop= { fuzzy=true; } ] - [ boost= [ fuzzySlop= { fuzzy=true; } ] ] - { - q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); - } - | ( ( {startInc=true;} | ) - ( goop1=|goop1= ) - [ ] - ( goop2=|goop2= ) - ( {endInc=true;} | )) - [ boost= ] - { - boolean startOpen=false; - boolean endOpen=false; - if (goop1.kind == RANGE_QUOTED) { - goop1.image = goop1.image.substring(1, goop1.image.length()-1); - } else if ("*".equals(goop1.image)) { - startOpen=true; - } - if (goop2.kind == RANGE_QUOTED) { - goop2.image = goop2.image.substring(1, goop2.image.length()-1); - } else if ("*".equals(goop2.image)) { - endOpen=true; - } - q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? 
null : discardEscapeChar(goop2.image), startInc, endInc); - } - | term= - [ fuzzySlop= ] - [ boost= ] - { q = handleQuotedTerm(field, term, fuzzySlop); } + ( + term= + | term= { wildcard=true; } + | term= { prefix=true; } + | term= { wildcard=true; } + | term= { regexp=true; } + | term= + | term= { term.image = term.image.substring(0,1); } + ) + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); } + + | ( { startInc = true; } | ) + ( [ ] ( goop1= | goop1= ) ) + [ [ ] ] + ( [ ] ( goop2= | goop2= ) ) + ( [ ] ( { endInc = true; } | ) ) + [ [ ] [ ] boost= ] + { + boolean startOpen=false; + boolean endOpen=false; + if (goop1.kind == RANGE_QUOTED) { + goop1.image = goop1.image.substring(1, goop1.image.length()-1); + } else if ("*".equals(goop1.image)) { + startOpen=true; + } + if (goop2.kind == RANGE_QUOTED) { + goop2.image = goop2.image.substring(1, goop2.image.length()-1); + } else if ("*".equals(goop2.image)) { + endOpen=true; + } + q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc); + } + + | term= + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleQuotedTerm(field, term, fuzzySlop); } ) { return handleBoost(q, boost); } } + +/** Returns the first query if splitOnWhitespace=true or otherwise the entire produced query */ +Query MultiTerm(String field, List clauses) : { + Token text, whitespace, followingText; + Query firstQuery = null; +} +{ + [ ] + text= + { + if (splitOnWhitespace) { + firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false); + addClause(clauses, CONJ_NONE, MOD_NONE, firstQuery); + } + } + // Both lookaheads are required; the first lookahead vets the first following term and the second lookahead vets the rest + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + ( + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + whitespace= followingText= + { + if (splitOnWhitespace) { + Query q = getFieldQuery(field, discardEscapeChar(followingText.image), false); + addClause(clauses, CONJ_NONE, MOD_NONE, q); + } else { // build up the text to send to analysis + text.image += whitespace.image + followingText.image; + } + } + )+ + { + if (splitOnWhitespace == false) { + firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false); + addMultiTermClauses(clauses, firstQuery); + } + return firstQuery; + } +} \ No newline at end of file diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java index c00d88e..cdfa477 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java @@ -465,6 +465,45 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer } /** 
+   * Adds clauses generated from analysis over text containing whitespace.
+   * There are no operators, so the query's clauses can either be MUST (if the
+   * default operator is AND) or SHOULD (default OR).
+   *
+   * If all of the clauses in the given Query are TermQuery-s, this method flattens the result
+   * by adding the TermQuery-s individually to the output clause list; otherwise, each of the
+   * given Query's clauses is added separately with the default operator's occurrence.
+   */
+  protected void addMultiTermClauses(List<BooleanClause> clauses, Query q) {
+    // We might have been passed a null query; the term might have been
+    // filtered away by the analyzer.
+    if (q == null) {
+      return;
+    }
+    boolean allNestedTermQueries = false;
+    if (q instanceof BooleanQuery) {
+      allNestedTermQueries = true;
+      for (BooleanClause clause : ((BooleanQuery)q).clauses()) {
+        if ( ! (clause.getQuery() instanceof TermQuery)) {
+          allNestedTermQueries = false;
+          break;
+        }
+      }
+    }
+    if (allNestedTermQueries) {
+      clauses.addAll(((BooleanQuery)q).clauses());
+    } else {
+      BooleanClause.Occur occur = operator == OR_OPERATOR ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST;
+      if (q instanceof BooleanQuery) {
+        for (BooleanClause clause : ((BooleanQuery)q).clauses()) {
+          clauses.add(newBooleanClause(clause.getQuery(), occur));
+        }
+      } else {
+        clauses.add(newBooleanClause(q, occur));
+      }
+    }
+  }
+
+  /**
    * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow
    */
   protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
index 5b4eba8..c3d7b37 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
@@ -18,6 +18,7 @@ package org.apache.lucene.queryparser.classic;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockSynonymAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -33,6 +34,7 @@ import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SynonymQuery;
 import org.apache.lucene.search.TermQuery;
@@ -44,7 +46,9 @@ import java.io.IOException;
  * Tests QueryParser.
*/ public class TestQueryParser extends QueryParserTestBase { - + + protected boolean splitOnWhitespace = QueryParser.DEFAULT_SPLIT_ON_WHITESPACE; + public static class QPTestParser extends QueryParser { public QPTestParser(String f, Analyzer a) { super(f, a); @@ -67,6 +71,7 @@ public class TestQueryParser extends QueryParserTestBase { if (a == null) a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); QueryParser qp = new QueryParser(getDefaultField(), a); qp.setDefaultOperator(QueryParserBase.OR_OPERATOR); + qp.setSplitOnWhitespace(splitOnWhitespace); return qp; } @@ -310,18 +315,7 @@ public class TestQueryParser extends QueryParserTestBase { Query unexpanded = new TermQuery(new Term("field", "dogs")); assertEquals(unexpanded, smart.parse("\"dogs\"")); } - - // TODO: fold these into QueryParserTestBase - - /** adds synonym of "dog" for "dogs". */ - static class MockSynonymAnalyzer extends Analyzer { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - MockTokenizer tokenizer = new MockTokenizer(); - return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer)); - } - } - + /** simple synonyms test */ public void testSynonyms() throws Exception { Query expected = new SynonymQuery(new Term("field", "dogs"), new Term("field", "dog")); @@ -483,4 +477,229 @@ public class TestQueryParser extends QueryParserTestBase { qp.parse("a*aaaaaaa"); }); } -} + + // TODO: Remove this specialization once the flexible standard parser gets multi-word synonym support + @Override + public void testQPA() throws Exception { + boolean oldSplitOnWhitespace = splitOnWhitespace; + splitOnWhitespace = false; + + assertQueryEquals("term phrase term", qpAnalyzer, "term phrase1 phrase2 term"); + + CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); + setDefaultOperatorAND(cqpc); + assertQueryEquals(cqpc, "field", "term phrase term", "+term +phrase1 +phrase2 +term"); + + splitOnWhitespace = oldSplitOnWhitespace; + } + + // TODO: Move to QueryParserTestBase once standard flexible parser gets this capability + public void testMultiWordSynonyms() throws Exception { + QueryParser dumb = new QueryParser("field", new Analyzer1()); + dumb.setSplitOnWhitespace(false); + + // A multi-word synonym source will form a synonym query for the same-starting-position tokens + BooleanQuery.Builder multiWordExpandedBqBuilder = new BooleanQuery.Builder(); + Query multiWordSynonymQuery = new SynonymQuery(new Term("field", "guinea"), new Term("field", "cavy")); + multiWordExpandedBqBuilder.add(multiWordSynonymQuery, BooleanClause.Occur.SHOULD); + multiWordExpandedBqBuilder.add(new TermQuery(new Term("field", "pig")), BooleanClause.Occur.SHOULD); + Query multiWordExpandedBq = multiWordExpandedBqBuilder.build(); + assertEquals(multiWordExpandedBq, dumb.parse("guinea pig")); + + // With the phrase operator, a multi-word synonym source will form a multiphrase query. + // When the number of expanded term(s) is different from that of the original term(s), this is not good. 
+    MultiPhraseQuery.Builder multiWordExpandedMpqBuilder = new MultiPhraseQuery.Builder();
+    multiWordExpandedMpqBuilder.add(new Term[]{new Term("field", "guinea"), new Term("field", "cavy")});
+    multiWordExpandedMpqBuilder.add(new Term("field", "pig"));
+    Query multiWordExpandedMPQ = multiWordExpandedMpqBuilder.build();
+    assertEquals(multiWordExpandedMPQ, dumb.parse("\"guinea pig\""));
+
+    // custom behavior, the synonyms are expanded, unless you use quote operator
+    QueryParser smart = new SmartQueryParser();
+    smart.setSplitOnWhitespace(false);
+    assertEquals(multiWordExpandedBq, smart.parse("guinea pig"));
+
+    PhraseQuery.Builder multiWordUnexpandedPqBuilder = new PhraseQuery.Builder();
+    multiWordUnexpandedPqBuilder.add(new Term("field", "guinea"));
+    multiWordUnexpandedPqBuilder.add(new Term("field", "pig"));
+    Query multiWordUnexpandedPq = multiWordUnexpandedPqBuilder.build();
+    assertEquals(multiWordUnexpandedPq, smart.parse("\"guinea pig\""));
+  }
+
+  // TODO: Move to QueryParserTestBase once standard flexible parser gets this capability
+  public void testOperatorsAndMultiWordSynonyms() throws Exception {
+    Analyzer a = new MockSynonymAnalyzer();
+
+    boolean oldSplitOnWhitespace = splitOnWhitespace;
+    splitOnWhitespace = false;
+
+    // Operators should interrupt multiword analysis of adjacent words if they associate
+    assertQueryEquals("+guinea pig", a, "+guinea pig");
+    assertQueryEquals("-guinea pig", a, "-guinea pig");
+    assertQueryEquals("!guinea pig", a, "-guinea pig");
+    assertQueryEquals("guinea* pig", a, "guinea* pig");
+    assertQueryEquals("guinea? pig", a, "guinea? pig");
+    assertQueryEquals("guinea~2 pig", a, "guinea~2 pig");
+    assertQueryEquals("guinea^2 pig", a, "(guinea)^2.0 pig");
+
+    assertQueryEquals("guinea +pig", a, "guinea +pig");
+    assertQueryEquals("guinea -pig", a, "guinea -pig");
+    assertQueryEquals("guinea !pig", a, "guinea -pig");
+    assertQueryEquals("guinea pig*", a, "guinea pig*");
+    assertQueryEquals("guinea pig?", a, "guinea pig?");
+    assertQueryEquals("guinea pig~2", a, "guinea pig~2");
+    assertQueryEquals("guinea pig^2", a, "guinea (pig)^2.0");
+
+    assertQueryEquals("field:guinea pig", a, "guinea pig");
+    assertQueryEquals("guinea field:pig", a, "guinea pig");
+
+    assertQueryEquals("NOT guinea pig", a, "-guinea pig");
+    assertQueryEquals("guinea NOT pig", a, "guinea -pig");
+
+    assertQueryEquals("guinea pig AND dogs", a, "guinea +pig +Synonym(dog dogs)");
+    assertQueryEquals("dogs AND guinea pig", a, "+Synonym(dog dogs) +guinea pig");
+    assertQueryEquals("guinea pig && dogs", a, "guinea +pig +Synonym(dog dogs)");
+    assertQueryEquals("dogs && guinea pig", a, "+Synonym(dog dogs) +guinea pig");
+
+    assertQueryEquals("guinea pig OR dogs", a, "guinea pig Synonym(dog dogs)");
+    assertQueryEquals("dogs OR guinea pig", a, "Synonym(dog dogs) guinea pig");
+    assertQueryEquals("guinea pig || dogs", a, "guinea pig Synonym(dog dogs)");
+    assertQueryEquals("dogs || guinea pig", a, "Synonym(dog dogs) guinea pig");
+
+    assertQueryEquals("\"guinea\" pig", a, "guinea pig");
+    assertQueryEquals("guinea \"pig\"", a, "guinea pig");
+
+    assertQueryEquals("(guinea) pig", a, "guinea pig");
+    assertQueryEquals("guinea (pig)", a, "guinea pig");
+
+    assertQueryEquals("/guinea/ pig", a, "/guinea/ pig");
+    assertQueryEquals("guinea /pig/", a, "guinea /pig/");
+
+    // Operators should not interrupt multiword analysis if they don't associate
+    assertQueryEquals("(guinea pig)", a, "Synonym(cavy guinea) pig");
+    assertQueryEquals("+(guinea pig)", a, "+(Synonym(cavy guinea) pig)");
assertQueryEquals("-(guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("!(guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("NOT (guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("(guinea pig)^2", a, "(Synonym(cavy guinea) pig)^2.0"); + + assertQueryEquals("field:(guinea pig)", a, "Synonym(cavy guinea) pig"); + + assertQueryEquals("+small guinea pig", a, "+small Synonym(cavy guinea) pig"); + assertQueryEquals("-small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("!small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("NOT small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("small* guinea pig", a, "small* Synonym(cavy guinea) pig"); + assertQueryEquals("small? guinea pig", a, "small? Synonym(cavy guinea) pig"); + assertQueryEquals("\"small\" guinea pig", a, "small Synonym(cavy guinea) pig"); + + assertQueryEquals("guinea pig +running", a, "Synonym(cavy guinea) pig +running"); + assertQueryEquals("guinea pig -running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig !running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig NOT running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig running*", a, "Synonym(cavy guinea) pig running*"); + assertQueryEquals("guinea pig running?", a, "Synonym(cavy guinea) pig running?"); + assertQueryEquals("guinea pig \"running\"", a, "Synonym(cavy guinea) pig running"); + + assertQueryEquals("\"guinea pig\"~2", a, "\"(guinea cavy) pig\"~2"); + + assertQueryEquals("field:\"guinea pig\"", a, "\"(guinea cavy) pig\""); + + splitOnWhitespace = oldSplitOnWhitespace; + } + + public void testOperatorsAndMultiWordSynonymsSplitOnWhitespace() throws Exception { + Analyzer a = new MockSynonymAnalyzer(); + + boolean oldSplitOnWhitespace = splitOnWhitespace; + splitOnWhitespace = true; + + assertQueryEquals("+guinea pig", a, "+guinea pig"); + assertQueryEquals("-guinea pig", a, "-guinea pig"); + assertQueryEquals("!guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea* pig", a, "guinea* pig"); + assertQueryEquals("guinea? pig", a, "guinea? 
pig"); + assertQueryEquals("guinea~2 pig", a, "guinea~2 pig"); + assertQueryEquals("guinea^2 pig", a, "(guinea)^2.0 pig"); + + assertQueryEquals("guinea +pig", a, "guinea +pig"); + assertQueryEquals("guinea -pig", a, "guinea -pig"); + assertQueryEquals("guinea !pig", a, "guinea -pig"); + assertQueryEquals("guinea pig*", a, "guinea pig*"); + assertQueryEquals("guinea pig?", a, "guinea pig?"); + assertQueryEquals("guinea pig~2", a, "guinea pig~2"); + assertQueryEquals("guinea pig^2", a, "guinea (pig)^2.0"); + + assertQueryEquals("field:guinea pig", a, "guinea pig"); + assertQueryEquals("guinea field:pig", a, "guinea pig"); + + assertQueryEquals("NOT guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea NOT pig", a, "guinea -pig"); + + assertQueryEquals("guinea pig AND dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs AND guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + assertQueryEquals("guinea pig && dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs && guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + + assertQueryEquals("guinea pig OR dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs OR guinea pig", a, "Synonym(dog dogs) guinea pig"); + assertQueryEquals("guinea pig || dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs || guinea pig", a, "Synonym(dog dogs) guinea pig"); + + assertQueryEquals("\"guinea\" pig", a, "guinea pig"); + assertQueryEquals("guinea \"pig\"", a, "guinea pig"); + + assertQueryEquals("(guinea) pig", a, "guinea pig"); + assertQueryEquals("guinea (pig)", a, "guinea pig"); + + assertQueryEquals("/guinea/ pig", a, "/guinea/ pig"); + assertQueryEquals("guinea /pig/", a, "guinea /pig/"); + + assertQueryEquals("(guinea pig)", a, "guinea pig"); + assertQueryEquals("+(guinea pig)", a, "+(guinea pig)"); + assertQueryEquals("-(guinea pig)", a, "-(guinea pig)"); + assertQueryEquals("!(guinea pig)", a, "-(guinea pig)"); + assertQueryEquals("NOT (guinea pig)", a, "-(guinea pig)"); + assertQueryEquals("(guinea pig)^2", a, "(guinea pig)^2.0"); + + assertQueryEquals("field:(guinea pig)", a, "guinea pig"); + + assertQueryEquals("+small guinea pig", a, "+small guinea pig"); + assertQueryEquals("-small guinea pig", a, "-small guinea pig"); + assertQueryEquals("!small guinea pig", a, "-small guinea pig"); + assertQueryEquals("NOT small guinea pig", a, "-small guinea pig"); + assertQueryEquals("small* guinea pig", a, "small* guinea pig"); + assertQueryEquals("small? guinea pig", a, "small? 
guinea pig"); + assertQueryEquals("\"small\" guinea pig", a, "small guinea pig"); + + assertQueryEquals("guinea pig +running", a, "guinea pig +running"); + assertQueryEquals("guinea pig -running", a, "guinea pig -running"); + assertQueryEquals("guinea pig !running", a, "guinea pig -running"); + assertQueryEquals("guinea pig NOT running", a, "guinea pig -running"); + assertQueryEquals("guinea pig running*", a, "guinea pig running*"); + assertQueryEquals("guinea pig running?", a, "guinea pig running?"); + assertQueryEquals("guinea pig \"running\"", a, "guinea pig running"); + + assertQueryEquals("\"guinea pig\"~2", a, "\"(guinea cavy) pig\"~2"); + + assertQueryEquals("field:\"guinea pig\"", a, "\"(guinea cavy) pig\""); + + splitOnWhitespace = oldSplitOnWhitespace; + } + + public void testDefaultSplitOnWhitespace() throws Exception { + QueryParser parser = new QueryParser("field", new Analyzer1()); + + assertTrue(parser.getSplitOnWhitespace()); // default is true + + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + bqBuilder.add(new TermQuery(new Term("field", "guinea")), BooleanClause.Occur.SHOULD); + bqBuilder.add(new TermQuery(new Term("field", "pig")), BooleanClause.Occur.SHOULD); + assertEquals(bqBuilder.build(), parser.parse("guinea pig")); + + boolean oldSplitOnWhitespace = splitOnWhitespace; + splitOnWhitespace = QueryParser.DEFAULT_SPLIT_ON_WHITESPACE; + assertQueryEquals("guinea pig", new MockSynonymAnalyzer(), "guinea pig"); + splitOnWhitespace = oldSplitOnWhitespace; + } +} \ No newline at end of file diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java index 785dd1c..934a4da 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java @@ -50,6 +50,7 @@ public class TestExtendableQueryParser extends TestQueryParser { getDefaultField(), a) : new ExtendableQueryParser( getDefaultField(), a, extensions); qp.setDefaultOperator(QueryParserBase.OR_OPERATOR); + qp.setSplitOnWhitespace(splitOnWhitespace); return qp; } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java index 25c737f..78d2bfd 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java @@ -203,4 +203,15 @@ public class TestStandardQP extends QueryParserTestBase { //TODO test something like "SmartQueryParser()" } + // TODO: Remove this specialization once the flexible standard parser gets multi-word synonym support + @Override + public void testQPA() throws Exception { + super.testQPA(); + + assertQueryEquals("term phrase term", qpAnalyzer, "term (phrase1 phrase2) term"); + + CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); + setDefaultOperatorAND(cqpc); + assertQueryEquals(cqpc, "field", "term phrase term", "+term +(+phrase1 +phrase2) +term"); + } } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index 70dc15a..5f52190 100644 --- 
a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
@@ -535,8 +535,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
     assertQueryEquals("term -(stop) term", qpAnalyzer, "term term");
 
     assertQueryEquals("drop AND stop AND roll", qpAnalyzer, "+drop +roll");
-    assertQueryEquals("term phrase term", qpAnalyzer,
-        "term (phrase1 phrase2) term");
+
+// TODO: Re-enable once flexible standard parser gets multi-word synonym support
+//    assertQueryEquals("term phrase term", qpAnalyzer,
+//        "term phrase1 phrase2 term");
     assertQueryEquals("term AND NOT phrase term", qpAnalyzer,
         "+term -(phrase1 phrase2) term");
     assertQueryEquals("stop^3", qpAnalyzer, "");
@@ -552,8 +554,9 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
 
     CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer);
     setDefaultOperatorAND(cqpc);
-    assertQueryEquals(cqpc, "field", "term phrase term",
-        "+term +(+phrase1 +phrase2) +term");
+// TODO: Re-enable once flexible standard parser gets multi-word synonym support
+//    assertQueryEquals(cqpc, "field", "term phrase term",
+//        "+term +phrase1 +phrase2 +term");
     assertQueryEquals(cqpc, "field", "phrase", "+phrase1 +phrase2");
   }
 
@@ -1101,37 +1104,6 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
     dir.close();
   }
 
-  /**
-   * adds synonym of "dog" for "dogs".
-   */
-  protected static class MockSynonymFilter extends TokenFilter {
-    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    boolean addSynonym = false;
-
-    public MockSynonymFilter(TokenStream input) {
-      super(input);
-    }
-
-    @Override
-    public final boolean incrementToken() throws IOException {
-      if (addSynonym) { // inject our synonym
-        clearAttributes();
-        termAtt.setEmpty().append("dog");
-        posIncAtt.setPositionIncrement(0);
-        addSynonym = false;
-        return true;
-      }
-
-      if (input.incrementToken()) {
-        addSynonym = termAtt.toString().equals("dogs");
-        return true;
-      } else {
-        return false;
-      }
-    }
-  }
-
   /** whitespace+lowercase analyzer with synonyms */
   protected class Analyzer1 extends Analyzer {
     public Analyzer1(){
@@ -1251,10 +1223,8 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
 
     CharacterRunAutomaton stopStopList =
       new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
-    CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
-
-    qp = getParserConfig(
-        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
+    CommonQueryParserConfiguration qp
+      = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
     qp.setEnablePositionIncrements(true);
 
     PhraseQuery.Builder phraseQuery = new PhraseQuery.Builder();
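
Editor's note, not part of the patch: the tests above exercise the new QueryParser.setSplitOnWhitespace(boolean) switch. The sketch below assumes the MockSynonymAnalyzer test analyzer this patch references (it injects "cavy" for the two-token source "guinea pig" and "dog" for "dogs") and the Lucene test-framework on the classpath; the toString() values in the comments are approximate.

import org.apache.lucene.analysis.MockSynonymAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class SplitOnWhitespaceDemo {
  public static void main(String[] args) throws Exception {
    QueryParser parser = new QueryParser("field", new MockSynonymAnalyzer());

    // Default behavior (splitOnWhitespace=true): each whitespace-separated token is
    // analyzed on its own, so the analyzer never sees the two-word source phrase.
    Query perWord = parser.parse("guinea pig");
    System.out.println(perWord);   // roughly: field:guinea field:pig

    // With splitOnWhitespace=false, adjacent terms reach the analyzer together and the
    // multi-word synonym is stacked at the same position as "guinea".
    parser.setSplitOnWhitespace(false);
    Query multiWord = parser.parse("guinea pig");
    System.out.println(multiWord); // roughly: Synonym(field:cavy field:guinea) field:pig
  }
}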
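The TestQueryBuilder addition (testMultiWordSynonymsPhrase) is the analysis-layer counterpart: when the analyzer stacks a synonym token at the same position, QueryBuilder.createPhraseQuery produces a MultiPhraseQuery rather than a plain PhraseQuery. A minimal sketch, again assuming the patch's MockSynonymAnalyzer is available:

import org.apache.lucene.analysis.MockSynonymAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;

public class MultiWordSynonymPhraseDemo {
  public static void main(String[] args) {
    QueryBuilder builder = new QueryBuilder(new MockSynonymAnalyzer());

    // "cavy" is emitted with a position increment of 0, so the first phrase
    // position accepts either "guinea" or "cavy".
    Query actual = builder.createPhraseQuery("field", "guinea pig");

    MultiPhraseQuery.Builder expected = new MultiPhraseQuery.Builder();
    expected.add(new Term[] { new Term("field", "guinea"), new Term("field", "cavy") });
    expected.add(new Term("field", "pig"));
    System.out.println(actual.equals(expected.build())); // true, per testMultiWordSynonymsPhrase
  }
}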