diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java index b9963ec..69a7559 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java @@ -27,6 +27,7 @@ import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; /** * A QueryParser which constructs queries to search multiple fields. @@ -148,18 +149,54 @@ public class MultiFieldQueryParser extends QueryParser protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { if (field == null) { List clauses = new ArrayList<>(); + Query[] fieldQueries = new Query[fields.length]; + int maxTerms = 0; for (int i = 0; i < fields.length; i++) { Query q = super.getFieldQuery(fields[i], queryText, quoted); if (q != null) { - //If the user passes a map of boosts - if (boosts != null) { - //Get the boost from the map and apply them - Float boost = boosts.get(fields[i]); - if (boost != null) { - q = new BoostQuery(q, boost.floatValue()); + if (q instanceof TermQuery) { + maxTerms = Math.max(1, maxTerms); + } else if (q instanceof BooleanQuery) { + maxTerms = Math.max(maxTerms, ((BooleanQuery)q).clauses().size()); + } + fieldQueries[i] = q; + } + } + for (int termNum = 0; termNum < maxTerms; termNum++) { + List termClauses = new ArrayList<>(); + for (int i = 0; i < fields.length; i++) { + if (fieldQueries[i] != null) { + Query q = null; + if (fieldQueries[i] instanceof BooleanQuery) { + List nestedClauses = ((BooleanQuery)fieldQueries[i]).clauses(); + if (termNum < nestedClauses.size()) { + q = nestedClauses.get(termNum).getQuery(); + } + } else if (termNum == 0) { // e.g. TermQuery-s + q = fieldQueries[i]; + } + if (q != null) { + if (boosts != null) { + //Get the boost from the map and apply them + Float boost = boosts.get(fields[i]); + if (boost != null) { + q = new BoostQuery(q, boost); + } + } + termClauses.add(q); } } - clauses.add(q); + } + if (maxTerms > 1) { + if (termClauses.size() > 0) { + BooleanQuery.Builder builder = newBooleanQuery(); + for (Query termClause : termClauses) { + builder.add(termClause, BooleanClause.Occur.SHOULD); + } + clauses.add(builder.build()); + } + } else { + clauses.addAll(termClauses); } } if (clauses.size() == 0) // happens for stopwords diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj index c809f2c..c9d8c08 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj @@ -19,6 +19,7 @@ options { STATIC=false; JAVA_UNICODE_ESCAPE=true; USER_CHAR_STREAM=true; + LOOKAHEAD=2; } PARSER_BEGIN(QueryParser) @@ -27,8 +28,11 @@ package org.apache.lucene.queryparser.classic; import java.io.StringReader; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; @@ -114,6 +118,12 @@ public class QueryParser extends QueryParserBase { this(new FastCharStream(new StringReader(""))); init(f, a); } + + private static Set disallowedPostMultiTerm + = new HashSet(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR)); + private static boolean allowedPostMultiTerm(int tokenKind) { + return disallowedPostMultiTerm.contains(tokenKind) == false; + } } PARSER_END(QueryParser) @@ -123,53 +133,56 @@ PARSER_END(QueryParser) /* ***************** */ <*> TOKEN : { - <#_NUM_CHAR: ["0"-"9"] > -// every character that follows a backslash is considered as an escaped character -| <#_ESCAPED_CHAR: "\\" ~[] > + <#_NUM_CHAR: ["0"-"9"] > +| <#_ESCAPED_CHAR: "\\" ~[] > // every character that follows a backslash is considered as an escaped character | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ] - | <_ESCAPED_CHAR> ) > -| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) > -| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > -| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > -} - - SKIP : { - < <_WHITESPACE>> + | <_ESCAPED_CHAR> ) > +| <#_TERM_CHAR: ( <_TERM_START_CHAR> | "-" | "+" ) > +| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > +| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > } TOKEN : { - -| -| -| -| -| > -| -| -| -| -| : Boost -| )* "\""> -| (<_TERM_CHAR>)* > -| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > -| (<_TERM_CHAR>)* "*" ) > -| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > -| + +| +| +| +| +| > +| +| +| +| +| : Boost +| )* "\""> +| (<_TERM_CHAR>)* > +| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > +| (<_TERM_CHAR>)* "*" ) > +| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > +| | : Range | : Range } +// Whitespace handling strategy: tokenize whitespace sequences and accept 0 or 1 whitespace +// token at the beginning of every rule, inbetween tokens in rules, and - only once, +// in the TopLevelQuery rule! - just before . Note that no whitespace is allowed +// between the caret and the following boost value. + TOKEN : { + )+ > +} + TOKEN : { -)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT + )+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT } TOKEN : { - -| : DEFAULT -| : DEFAULT + +| : DEFAULT +| : DEFAULT | -| +| } // * Query ::= ( Clause )* @@ -180,8 +193,9 @@ int Conjunction() : { } { [ - { ret = CONJ_AND; } - | { ret = CONJ_OR; } + [ ] + ( { ret = CONJ_AND; } + | { ret = CONJ_OR; } ) ] { return ret; } } @@ -191,23 +205,21 @@ int Modifiers() : { } { [ - { ret = MOD_REQ; } - | { ret = MOD_NOT; } - | { ret = MOD_NOT; } + [ ] + ( { ret = MOD_REQ; } + | { ret = MOD_NOT; } + | { ret = MOD_NOT; } ) ] { return ret; } } // This makes sure that there is no garbage after the query string -Query TopLevelQuery(String field) : -{ +Query TopLevelQuery(String field) : { Query q; } { - q=Query(field) - { - return q; - } + q=Query(field) [ ] + { return q; } } Query Query(String field) : @@ -217,23 +229,35 @@ Query Query(String field) : int conj, mods; } { - mods=Modifiers() q=Clause(field) - { - addClause(clauses, CONJ_NONE, mods, q); - if (mods == MOD_NONE) - firstQuery=q; - } ( - conj=Conjunction() mods=Modifiers() q=Clause(field) - { addClause(clauses, conj, mods, q); } - )* + LOOKAHEAD(3) + q=MultiTerm(field) { - if (clauses.size() == 1 && firstQuery != null) - return firstQuery; - else { - return getBooleanQuery(clauses); + addMultiTermClauses(clauses, q); + firstQuery = q; + } + | mods=Modifiers() q=Clause(field) + { + addClause(clauses, CONJ_NONE, mods, q); + if (mods == MOD_NONE) { + firstQuery = q; + } } + ) + ( + LOOKAHEAD(3) + q=MultiTerm(field) + { addMultiTermClauses(clauses, q); } + | conj=Conjunction() mods=Modifiers() q=Clause(field) + { addClause(clauses, conj, mods, q); } + )* + { + if (clauses.size() == 1 && firstQuery != null) { + return firstQuery; + } else { + return getBooleanQuery(clauses); } + } } Query Clause(String field) : { @@ -242,22 +266,22 @@ Query Clause(String field) : { } { [ - LOOKAHEAD(2) + LOOKAHEAD(4) ( - fieldToken= {field=discardEscapeChar(fieldToken.image);} - | {field="*";} + [ ] + fieldToken= [ ] { field = discardEscapeChar(fieldToken.image); } + | [ ] { field = "*"; } ) ] - ( - q=Term(field) - | q=Query(field) ( boost=)? - + q=Term(field) + | [ ] q=Query(field) + [ ] + [ [ ] [ ] boost= ] ) - { return handleBoost(q, boost); } + { return handleBoost(q, boost); } } - Query Term(String field) : { Token term, boost=null, fuzzySlop=null, goop1, goop2; boolean prefix = false; @@ -269,46 +293,85 @@ Query Term(String field) : { Query q; } { + [ ] ( - ( - term= - | term= { wildcard=true; } - | term= { prefix=true; } - | term= { wildcard=true; } - | term= { regexp=true; } - | term= - | term= { term.image = term.image.substring(0,1); } - ) - [ fuzzySlop= { fuzzy=true; } ] - [ boost= [ fuzzySlop= { fuzzy=true; } ] ] - { - q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); - } - | ( ( {startInc=true;} | ) - ( goop1=|goop1= ) - [ ] - ( goop2=|goop2= ) - ( {endInc=true;} | )) - [ boost= ] - { - boolean startOpen=false; - boolean endOpen=false; - if (goop1.kind == RANGE_QUOTED) { - goop1.image = goop1.image.substring(1, goop1.image.length()-1); - } else if ("*".equals(goop1.image)) { - startOpen=true; - } - if (goop2.kind == RANGE_QUOTED) { - goop2.image = goop2.image.substring(1, goop2.image.length()-1); - } else if ("*".equals(goop2.image)) { - endOpen=true; - } - q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc); - } - | term= - [ fuzzySlop= ] - [ boost= ] - { q = handleQuotedTerm(field, term, fuzzySlop); } + ( + term= + | term= { wildcard=true; } + | term= { prefix=true; } + | term= { wildcard=true; } + | term= { regexp=true; } + | term= + | term= { term.image = term.image.substring(0,1); } + ) + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); } + + | ( { startInc = true; } | ) + ( [ ] ( goop1= | goop1= ) ) + [ [ ] ] + ( [ ] ( goop2= | goop2= ) ) + ( [ ] ( { endInc = true; } | ) ) + [ [ ] [ ] boost= ] + { + boolean startOpen=false; + boolean endOpen=false; + if (goop1.kind == RANGE_QUOTED) { + goop1.image = goop1.image.substring(1, goop1.image.length()-1); + } else if ("*".equals(goop1.image)) { + startOpen=true; + } + if (goop2.kind == RANGE_QUOTED) { + goop2.image = goop2.image.substring(1, goop2.image.length()-1); + } else if ("*".equals(goop2.image)) { + endOpen=true; + } + q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc); + } + + | term= + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleQuotedTerm(field, term, fuzzySlop); } ) { return handleBoost(q, boost); } } + +Query MultiTerm(String field) : { + Token term, whitespace, followingTerm; + Query q; +} +{ + [ ] + term= + // Both lookaheads are required; the first lookahead vets the first following term and the second lookahead vets the rest + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + ( + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + whitespace= + followingTerm= + { term.image += whitespace.image + followingTerm.image; } + )+ + { return getFieldQuery(field, discardEscapeChar(term.image), false); } +} \ No newline at end of file diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java index c00d88e..cdfa477 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java @@ -465,6 +465,45 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer } /** + * Adds clauses generated from analysis over text containing whitespace. + * There are no operators, so the query's clauses can either be MUST (if the + * default operator is AND) or SHOULD (default OR). + * + * If all of the clauses in the given Query are TermQuery-s, this method flattens the result + * by adding the TermQuery-s individually to the output clause list; otherwise, the given Query + * is added as a single clause including its nested clauses. + */ + protected void addMultiTermClauses(List clauses, Query q) { + // We might have been passed a null query; the term might have been + // filtered away by the analyzer. + if (q == null) { + return; + } + boolean allNestedTermQueries = false; + if (q instanceof BooleanQuery) { + allNestedTermQueries = true; + for (BooleanClause clause : ((BooleanQuery)q).clauses()) { + if ( ! (clause.getQuery() instanceof TermQuery)) { + allNestedTermQueries = false; + break; + } + } + } + if (allNestedTermQueries) { + clauses.addAll(((BooleanQuery)q).clauses()); + } else { + BooleanClause.Occur occur = operator == OR_OPERATOR ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST; + if (q instanceof BooleanQuery) { + for (BooleanClause clause : ((BooleanQuery)q).clauses()) { + clauses.add(newBooleanClause(clause.getQuery(), occur)); + } + } else { + clauses.add(newBooleanClause(q, occur)); + } + } + } + + /** * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java index d4d8b93..48513a1 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java @@ -76,16 +76,16 @@ public class TestMultiFieldQueryParser extends LuceneTestCase { Query q = mfqp.parse("one"); assertEquals("b:one t:one", q.toString()); - + q = mfqp.parse("one two"); assertEquals("(b:one t:one) (b:two t:two)", q.toString()); - + q = mfqp.parse("+one +two"); assertEquals("+(b:one t:one) +(b:two t:two)", q.toString()); q = mfqp.parse("+one -two -three"); assertEquals("+(b:one t:one) -(b:two t:two) -(b:three t:three)", q.toString()); - + q = mfqp.parse("one^2 two"); assertEquals("(b:one t:one)^2.0 (b:two t:two)", q.toString()); @@ -114,7 +114,7 @@ public class TestMultiFieldQueryParser extends LuceneTestCase { assertEquals("b:\"foo bar\"~4 t:\"foo bar\"~4", q.toString()); // LUCENE-1213: MultiFieldQueryParser was ignoring slop when phrase had a field. - q = mfqp.parse("b:\"foo bar\"~4"); + q = mfqp.parse("b:\"foo bar\"~4"); assertEquals("b:\"foo bar\"~4", q.toString()); // make sure that terms which have a field are not touched: diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index 70dc15a..35ab96b 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -536,7 +536,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertQueryEquals("drop AND stop AND roll", qpAnalyzer, "+drop +roll"); assertQueryEquals("term phrase term", qpAnalyzer, - "term (phrase1 phrase2) term"); + "term phrase1 phrase2 term"); assertQueryEquals("term AND NOT phrase term", qpAnalyzer, "+term -(phrase1 phrase2) term"); assertQueryEquals("stop^3", qpAnalyzer, ""); @@ -553,7 +553,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase { CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); setDefaultOperatorAND(cqpc); assertQueryEquals(cqpc, "field", "term phrase term", - "+term +(+phrase1 +phrase2) +term"); + "+term +phrase1 +phrase2 +term"); assertQueryEquals(cqpc, "field", "phrase", "+phrase1 +phrase2"); }