diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj index c809f2c..0bafa15 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj @@ -19,6 +19,7 @@ options { STATIC=false; JAVA_UNICODE_ESCAPE=true; USER_CHAR_STREAM=true; + LOOKAHEAD=2; } PARSER_BEGIN(QueryParser) @@ -27,8 +28,11 @@ package org.apache.lucene.queryparser.classic; import java.io.StringReader; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; @@ -114,6 +118,12 @@ public class QueryParser extends QueryParserBase { this(new FastCharStream(new StringReader(""))); init(f, a); } + + private static Set disallowedPostMultiTerm + = new HashSet(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR)); + private static boolean allowedPostMultiTerm(int tokenKind) { + return disallowedPostMultiTerm.contains(tokenKind) == false; + } } PARSER_END(QueryParser) @@ -123,53 +133,52 @@ PARSER_END(QueryParser) /* ***************** */ <*> TOKEN : { - <#_NUM_CHAR: ["0"-"9"] > -// every character that follows a backslash is considered as an escaped character -| <#_ESCAPED_CHAR: "\\" ~[] > + <#_NUM_CHAR: ["0"-"9"] > +| <#_ESCAPED_CHAR: "\\" ~[] > // every character that follows a backslash is considered as an escaped character | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ] - | <_ESCAPED_CHAR> ) > -| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) > -| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > -| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > + | <_ESCAPED_CHAR> ) > +| <#_TERM_CHAR: ( <_TERM_START_CHAR> | "-" | "+" ) > +| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > +| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > } - SKIP : { - < <_WHITESPACE>> + TOKEN : { + +| +| +| +| +| > +| +| +| +| +| : Boost +| )* "\""> +| (<_TERM_CHAR>)* > +| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > +| (<_TERM_CHAR>)* "*" ) > +| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > +| +| : Range +| : Range } - TOKEN : { - -| -| -| -| -| > -| -| -| -| -| : Boost -| )* "\""> -| (<_TERM_CHAR>)* > -| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > -| (<_TERM_CHAR>)* "*" ) > -| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > -| -| : Range -| : Range + TOKEN : { + )+ > } TOKEN : { -)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT + )+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT } TOKEN : { - -| : DEFAULT -| : DEFAULT + +| : DEFAULT +| : DEFAULT | -| +| } // * Query ::= ( Clause )* @@ -180,8 +189,9 @@ int Conjunction() : { } { [ - { ret = CONJ_AND; } - | { ret = CONJ_OR; } + [ ] + ( { ret = CONJ_AND; } + | { ret = CONJ_OR; } ) ] { return ret; } } @@ -191,23 +201,21 @@ int Modifiers() : { } { [ - { ret = MOD_REQ; } - | { ret = MOD_NOT; } - | { ret = MOD_NOT; } + [ ] + ( { ret = MOD_REQ; } + | { ret = MOD_NOT; } + | { ret = MOD_NOT; } ) ] { return ret; } } // This makes sure that there is no garbage after the query string -Query TopLevelQuery(String field) : -{ +Query TopLevelQuery(String field) : { Query q; } { - q=Query(field) - { - return q; - } + q=Query(field) [ ] + { return q; } } Query Query(String field) : @@ -217,23 +225,35 @@ Query Query(String field) : int conj, mods; } { - mods=Modifiers() q=Clause(field) - { - addClause(clauses, CONJ_NONE, mods, q); - if (mods == MOD_NONE) - firstQuery=q; - } ( - conj=Conjunction() mods=Modifiers() q=Clause(field) - { addClause(clauses, conj, mods, q); } - )* + LOOKAHEAD(3) + q=MultiTerm(field) { - if (clauses.size() == 1 && firstQuery != null) - return firstQuery; - else { - return getBooleanQuery(clauses); + addClause(clauses, CONJ_NONE, MOD_NONE, q); + firstQuery = q; + } + | mods=Modifiers() q=Clause(field) + { + addClause(clauses, CONJ_NONE, mods, q); + if (mods == MOD_NONE) { + firstQuery = q; + } } + ) + ( + LOOKAHEAD(3) + q=MultiTerm(field) + { addClause(clauses, CONJ_NONE, MOD_NONE, q); } + | conj=Conjunction() mods=Modifiers() q=Clause(field) + { addClause(clauses, conj, mods, q); } + )* + { + if (clauses.size() == 1 && firstQuery != null) { + return firstQuery; + } else { + return getBooleanQuery(clauses); } + } } Query Clause(String field) : { @@ -242,22 +262,22 @@ Query Clause(String field) : { } { [ - LOOKAHEAD(2) + LOOKAHEAD(4) ( - fieldToken= {field=discardEscapeChar(fieldToken.image);} - | {field="*";} + [ ] + fieldToken= [ ] { field = discardEscapeChar(fieldToken.image); } + | [ ] { field = "*"; } ) ] - ( - q=Term(field) - | q=Query(field) ( boost=)? - + q=Term(field) + | [ ] q=Query(field) + [ ] + [ [ ] [ ] boost= ] ) - { return handleBoost(q, boost); } + { return handleBoost(q, boost); } } - Query Term(String field) : { Token term, boost=null, fuzzySlop=null, goop1, goop2; boolean prefix = false; @@ -269,46 +289,85 @@ Query Term(String field) : { Query q; } { + [ ] ( - ( - term= - | term= { wildcard=true; } - | term= { prefix=true; } - | term= { wildcard=true; } - | term= { regexp=true; } - | term= - | term= { term.image = term.image.substring(0,1); } - ) - [ fuzzySlop= { fuzzy=true; } ] - [ boost= [ fuzzySlop= { fuzzy=true; } ] ] - { - q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); - } - | ( ( {startInc=true;} | ) - ( goop1=|goop1= ) - [ ] - ( goop2=|goop2= ) - ( {endInc=true;} | )) - [ boost= ] - { - boolean startOpen=false; - boolean endOpen=false; - if (goop1.kind == RANGE_QUOTED) { - goop1.image = goop1.image.substring(1, goop1.image.length()-1); - } else if ("*".equals(goop1.image)) { - startOpen=true; - } - if (goop2.kind == RANGE_QUOTED) { - goop2.image = goop2.image.substring(1, goop2.image.length()-1); - } else if ("*".equals(goop2.image)) { - endOpen=true; - } - q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc); - } - | term= - [ fuzzySlop= ] - [ boost= ] - { q = handleQuotedTerm(field, term, fuzzySlop); } + ( + term= + | term= { wildcard=true; } + | term= { prefix=true; } + | term= { wildcard=true; } + | term= { regexp=true; } + | term= + | term= { term.image = term.image.substring(0,1); } + ) + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); } + + | ( { startInc = true; } | ) + ( [ ] ( goop1= | goop1= ) ) + [ [ ] ] + ( [ ] ( goop2= | goop2= ) ) + ( [ ] ( { endInc = true; } | ) ) + [ [ ] [ ] boost= ] + { + boolean startOpen=false; + boolean endOpen=false; + if (goop1.kind == RANGE_QUOTED) { + goop1.image = goop1.image.substring(1, goop1.image.length()-1); + } else if ("*".equals(goop1.image)) { + startOpen=true; + } + if (goop2.kind == RANGE_QUOTED) { + goop2.image = goop2.image.substring(1, goop2.image.length()-1); + } else if ("*".equals(goop2.image)) { + endOpen=true; + } + q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc); + } + + | term= + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleQuotedTerm(field, term, fuzzySlop); } ) { return handleBoost(q, boost); } } + +Query MultiTerm(String field) : { + Token term, whitespace, followingTerm; + Query q; +} +{ + [ ] + term= + // Both lookaheads are required; the first lookahead vets the first following term and the second lookahead vets the rest + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + ( + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + whitespace= + followingTerm= + { term.image += whitespace.image + followingTerm.image; } + )+ + { return getFieldQuery(field, discardEscapeChar(term.image), false); } +} \ No newline at end of file