diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java index 25b89d9..c4203ff 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java @@ -22,7 +22,6 @@ import java.util.ArrayList; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -137,40 +136,4 @@ public class TestStopFilter extends BaseTokenStreamTestCase { System.out.println(s); } } - - // stupid filter that inserts synonym of 'hte' for 'the' - private class MockSynonymFilter extends TokenFilter { - State bufferedState; - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - - MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public boolean incrementToken() throws IOException { - if (bufferedState != null) { - restoreState(bufferedState); - posIncAtt.setPositionIncrement(0); - termAtt.setEmpty().append("hte"); - bufferedState = null; - return true; - } else if (input.incrementToken()) { - if (termAtt.toString().equals("the")) { - bufferedState = captureState(); - } - return true; - } else { - return false; - } - } - - @Override - public void reset() throws IOException { - super.reset(); - bufferedState = null; - } - } - } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java index 205fbab..d3019e3 100644 --- 
a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockSynonymFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -121,7 +122,7 @@ public class TestQueryBuilder extends LuceneTestCase { assertNull(builder.createBooleanQuery("field", "")); } - /** adds synonym of "dog" for "dogs". */ + /** adds synonym of "dog" for "dogs", and synonym of "cavy" for "guinea pig". */ static class MockSynonymAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { @@ -130,37 +131,6 @@ public class TestQueryBuilder extends LuceneTestCase { } } - /** - * adds synonym of "dog" for "dogs". - */ - protected static class MockSynonymFilter extends TokenFilter { - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - boolean addSynonym = false; - - public MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public final boolean incrementToken() throws IOException { - if (addSynonym) { // inject our synonym - clearAttributes(); - termAtt.setEmpty().append("dog"); - posIncAtt.setPositionIncrement(0); - addSynonym = false; - return true; - } - - if (input.incrementToken()) { - addSynonym = termAtt.toString().equals("dogs"); - return true; - } else { - return false; - } - } - } - /** simple synonyms test */ public void testSynonyms() throws Exception { SynonymQuery expected = new SynonymQuery(new Term("field", "dogs"), new Term("field", "dog")); @@ -180,6 +150,15 @@ public class TestQueryBuilder extends LuceneTestCase { assertEquals(expectedBuilder.build(), 
builder.createPhraseQuery("field", "old dogs")); } + /** forms multiphrase query */ + public void testMultiWordSynonymsPhrase() throws Exception { + MultiPhraseQuery.Builder expectedBuilder = new MultiPhraseQuery.Builder(); + expectedBuilder.add(new Term[] { new Term("field", "guinea"), new Term("field", "cavy") }); + expectedBuilder.add(new Term("field", "pig")); + QueryBuilder queryBuilder = new QueryBuilder(new MockSynonymAnalyzer()); + assertEquals(expectedBuilder.build(), queryBuilder.createPhraseQuery("field", "guinea pig")); + } + protected static class SimpleCJKTokenizer extends Tokenizer { private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java index b9963ec..69a7559 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java @@ -27,6 +27,7 @@ import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; /** * A QueryParser which constructs queries to search multiple fields. 
@@ -148,18 +149,54 @@ public class MultiFieldQueryParser extends QueryParser protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { if (field == null) { List clauses = new ArrayList<>(); + Query[] fieldQueries = new Query[fields.length]; + int maxTerms = 0; for (int i = 0; i < fields.length; i++) { Query q = super.getFieldQuery(fields[i], queryText, quoted); if (q != null) { - //If the user passes a map of boosts - if (boosts != null) { - //Get the boost from the map and apply them - Float boost = boosts.get(fields[i]); - if (boost != null) { - q = new BoostQuery(q, boost.floatValue()); + if (q instanceof TermQuery) { + maxTerms = Math.max(1, maxTerms); + } else if (q instanceof BooleanQuery) { + maxTerms = Math.max(maxTerms, ((BooleanQuery)q).clauses().size()); + } + fieldQueries[i] = q; + } + } + for (int termNum = 0; termNum < maxTerms; termNum++) { + List termClauses = new ArrayList<>(); + for (int i = 0; i < fields.length; i++) { + if (fieldQueries[i] != null) { + Query q = null; + if (fieldQueries[i] instanceof BooleanQuery) { + List nestedClauses = ((BooleanQuery)fieldQueries[i]).clauses(); + if (termNum < nestedClauses.size()) { + q = nestedClauses.get(termNum).getQuery(); + } + } else if (termNum == 0) { // e.g. 
TermQuery-s + q = fieldQueries[i]; + } + if (q != null) { + if (boosts != null) { + //Get the boost from the map and apply them + Float boost = boosts.get(fields[i]); + if (boost != null) { + q = new BoostQuery(q, boost); + } + } + termClauses.add(q); } } - clauses.add(q); + } + if (maxTerms > 1) { + if (termClauses.size() > 0) { + BooleanQuery.Builder builder = newBooleanQuery(); + for (Query termClause : termClauses) { + builder.add(termClause, BooleanClause.Occur.SHOULD); + } + clauses.add(builder.build()); + } + } else { + clauses.addAll(termClauses); } } if (clauses.size() == 0) // happens for stopwords diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj index c809f2c..c9d8c08 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj @@ -19,6 +19,7 @@ options { STATIC=false; JAVA_UNICODE_ESCAPE=true; USER_CHAR_STREAM=true; + LOOKAHEAD=2; } PARSER_BEGIN(QueryParser) @@ -27,8 +28,11 @@ package org.apache.lucene.queryparser.classic; import java.io.StringReader; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; @@ -114,6 +118,12 @@ public class QueryParser extends QueryParserBase { this(new FastCharStream(new StringReader(""))); init(f, a); } + + private static Set disallowedPostMultiTerm + = new HashSet(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR)); + private static boolean allowedPostMultiTerm(int tokenKind) { + return disallowedPostMultiTerm.contains(tokenKind) == false; + } } PARSER_END(QueryParser) @@ -123,53 +133,56 @@ PARSER_END(QueryParser) /* ***************** */ <*> TOKEN : { - <#_NUM_CHAR: ["0"-"9"] > -// every 
character that follows a backslash is considered as an escaped character -| <#_ESCAPED_CHAR: "\\" ~[] > + <#_NUM_CHAR: ["0"-"9"] > +| <#_ESCAPED_CHAR: "\\" ~[] > // every character that follows a backslash is considered as an escaped character | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ] - | <_ESCAPED_CHAR> ) > -| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) > -| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > -| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > -} - - SKIP : { - < <_WHITESPACE>> + | <_ESCAPED_CHAR> ) > +| <#_TERM_CHAR: ( <_TERM_START_CHAR> | "-" | "+" ) > +| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > +| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > } TOKEN : { - -| -| -| -| -| > -| -| -| -| -| : Boost -| )* "\""> -| (<_TERM_CHAR>)* > -| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > -| (<_TERM_CHAR>)* "*" ) > -| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > -| + +| +| +| +| +| > +| +| +| +| +| : Boost +| )* "\""> +| (<_TERM_CHAR>)* > +| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > +| (<_TERM_CHAR>)* "*" ) > +| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > +| | : Range | : Range } +// Whitespace handling strategy: tokenize whitespace sequences and accept 0 or 1 whitespace +// token at the beginning of every rule, inbetween tokens in rules, and - only once, +// in the TopLevelQuery rule! - just before . Note that no whitespace is allowed +// between the caret and the following boost value. + TOKEN : { + )+ > +} + TOKEN : { -)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT + )+ ( "." (<_NUM_CHAR>)+ )? 
> : DEFAULT } TOKEN : { - -| : DEFAULT -| : DEFAULT + +| : DEFAULT +| : DEFAULT | -| +| } // * Query ::= ( Clause )* @@ -180,8 +193,9 @@ int Conjunction() : { } { [ - { ret = CONJ_AND; } - | { ret = CONJ_OR; } + [ ] + ( { ret = CONJ_AND; } + | { ret = CONJ_OR; } ) ] { return ret; } } @@ -191,23 +205,21 @@ int Modifiers() : { } { [ - { ret = MOD_REQ; } - | { ret = MOD_NOT; } - | { ret = MOD_NOT; } + [ ] + ( { ret = MOD_REQ; } + | { ret = MOD_NOT; } + | { ret = MOD_NOT; } ) ] { return ret; } } // This makes sure that there is no garbage after the query string -Query TopLevelQuery(String field) : -{ +Query TopLevelQuery(String field) : { Query q; } { - q=Query(field) - { - return q; - } + q=Query(field) [ ] + { return q; } } Query Query(String field) : @@ -217,23 +229,35 @@ Query Query(String field) : int conj, mods; } { - mods=Modifiers() q=Clause(field) - { - addClause(clauses, CONJ_NONE, mods, q); - if (mods == MOD_NONE) - firstQuery=q; - } ( - conj=Conjunction() mods=Modifiers() q=Clause(field) - { addClause(clauses, conj, mods, q); } - )* + LOOKAHEAD(3) + q=MultiTerm(field) { - if (clauses.size() == 1 && firstQuery != null) - return firstQuery; - else { - return getBooleanQuery(clauses); + addMultiTermClauses(clauses, q); + firstQuery = q; + } + | mods=Modifiers() q=Clause(field) + { + addClause(clauses, CONJ_NONE, mods, q); + if (mods == MOD_NONE) { + firstQuery = q; + } } + ) + ( + LOOKAHEAD(3) + q=MultiTerm(field) + { addMultiTermClauses(clauses, q); } + | conj=Conjunction() mods=Modifiers() q=Clause(field) + { addClause(clauses, conj, mods, q); } + )* + { + if (clauses.size() == 1 && firstQuery != null) { + return firstQuery; + } else { + return getBooleanQuery(clauses); } + } } Query Clause(String field) : { @@ -242,22 +266,22 @@ Query Clause(String field) : { } { [ - LOOKAHEAD(2) + LOOKAHEAD(4) ( - fieldToken= {field=discardEscapeChar(fieldToken.image);} - | {field="*";} + [ ] + fieldToken= [ ] { field = discardEscapeChar(fieldToken.image); } + | [ ] { 
field = "*"; } ) ] - ( - q=Term(field) - | q=Query(field) ( boost=)? - + q=Term(field) + | [ ] q=Query(field) + [ ] + [ [ ] [ ] boost= ] ) - { return handleBoost(q, boost); } + { return handleBoost(q, boost); } } - Query Term(String field) : { Token term, boost=null, fuzzySlop=null, goop1, goop2; boolean prefix = false; @@ -269,46 +293,85 @@ Query Term(String field) : { Query q; } { + [ ] ( - ( - term= - | term= { wildcard=true; } - | term= { prefix=true; } - | term= { wildcard=true; } - | term= { regexp=true; } - | term= - | term= { term.image = term.image.substring(0,1); } - ) - [ fuzzySlop= { fuzzy=true; } ] - [ boost= [ fuzzySlop= { fuzzy=true; } ] ] - { - q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); - } - | ( ( {startInc=true;} | ) - ( goop1=|goop1= ) - [ ] - ( goop2=|goop2= ) - ( {endInc=true;} | )) - [ boost= ] - { - boolean startOpen=false; - boolean endOpen=false; - if (goop1.kind == RANGE_QUOTED) { - goop1.image = goop1.image.substring(1, goop1.image.length()-1); - } else if ("*".equals(goop1.image)) { - startOpen=true; - } - if (goop2.kind == RANGE_QUOTED) { - goop2.image = goop2.image.substring(1, goop2.image.length()-1); - } else if ("*".equals(goop2.image)) { - endOpen=true; - } - q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? 
null : discardEscapeChar(goop2.image), startInc, endInc); - } - | term= - [ fuzzySlop= ] - [ boost= ] - { q = handleQuotedTerm(field, term, fuzzySlop); } + ( + term= + | term= { wildcard=true; } + | term= { prefix=true; } + | term= { wildcard=true; } + | term= { regexp=true; } + | term= + | term= { term.image = term.image.substring(0,1); } + ) + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); } + + | ( { startInc = true; } | ) + ( [ ] ( goop1= | goop1= ) ) + [ [ ] ] + ( [ ] ( goop2= | goop2= ) ) + ( [ ] ( { endInc = true; } | ) ) + [ [ ] [ ] boost= ] + { + boolean startOpen=false; + boolean endOpen=false; + if (goop1.kind == RANGE_QUOTED) { + goop1.image = goop1.image.substring(1, goop1.image.length()-1); + } else if ("*".equals(goop1.image)) { + startOpen=true; + } + if (goop2.kind == RANGE_QUOTED) { + goop2.image = goop2.image.substring(1, goop2.image.length()-1); + } else if ("*".equals(goop2.image)) { + endOpen=true; + } + q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? 
null : discardEscapeChar(goop2.image), startInc, endInc); + } + + | term= + [ + [ ] + ( + [ ] boost= + [ [ ] fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } + [ [ ] [ ] boost= ] + ) + ] + { q = handleQuotedTerm(field, term, fuzzySlop); } ) { return handleBoost(q, boost); } } + +Query MultiTerm(String field) : { + Token term, whitespace, followingTerm; + Query q; +} +{ + [ ] + term= + // Both lookaheads are required; the first lookahead vets the first following term and the second lookahead vets the rest + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + ( + LOOKAHEAD({ getToken(1).kind == WHITESPACE_SEQ + && getToken(2).kind == TERM + && ( (getToken(3).kind != WHITESPACE_SEQ && allowedPostMultiTerm(getToken(3).kind)) + || (getToken(3).kind == WHITESPACE_SEQ && allowedPostMultiTerm(getToken(4).kind))) }) + whitespace= + followingTerm= + { term.image += whitespace.image + followingTerm.image; } + )+ + { return getFieldQuery(field, discardEscapeChar(term.image), false); } +} \ No newline at end of file diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java index c00d88e..cdfa477 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java @@ -465,6 +465,45 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer } /** + * Adds clauses generated from analysis over text containing whitespace. + * There are no operators, so the query's clauses can either be MUST (if the + * default operator is AND) or SHOULD (default OR). 
+ * + * If all of the clauses in the given Query are TermQuery-s, this method flattens the result + * by adding the TermQuery-s individually to the output clause list; otherwise, the given Query + * is added as a single clause including its nested clauses. + */ + protected void addMultiTermClauses(List clauses, Query q) { + // We might have been passed a null query; the term might have been + // filtered away by the analyzer. + if (q == null) { + return; + } + boolean allNestedTermQueries = false; + if (q instanceof BooleanQuery) { + allNestedTermQueries = true; + for (BooleanClause clause : ((BooleanQuery)q).clauses()) { + if ( ! (clause.getQuery() instanceof TermQuery)) { + allNestedTermQueries = false; + break; + } + } + } + if (allNestedTermQueries) { + clauses.addAll(((BooleanQuery)q).clauses()); + } else { + BooleanClause.Occur occur = operator == OR_OPERATOR ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST; + if (q instanceof BooleanQuery) { + for (BooleanClause clause : ((BooleanQuery)q).clauses()) { + clauses.add(newBooleanClause(clause.getQuery(), occur)); + } + } else { + clauses.add(newBooleanClause(q, occur)); + } + } + } + + /** * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java index 5b4eba8..201c8eb 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java @@ -18,6 +18,7 @@ package org.apache.lucene.queryparser.classic; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockSynonymAnalyzer; 
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -33,6 +34,7 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; @@ -311,17 +313,6 @@ public class TestQueryParser extends QueryParserTestBase { assertEquals(unexpanded, smart.parse("\"dogs\"")); } - // TODO: fold these into QueryParserTestBase - - /** adds synonym of "dog" for "dogs". */ - static class MockSynonymAnalyzer extends Analyzer { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - MockTokenizer tokenizer = new MockTokenizer(); - return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer)); - } - } - /** simple synonyms test */ public void testSynonyms() throws Exception { Query expected = new SynonymQuery(new Term("field", "dogs"), new Term("field", "dog")); @@ -483,4 +474,121 @@ public class TestQueryParser extends QueryParserTestBase { qp.parse("a*aaaaaaa"); }); } + + // TODO: Remove this specialization once the flexible standard parser gets multi-word synonym support + @Override + public void testQPA() throws Exception { + assertQueryEquals("term phrase term", qpAnalyzer, "term phrase1 phrase2 term"); + + CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); + setDefaultOperatorAND(cqpc); + assertQueryEquals(cqpc, "field", "term phrase term", "+term +phrase1 +phrase2 +term"); + } + + // TODO: Move to QueryParserTestBase once standard flexible parser gets this capability + public void testMultiWordSynonyms() throws Exception { + QueryParser dumb = new QueryParser("field", new Analyzer1()); + + // A multi-word synonym source will form a 
synonym query for the same-starting-position tokens + BooleanQuery.Builder multiWordExpandedBqBuilder = new BooleanQuery.Builder(); + Query multiWordSynonymQuery = new SynonymQuery(new Term("field", "guinea"), new Term("field", "cavy")); + multiWordExpandedBqBuilder.add(multiWordSynonymQuery, BooleanClause.Occur.SHOULD); + multiWordExpandedBqBuilder.add(new TermQuery(new Term("field", "pig")), BooleanClause.Occur.SHOULD); + Query multiWordExpandedBq = multiWordExpandedBqBuilder.build(); + assertEquals(multiWordExpandedBq, dumb.parse("guinea pig")); + + // With the phrase operator, a multi-word synonym source will form a multiphrase query. + // When the number of expanded term(s) is different from that of the original term(s), this is not good. + MultiPhraseQuery.Builder multiWordExpandedMpqBuilder = new MultiPhraseQuery.Builder(); + multiWordExpandedMpqBuilder.add(new Term[] { new Term("field", "guinea"), new Term("field", "cavy") }); + multiWordExpandedMpqBuilder.add(new Term("field", "pig")); + Query multiWordExpandedMPQ = multiWordExpandedMpqBuilder.build(); + assertEquals(multiWordExpandedMPQ, dumb.parse("\"guinea pig\"")); + + // custom behavior, the synonyms are expanded, unless you use quote operator + QueryParser smart = new SmartQueryParser(); + assertEquals(multiWordExpandedBq, smart.parse("guinea pig")); + + PhraseQuery.Builder multiWordUnexpandedPqBuilder = new PhraseQuery.Builder(); + multiWordUnexpandedPqBuilder.add(new Term("field", "guinea")); + multiWordUnexpandedPqBuilder.add(new Term("field", "pig")); + Query multiWordUnexpandedPq = multiWordUnexpandedPqBuilder.build(); + assertEquals(multiWordUnexpandedPq, smart.parse("\"guinea pig\"")); + } + + // TODO: Move to QueryParserTestBase once standard flexible parser gets this capability + public void testOperatorsAndMultiWordSynonyms() throws Exception { + Analyzer a = new MockSynonymAnalyzer(); + // Operators should interrupt multiword analysis of adjacent words if they associate + 
assertQueryEquals("+guinea pig", a, "+guinea pig"); + assertQueryEquals("-guinea pig", a, "-guinea pig"); + assertQueryEquals("!guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea* pig", a, "guinea* pig"); + assertQueryEquals("guinea? pig", a, "guinea? pig"); + assertQueryEquals("guinea~2 pig", a, "guinea~2 pig"); + assertQueryEquals("guinea^2 pig", a, "(guinea)^2.0 pig"); + + assertQueryEquals("guinea +pig", a, "guinea +pig"); + assertQueryEquals("guinea -pig", a, "guinea -pig"); + assertQueryEquals("guinea !pig", a, "guinea -pig"); + assertQueryEquals("guinea pig*", a, "guinea pig*"); + assertQueryEquals("guinea pig?", a, "guinea pig?"); + assertQueryEquals("guinea pig~2", a, "guinea pig~2"); + assertQueryEquals("guinea pig^2", a, "guinea (pig)^2.0"); + + assertQueryEquals("field:guinea pig", a, "guinea pig"); + assertQueryEquals("guinea field:pig", a, "guinea pig"); + + assertQueryEquals("NOT guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea NOT pig", a, "guinea -pig"); + + assertQueryEquals("guinea pig AND dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs AND guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + assertQueryEquals("guinea pig && dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs && guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + + assertQueryEquals("guinea pig OR dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs OR guinea pig", a, "Synonym(dog dogs) guinea pig"); + assertQueryEquals("guinea pig || dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs || guinea pig", a, "Synonym(dog dogs) guinea pig"); + + assertQueryEquals("\"guinea\" pig", a, "guinea pig"); + assertQueryEquals("guinea \"pig\"", a, "guinea pig"); + + assertQueryEquals("(guinea) pig", a, "guinea pig"); + assertQueryEquals("guinea (pig)", a, "guinea pig"); + + assertQueryEquals("/guinea/ pig", a, "/guinea/ pig"); + assertQueryEquals("guinea /pig/", a, "guinea /pig/"); + + // Operators 
should not interrupt multiword analysis if they don't associate + assertQueryEquals("(guinea pig)", a, "Synonym(cavy guinea) pig"); + assertQueryEquals("+(guinea pig)", a, "+(Synonym(cavy guinea) pig)"); + assertQueryEquals("-(guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("!(guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("NOT (guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("(guinea pig)^2", a, "(Synonym(cavy guinea) pig)^2.0"); + + assertQueryEquals("field:(guinea pig)", a, "Synonym(cavy guinea) pig"); + + assertQueryEquals("+small guinea pig", a, "+small Synonym(cavy guinea) pig"); + assertQueryEquals("-small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("!small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("NOT small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("small* guinea pig", a, "small* Synonym(cavy guinea) pig"); + assertQueryEquals("small? guinea pig", a, "small? 
Synonym(cavy guinea) pig"); + assertQueryEquals("\"small\" guinea pig", a, "small Synonym(cavy guinea) pig"); + + assertQueryEquals("guinea pig +running", a, "Synonym(cavy guinea) pig +running"); + assertQueryEquals("guinea pig -running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig !running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig NOT running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig running*", a, "Synonym(cavy guinea) pig running*"); + assertQueryEquals("guinea pig running?", a, "Synonym(cavy guinea) pig running?"); + assertQueryEquals("guinea pig \"running\"", a, "Synonym(cavy guinea) pig running"); + + assertQueryEquals("\"guinea pig\"~2", a, "\"(guinea cavy) pig\"~2"); + + assertQueryEquals("field:\"guinea pig\"", a, "\"(guinea cavy) pig\""); + } } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java index cc2ac12..afcbe9b 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java @@ -205,4 +205,15 @@ public class TestStandardQP extends QueryParserTestBase { //TODO test something like "SmartQueryParser()" } + // TODO: Remove this specialization once the flexible standard parser gets multi-word synonym support + @Override + public void testQPA() throws Exception { + super.testQPA(); + + assertQueryEquals("term phrase term", qpAnalyzer, "term (phrase1 phrase2) term"); + + CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); + setDefaultOperatorAND(cqpc); + assertQueryEquals(cqpc, "field", "term phrase term", "+term +(+phrase1 +phrase2) +term"); + } } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java 
b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index 70dc15a..5f52190 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -535,8 +535,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertQueryEquals("term -(stop) term", qpAnalyzer, "term term"); assertQueryEquals("drop AND stop AND roll", qpAnalyzer, "+drop +roll"); - assertQueryEquals("term phrase term", qpAnalyzer, - "term (phrase1 phrase2) term"); + +// TODO: Re-enable once flexible standard parser gets multi-word synonym support +// assertQueryEquals("term phrase term", qpAnalyzer, +// "term phrase1 phrase2 term"); assertQueryEquals("term AND NOT phrase term", qpAnalyzer, "+term -(phrase1 phrase2) term"); assertQueryEquals("stop^3", qpAnalyzer, ""); @@ -552,8 +554,9 @@ public abstract class QueryParserTestBase extends LuceneTestCase { CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); setDefaultOperatorAND(cqpc); - assertQueryEquals(cqpc, "field", "term phrase term", - "+term +(+phrase1 +phrase2) +term"); +// TODO: Re-enable once flexible standard parser gets multi-word synonym support +// assertQueryEquals(cqpc, "field", "term phrase term", +// "+term +phrase1 +phrase2 +term"); assertQueryEquals(cqpc, "field", "phrase", "+phrase1 +phrase2"); } @@ -1101,37 +1104,6 @@ public abstract class QueryParserTestBase extends LuceneTestCase { dir.close(); } - /** - * adds synonym of "dog" for "dogs". 
- */ - protected static class MockSynonymFilter extends TokenFilter { - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - boolean addSynonym = false; - - public MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public final boolean incrementToken() throws IOException { - if (addSynonym) { // inject our synonym - clearAttributes(); - termAtt.setEmpty().append("dog"); - posIncAtt.setPositionIncrement(0); - addSynonym = false; - return true; - } - - if (input.incrementToken()) { - addSynonym = termAtt.toString().equals("dogs"); - return true; - } else { - return false; - } - } - } - /** whitespace+lowercase analyzer with synonyms */ protected class Analyzer1 extends Analyzer { public Analyzer1(){ @@ -1251,10 +1223,8 @@ public abstract class QueryParserTestBase extends LuceneTestCase { CharacterRunAutomaton stopStopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton()); - CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); - - qp = getParserConfig( - new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); + CommonQueryParserConfiguration qp + = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); qp.setEnablePositionIncrements(true); PhraseQuery.Builder phraseQuery = new PhraseQuery.Builder(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymAnalyzer.java new file mode 100644 index 0000000..a2ce33e --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymAnalyzer.java @@ -0,0 +1,28 @@ +package org.apache.lucene.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** adds synonym of "dog" for "dogs", and synonym of "cavy" for "guinea pig". */ +public class MockSynonymAnalyzer extends Analyzer { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + MockTokenizer tokenizer = new MockTokenizer(); + return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer)); + } +} + diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java new file mode 100644 index 0000000..b50be07 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java @@ -0,0 +1,97 @@ +package org.apache.lucene.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.AttributeSource; + +/** adds synonym of "dog" for "dogs", and synonym of "cavy" for "guinea pig". */ +public class MockSynonymFilter extends TokenFilter { + CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + List tokenQueue = new ArrayList<>(); + boolean endOfInput = false; + + public MockSynonymFilter(TokenStream input) { + super(input); + } + + @Override + public void reset() throws IOException { + super.reset(); + tokenQueue.clear(); + endOfInput = false; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenQueue.size() > 0) { + tokenQueue.remove(0).copyTo(this); + return true; + } + if (endOfInput == false && input.incrementToken()) { + if (termAtt.toString().equals("dogs")) { + addSynonymAndRestoreOrigToken("dog", 1, offsetAtt.endOffset()); + } else if (termAtt.toString().equals("guinea")) { + AttributeSource firstSavedToken = cloneAttributes(); + if 
(input.incrementToken()) { + if (termAtt.toString().equals("pig")) { + AttributeSource secondSavedToken = cloneAttributes(); + int secondEndOffset = offsetAtt.endOffset(); + firstSavedToken.copyTo(this); + addSynonym("cavy", 2, secondEndOffset); + tokenQueue.add(secondSavedToken); + } else if (termAtt.toString().equals("dogs")) { + tokenQueue.add(cloneAttributes()); + addSynonym("dog", 1, offsetAtt.endOffset()); + } + } else { + endOfInput = true; + } + firstSavedToken.copyTo(this); + } + return true; + } else { + endOfInput = true; + return false; + } + } + private void addSynonym(String synonymText, int posLen, int endOffset) { + termAtt.setEmpty().append(synonymText); + posIncAtt.setPositionIncrement(0); + posLenAtt.setPositionLength(posLen); + offsetAtt.setOffset(offsetAtt.startOffset(), endOffset); + tokenQueue.add(cloneAttributes()); + } + private void addSynonymAndRestoreOrigToken(String synonymText, int posLen, int endOffset) { + AttributeSource origToken = cloneAttributes(); + addSynonym(synonymText, posLen, endOffset); + origToken.copyTo(this); + } +} + + diff --git a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java new file mode 100644 index 0000000..fb0d065 --- /dev/null +++ b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis; + +import java.io.IOException; + +/** test the mock synonym filter */ +public class TestMockSynonymFilter extends BaseTokenStreamTestCase { + + /** test the mock synonym filter */ + public void test() throws IOException { + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + MockTokenizer tokenizer = new MockTokenizer(); + return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer)); + } + }; + + assertAnalyzesTo(analyzer, "dogs", + new String[]{"dogs", "dog"}, + new int[]{0, 0}, // start offset + new int[]{4, 4}, // end offset + null, + new int[]{1, 0}, // position increment + new int[]{1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small dogs", + new String[]{"small", "dogs", "dog"}, + new int[]{0, 6, 6}, // start offset + new int[]{5, 10, 10}, // end offset + null, + new int[]{1, 1, 0}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "dogs running", + new String[]{"dogs", "dog", "running"}, + new int[]{0, 0, 5}, // start offset + new int[]{4, 4, 12}, // end offset + null, + new int[]{1, 0, 1}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small dogs running", + new String[]{"small", "dogs", "dog", "running"}, + new int[]{0, 6, 6, 11}, // start offset + new int[]{5, 10, 10, 18}, // end offset 
+ null, + new int[]{1, 1, 0, 1}, // position increment + new int[]{1, 1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea", + new String[]{"guinea"}, + new int[]{0}, // start offset + new int[]{6}, // end offset + null, + new int[]{1}, // position increment + new int[]{1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "pig", + new String[]{"pig"}, + new int[]{0}, // start offset + new int[]{3}, // end offset + null, + new int[]{1}, // position increment + new int[]{1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea pig", + new String[]{"guinea", "cavy", "pig"}, + new int[]{0, 0, 7}, // start offset + new int[]{6, 10, 10}, // end offset + null, + new int[]{1, 0, 1}, // position increment + new int[]{1, 2, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea dogs", + new String[]{"guinea", "dogs", "dog"}, + new int[]{0, 7, 7}, // start offset + new int[]{6, 11, 11}, // end offset + null, + new int[]{1, 1, 0}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "dogs guinea", + new String[]{"dogs", "dog", "guinea"}, + new int[]{0, 0, 5}, // start offset + new int[]{4, 4, 11}, // end offset + null, + new int[]{1, 0, 1}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "dogs guinea pig", + new String[]{"dogs", "dog", "guinea", "cavy", "pig"}, + new int[]{0, 0, 5, 5, 12}, // start offset + new int[]{4, 4, 11, 15, 15}, // end offset + null, + new int[]{1, 0, 1, 0, 1}, // position increment + new int[]{1, 1, 1, 2, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea pig dogs", + new String[]{"guinea", "cavy", "pig", "dogs", "dog"}, + new 
int[]{0, 0, 7, 11, 11}, // start offset + new int[]{6, 10, 10, 15, 15}, // end offset + null, + new int[]{1, 0, 1, 1, 0}, // position increment + new int[]{1, 2, 1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small dogs and guinea pig running", + new String[]{"small", "dogs", "dog", "and", "guinea", "cavy", "pig", "running"}, + new int[]{0, 6, 6, 11, 15, 15, 22, 26}, // start offset + new int[]{5, 10, 10, 14, 21, 25, 25, 33}, // end offset + null, + new int[]{1, 1, 0, 1, 1, 0, 1, 1}, // position increment + new int[]{1, 1, 1, 1, 1, 2, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small guinea pig and dogs running", + new String[]{"small", "guinea", "cavy", "pig", "and", "dogs", "dog", "running"}, + new int[]{0, 6, 6, 13, 17, 21, 21, 26}, // start offset + new int[]{5, 12, 16, 16, 20, 25, 25, 33}, // end offset + null, + new int[]{1, 1, 0, 1, 1, 1, 0, 1}, // position increment + new int[]{1, 1, 2, 1, 1, 1, 1, 1}, // position length + true); // check that offsets are correct + } +}