Index: solr/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java (revision 1030029) +++ solr/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java (working copy) @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.StringReader; +import java.lang.reflect.Field; import java.util.HashMap; import java.util.Map; @@ -26,8 +27,10 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.AutomatonQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.schema.IndexSchema; import org.apache.solr.search.SolrQueryParser; @@ -51,6 +54,8 @@ public void setUp() throws Exception { super.setUp(); schema = new IndexSchema(solrConfig, getSchemaFile(), null); + clearIndex(); + assertU(commit()); } @Test @@ -105,7 +110,7 @@ } @Test - public void testQueryParsing() throws IOException, ParseException { + public void testQueryParsing() throws Exception { SolrQueryParser parserOne = new SolrQueryParser(schema, "one"); assertTrue(parserOne.getAllowLeadingWildcard()); @@ -115,29 +120,54 @@ // XXX note: this should be false, but for now we return true for any field, // XXX if at least one field uses the reversing assertTrue(parserThree.getAllowLeadingWildcard()); - String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex"; - String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*"; - String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*"; - String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex"; - Query q = parserOne.parse(text); - assertEquals(expectedOne, q.toString()); - q = parserTwo.parse(text); - assertEquals(expectedTwo, q.toString()); - q = parserThree.parse(text); - assertEquals(expectedThree, q.toString()); + + // add some docs + assertU(adoc("id", "1", "one", "one")); + assertU(adoc("id", "2", "two", "two")); + assertU(adoc("id", "3", "three", "three")); + assertU(adoc("id", "4", "one", "four")); + assertU(adoc("id", "5", "two", "five")); + assertU(adoc("id", "6", "three", "si\uD834\uDD1Ex")); + assertU(commit()); + + assertQ("should have matched", + req("+id:1 +one:one"), + "//result[@numFound=1]"); + + assertQ("should have matched", + req("+id:4 +one:f*ur"), + "//result[@numFound=1]"); + + assertQ("should have matched", + req("+id:6 +three:*si\uD834\uDD1Ex"), + "//result[@numFound=1]"); + // test conditional reversal - String condText = "*hree t*ree th*ee thr*e ?hree t?ree th?ee th?*ee " + - "short*token ver*longtoken"; - String expected = "two:\u0001eerh* two:\u0001eer*t two:\u0001ee*ht " + - "two:thr*e " + - "two:\u0001eerh? two:\u0001eer?t " + - "two:th?ee " + - "two:th?*ee " + - "two:short*token " + - "two:\u0001nekotgnol*rev"; - q = parserTwo.parse(condText); - assertEquals(expected, q.toString()); + assertTrue(wasReversed(parserTwo, "*hree")); + assertTrue(wasReversed(parserTwo, "t*ree")); + assertTrue(wasReversed(parserTwo, "th*ee")); + assertFalse(wasReversed(parserTwo, "thr*e")); + assertTrue(wasReversed(parserTwo, "?hree")); + assertTrue(wasReversed(parserTwo, "t?ree")); + assertFalse(wasReversed(parserTwo, "th?ee")); + assertFalse(wasReversed(parserTwo, "th?*ee")); + assertFalse(wasReversed(parserTwo, "short*token")); + assertTrue(wasReversed(parserTwo, "ver*longtoken")); } + + /** fragile assert: depends on our implementation, but cleanest way to check for now */ + private boolean wasReversed(SolrQueryParser qp, String query) throws Exception { + Query q = qp.parse(query); + if (!(q instanceof AutomatonQuery)) + return false; + // this is a hack to get the protected Automaton field in AutomatonQuery, + // may break in later lucene versions - we have no getter... for good reasons. + final Field automatonField = AutomatonQuery.class.getDeclaredField("automaton"); + automatonField.setAccessible(true); + Automaton automaton = (Automaton) automatonField.get(q); + String prefix = SpecialOperations.getCommonPrefix(automaton); + return prefix.length() > 0 && prefix.charAt(0) == '\u0001'; + } @Test public void testFalsePositives() throws Exception { Index: solr/src/java/org/apache/solr/search/SolrQueryParser.java =================================================================== --- solr/src/java/org/apache/solr/search/SolrQueryParser.java (revision 1030029) +++ solr/src/java/org/apache/solr/search/SolrQueryParser.java (working copy) @@ -30,6 +30,7 @@ import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.lucene.analysis.Analyzer; import org.apache.solr.analysis.*; import org.apache.solr.common.SolrException; @@ -202,37 +203,36 @@ String type = schema.getFieldType(field).getTypeName(); ReversedWildcardFilterFactory factory = leadingWildcards.get(type); if (factory != null) { + Term term = new Term(field, termStr); + // fsa representing the query + Automaton automaton = WildcardQuery.toAutomaton(term); + // TODO: we should likely use the automaton to calculate shouldReverse, too. if (factory.shouldReverse(termStr)) { - int len = termStr.length(); - char[] chars = new char[len+1]; - chars[0] = factory.getMarkerChar(); - termStr.getChars(0, len, chars, 1); - ReversedWildcardFilter.reverse(chars, 1, len); - termStr = new String(chars); + automaton = BasicOperations.concatenate(automaton, BasicAutomata.makeChar(factory.getMarkerChar())); + SpecialOperations.reverse(automaton); } else { // reverse wildcardfilter is active: remove false positives - Term term = new Term(field, termStr); - // fsa representing the query - Automaton a = WildcardQuery.toAutomaton(term); // fsa representing false positives (markerChar*) Automaton falsePositives = BasicOperations.concatenate( BasicAutomata.makeChar(factory.getMarkerChar()), BasicAutomata.makeAnyString()); - return new AutomatonQuery(term, BasicOperations.minus(a, falsePositives)) { - // override toString so its completely transparent - @Override - public String toString(String field) { - StringBuilder buffer = new StringBuilder(); - if (!getField().equals(field)) { - buffer.append(getField()); - buffer.append(":"); - } - buffer.append(term.text()); - buffer.append(ToStringUtils.boost(getBoost())); - return buffer.toString(); + // subtract these away + automaton = BasicOperations.minus(automaton, falsePositives); + } + return new AutomatonQuery(term, automaton) { + // override toString so its completely transparent + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + if (!getField().equals(field)) { + buffer.append(getField()); + buffer.append(":"); } - }; - } + buffer.append(term.text()); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } + }; } Query q = super.getWildcardQuery(field, termStr); if (q instanceof WildcardQuery) { Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1030029) +++ lucene/CHANGES.txt (working copy) @@ -113,6 +113,11 @@ * LUCENE-2674: MultiTermQuery.TermCollector.collect now accepts the TermsEnum as well. (Robert Muir, Mike McCandless) +* LUCENE-588: WildcardQuery and QueryParser now allows escaping with + the '\' character. Previously this was impossible (you could not escape */?, + for example). If your code somehow depends on the old behavior, you will + need to change it (e.g. using "\\" to escape '\' itself). (Robert Muir) + Changes in Runtime Behavior * LUCENE-2650: The behavior of FSDirectory.open has changed. On 64-bit Index: lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java =================================================================== --- lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 1030029) +++ lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy) @@ -770,11 +770,11 @@ assertQueryEquals("a:b\\\\c*", a, "a:b\\c*"); - assertQueryEquals("a:b\\-?c", a, "a:b-?c"); - assertQueryEquals("a:b\\+?c", a, "a:b+?c"); - assertQueryEquals("a:b\\:?c", a, "a:b:?c"); + assertQueryEquals("a:b\\-?c", a, "a:b\\-?c"); + assertQueryEquals("a:b\\+?c", a, "a:b\\+?c"); + assertQueryEquals("a:b\\:?c", a, "a:b\\:?c"); - assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); + assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c"); assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); @@ -1062,6 +1062,12 @@ } + public void testEscapedWildcard() throws Exception { + QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockTokenizer.WHITESPACE, false)); + WildcardQuery q = new WildcardQuery(new Term("field", "foo\\?ba?r")); + assertEquals(q, qp.parse("foo\\?ba?r")); + } + public void testRegexps() throws Exception { QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockTokenizer.WHITESPACE, false)); RegexpQuery q = new RegexpQuery(new Term("field", "[a-z][123]")); Index: lucene/src/test/org/apache/lucene/search/TestWildcard.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestWildcard.java (revision 1030903) +++ lucene/src/test/org/apache/lucene/search/TestWildcard.java (working copy) @@ -205,6 +205,38 @@ indexStore.close(); } + /** + * Tests if wildcard escaping works + */ + public void testEscapes() throws Exception { + Directory indexStore = getIndexStore("field", + new String[]{"foo*bar", "foo??bar", "fooCDbar", "fooSOMETHINGbar", "foo\\"}); + IndexSearcher searcher = new IndexSearcher(indexStore, true); + + // without escape: matches foo??bar, fooCDbar, foo*bar, and fooSOMETHINGbar + WildcardQuery unescaped = new WildcardQuery(new Term("field", "foo*bar")); + assertMatches(searcher, unescaped, 4); + + // with escape: only matches foo*bar + WildcardQuery escaped = new WildcardQuery(new Term("field", "foo\\*bar")); + assertMatches(searcher, escaped, 1); + + // without escape: matches foo??bar and fooCDbar + unescaped = new WildcardQuery(new Term("field", "foo??bar")); + assertMatches(searcher, unescaped, 2); + + // with escape: matches foo??bar only + escaped = new WildcardQuery(new Term("field", "foo\\?\\?bar")); + assertMatches(searcher, escaped, 1); + + // check escaping at end: lenient parse yields "foo\" + WildcardQuery atEnd = new WildcardQuery(new Term("field", "foo\\")); + assertMatches(searcher, atEnd, 1); + + searcher.close(); + indexStore.close(); + } + private Directory getIndexStore(String field, String[] contents) throws IOException { Directory indexStore = newDirectory(); Index: lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java =================================================================== --- lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java (revision 1030029) +++ lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java (working copy) @@ -1045,7 +1045,7 @@ String termImage=discardEscapeChar(term.image); if (wildcard) { - q = getWildcardQuery(qfield, termImage); + q = getWildcardQuery(qfield, term.image); } else if (prefix) { q = getPrefixQuery(qfield, discardEscapeChar(term.image.substring Index: lucene/src/java/org/apache/lucene/search/WildcardQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/WildcardQuery.java (revision 1030029) +++ lucene/src/java/org/apache/lucene/search/WildcardQuery.java (working copy) @@ -45,6 +45,9 @@ /** Char equality with support for wildcards */ public static final char WILDCARD_CHAR = '?'; + /** Escape character */ + public static final char WILDCARD_ESCAPE = '\\'; + /** * Constructs a query for terms matching term. */ @@ -56,6 +59,7 @@ * Convert Lucene wildcard syntax into an automaton. * @lucene.internal */ + @SuppressWarnings("fallthrough") public static Automaton toAutomaton(Term wildcardquery) { List automata = new ArrayList(); @@ -63,6 +67,7 @@ for (int i = 0; i < wildcardText.length();) { final int c = wildcardText.codePointAt(i); + int length = Character.charCount(c); switch(c) { case WILDCARD_STRING: automata.add(BasicAutomata.makeAnyString()); @@ -70,10 +75,18 @@ case WILDCARD_CHAR: automata.add(BasicAutomata.makeAnyChar()); break; + case WILDCARD_ESCAPE: + // add the next codepoint instead, if it exists + if (i + length < wildcardText.length()) { + final int nextChar = wildcardText.codePointAt(i + length); + length += Character.charCount(nextChar); + automata.add(BasicAutomata.makeChar(nextChar)); + break; + } // else fallthru, lenient parsing with a trailing \ default: automata.add(BasicAutomata.makeChar(c)); } - i += Character.charCount(c); + i += length; } return BasicOperations.concatenate(automata); Index: lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 1030029) +++ lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (working copy) @@ -178,7 +178,7 @@ * Reverses the language of the given (non-singleton) automaton while returning * the set of new initial states. */ - static Set reverse(Automaton a) { + public static Set reverse(Automaton a) { a.expandSingleton(); // reverse all edges HashMap> m = new HashMap>(); Index: lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestQPHelper.java =================================================================== --- lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestQPHelper.java (revision 1030029) +++ lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestQPHelper.java (working copy) @@ -76,6 +76,7 @@ import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; +import org.junit.Ignore; /** * This test case is a copy of the core Lucene query parser test, it was adapted @@ -945,6 +946,15 @@ assertEscapedQueryEquals("&& abc &&", a, "\\&\\& abc \\&\\&"); } + @Ignore("contrib queryparser shouldn't escape wildcard terms") + public void testEscapedWildcard() throws Exception { + StandardQueryParser qp = new StandardQueryParser(); + qp.setAnalyzer(new MockAnalyzer(MockTokenizer.WHITESPACE, false)); + + WildcardQuery q = new WildcardQuery(new Term("field", "foo\\?ba?r")); + assertEquals(q, qp.parse("foo\\?ba?r", "field")); + } + public void testTabNewlineCarriageReturn() throws Exception { assertQueryEqualsDOA("+weltbank +worlbank", null, "+weltbank +worlbank"); Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/nodes/WildcardQueryNode.java =================================================================== --- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/nodes/WildcardQueryNode.java (revision 1030029) +++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/nodes/WildcardQueryNode.java (working copy) @@ -49,9 +49,9 @@ @Override public CharSequence toQueryString(EscapeQuerySyntax escaper) { if (isDefaultField(this.field)) { - return getTermEscaped(escaper); + return this.text; } else { - return this.field + ":" + getTermEscaped(escaper); + return this.field + ":" + this.text; } }