Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt	(revision 1479472)
+++ lucene/CHANGES.txt	(working copy)
@@ -103,6 +103,9 @@
 * LUCENE-4972: DirectoryTaxonomyWriter created empty commits even if no
   changes were made. (Shai Erera, Michael McCandless)
 
+* LUCENE-949: AnalyzingQueryParser can't work with leading wildcards.
+  (Tim Allison, Robert Muir, Steve Rowe)
+
 Optimizations
 
 * LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java	(revision 1479472)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java	(working copy)
@@ -19,8 +19,8 @@
 
 import java.io.IOException;
 import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
@@ -31,23 +31,18 @@
 
 /**
  * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
- * are also passed through the given analyzer, but wild card characters (like *)
- * don't get removed from the search terms.
+ * are also passed through the given analyzer, but wildcard characters <code>*</code> and
+ * <code>?</code> don't get removed from the search terms.
  * 
  * <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
  * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
  * will turn <code>Häuser</code> into <code>hau</code>, but <code>H?user</code> will
  * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
  * using this parser will be no improvement over QueryParser in such cases).
- *
  */
 public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
-
-  /**
-   * Constructs a query parser.
-   * @param field the default field for query terms.
-   * @param analyzer used to find terms in the query text.
-   */
+  // gobble escaped chars or find a wildcard character
+  private final Pattern wildcardPattern = Pattern.compile("(\\.)|([?*]+)");
   public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
     super(matchVersion, field, analyzer);
     setAnalyzeRangeTerms(true);
@@ -59,8 +54,8 @@
    * characters (like <code>*</code>), but is not a prefix term token (one
    * that has just a single <code>*</code> character at the end).
    * <p>
-   * Example: will be called for <code>H?user</code> or for <code>H*user</code>
-   * but not for <code>*user</code>.
+   * Example: will be called for <code>H?user</code> or for <code>H*user</code>.
+   *
    * <p>
    * Depending on analyzer and settings, a wildcard term may (most probably will)
    * be lower-cased automatically. It will go through the default Analyzer.
@@ -68,113 +63,52 @@
    * Overrides super class, by passing terms through analyzer.
    *
    * @param field Name of the field query will use.
-   * @param termStr Term token that contains one or more wild card
+   * @param termStr Term that contains one or more wildcard
    *   characters (? or *), but is not simple prefix term
    *
    * @return Resulting {@link Query} built for the term
    */
   @Override
   protected Query getWildcardQuery(String field, String termStr) throws ParseException {
-    List<String> tlist = new ArrayList<String>();
-    List<String> wlist = new ArrayList<String>();
-    /* somewhat a hack: find/store wildcard chars
-     * in order to put them back after analyzing */
-    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
-    StringBuilder tmpBuffer = new StringBuilder();
-    char[] chars = termStr.toCharArray();
-    for (int i = 0; i < termStr.length(); i++) {
-      if (chars[i] == '?' || chars[i] == '*') {
-        if (isWithinToken) {
-          tlist.add(tmpBuffer.toString());
-          tmpBuffer.setLength(0);
-        }
-        isWithinToken = false;
-      } else {
-        if (!isWithinToken) {
-          wlist.add(tmpBuffer.toString());
-          tmpBuffer.setLength(0);
-        }
-        isWithinToken = true;
-      }
-      tmpBuffer.append(chars[i]);
+
+    if (termStr == null){
+      //can't imagine this would ever happen
+      throw new ParseException("Passed null value as term to getWildcardQuery");
     }
-    if (isWithinToken) {
-      tlist.add(tmpBuffer.toString());
-    } else {
-      wlist.add(tmpBuffer.toString());
+    if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
+      throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
+          + " unless getAllowLeadingWildcard() returns true");
     }
-
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source;
-    int countTokens = 0;
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      source.reset();
-    } catch (IOException e1) {
-      throw new RuntimeException(e1);
-    }
-    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-    while (true) {
-      try {
-        if (!source.incrementToken()) break;
-      } catch (IOException e) {
-        break;
+    Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
+    StringBuilder sb = new StringBuilder();
+    int last = 0;
+
+    while (wildcardMatcher.find()){
+      // continue if escaped char
+      if (wildcardMatcher.group(1) != null){
+        continue;
       }
-      String term = termAtt.toString();
-      if (!"".equals(term)) {
-        try {
-          tlist.set(countTokens++, term);
-        } catch (IndexOutOfBoundsException ioobe) {
-          countTokens = -1;
-        }
+
+      if (wildcardMatcher.start() > 0){
+        String chunk = termStr.substring(last, wildcardMatcher.start());
+        String analyzed = analyzeSingleChunk(field, termStr, chunk);
+        sb.append(analyzed);
       }
+      //append the wildcard character
+      sb.append(wildcardMatcher.group(2));
+
+      last = wildcardMatcher.end();
     }
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
+    if (last < termStr.length()){
+      sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
     }
-
-    if (countTokens != tlist.size()) {
-      /* this means that the analyzer used either added or consumed
-       * (common for a stemmer) tokens, and we can't build a WildcardQuery */
-      throw new ParseException("Cannot build WildcardQuery with analyzer "
-          + getAnalyzer().getClass() + " - tokens added or lost");
-    }
-
-    if (tlist.size() == 0) {
-      return null;
-    } else if (tlist.size() == 1) {
-      if (wlist != null && wlist.size() == 1) {
-        /* if wlist contains one wildcard, it must be at the end, because:
-         * 1) wildcards are not allowed in 1st position of a term by QueryParser
-         * 2) if wildcard was *not* in end, there would be *two* or more tokens */
-        return super.getWildcardQuery(field, tlist.get(0)
-            + wlist.get(0).toString());
-      } else {
-        /* we should never get here! if so, this method was called
-         * with a termStr containing no wildcard ... */
-        throw new IllegalArgumentException("getWildcardQuery called without wildcard");
-      }
-    } else {
-      /* the term was tokenized, let's rebuild to one token
-       * with wildcards put back in postion */
-      StringBuilder sb = new StringBuilder();
-      for (int i = 0; i < tlist.size(); i++) {
-        sb.append( tlist.get(i));
-        if (wlist != null && wlist.size() > i) {
-          sb.append(wlist.get(i));
-        }
-      }
-      return super.getWildcardQuery(field, sb.toString());
-    }
+    return super.getWildcardQuery(field, sb.toString());
   }
-  
+
   /**
    * Called when parser parses an input term
-   * token that uses prefix notation; that is, contains a single '*' wildcard
+   * that uses prefix notation; that is, contains a single '*' wildcard
    * character as its last character. Since this is a special case
    * of generic wildcard term, and such a query can be optimized easily,
    * this usually results in a different query object.
@@ -185,48 +119,15 @@
    * Overrides super class, by passing terms through analyzer.
    *
    * @param field Name of the field query will use.
-   * @param termStr Term token to use for building term for the query
+   * @param termStr Term to use for building term for the query
    *    (<em>without</em> trailing '*' character!)
    *
    * @return Resulting {@link Query} built for the term
    */
   @Override
   protected Query getPrefixQuery(String field, String termStr) throws ParseException {
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source;
-    List<String> tlist = new ArrayList<String>();
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      source.reset();
-    } catch (IOException e1) {
-      throw new RuntimeException(e1);
-    }
-    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-    while (true) {
-      try {
-        if (!source.incrementToken()) break;
-      } catch (IOException e) {
-        break;
-      }
-      tlist.add(termAtt.toString());
-    }
-
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (tlist.size() == 1) {
-      return super.getPrefixQuery(field, tlist.get(0));
-    } else {
-      /* this means that the analyzer used either added or consumed
-       * (common for a stemmer) tokens, and we can't build a PrefixQuery */
-      throw new ParseException("Cannot build PrefixQuery with analyzer "
-          + getAnalyzer().getClass()
-          + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
-    }
+    String analyzed = analyzeSingleChunk(field, termStr, termStr);
+    return super.getPrefixQuery(field, analyzed);
   }
 
   /**
@@ -245,35 +146,66 @@
   @Override
   protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
      throws ParseException {
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source = null;
-    String nextToken = null;
-    boolean multipleTokens = false;
-
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-      source.reset();
-      if (source.incrementToken()) {
-        nextToken = termAtt.toString();
+
+    String analyzed = analyzeSingleChunk(field, termStr, termStr);
+    return super.getFuzzyQuery(field, analyzed, minSimilarity);
+  }
+
+  /**
+   * Returns the analyzed form for the given chunk
+   * 
+   * If the analyzer produces more than one output token from the given chunk,
+   * a ParseException is thrown.
+   *
+   * @param field The target field
+   * @param termStr The full term from which the given chunk is excerpted
+   * @param chunk The portion of the given termStr to be analyzed
+   * @return The result of analyzing the given chunk
+   * @throws ParseException when analysis returns other than one output token
+   */
+  protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
+    String analyzed = null;
+    TokenStream stream = null;
+    try{
+      stream = getAnalyzer().tokenStream(field, new StringReader(chunk));
+      stream.reset();
+      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+      // get first and hopefully only output token
+      if (stream.incrementToken()) {
+        analyzed = termAtt.toString();
+
+        // try to increment again, there should only be one output token
+        StringBuilder multipleOutputs = null;
+        while (stream.incrementToken()) {
+          if (null == multipleOutputs) {
+            multipleOutputs = new StringBuilder();
+            multipleOutputs.append('"');
+            multipleOutputs.append(analyzed);
+            multipleOutputs.append('"');
+          }
+          multipleOutputs.append(',');
+          multipleOutputs.append('"');
+          multipleOutputs.append(termAtt.toString());
+          multipleOutputs.append('"');
+        }
+        stream.end();
+        stream.close();
+        if (null != multipleOutputs) {
+          throw new ParseException(
+              String.format(getLocale(),
+                  "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
+        }
+      } else {
+        // nothing returned by analyzer. Was it a stop word and the user accidentally
+        // used an analyzer with stop words?
+        stream.end();
+        stream.close();
+        throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
       }
-      multipleTokens = source.incrementToken();
-    } catch (IOException e) {
-      nextToken = null;
+    } catch (IOException e){
+      throw new ParseException(
+          String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
     }
-
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (multipleTokens) {
-      throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
-          + " - tokens were added");
-    }
-
-    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
+    return analyzed;
   }
 }
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java	(revision 1479472)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java	(working copy)
@@ -19,8 +19,17 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Map;
+import java.util.TreeMap;
 
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockBytesAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -33,11 +42,14 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 
 /**
  */
+@SuppressCodecs("Lucene3x") // binary terms
 public class TestAnalyzingQueryParser extends LuceneTestCase {
-  
+  private final static String FIELD = "field";
+  
   private Analyzer a;
 
   private String[] wildcardInput;
@@ -49,12 +61,15 @@
   private String[] fuzzyInput;
   private String[] fuzzyExpected;
 
+  private Map<String,String> wildcardEscapeHits = new TreeMap<String,String>();
+  private Map<String,String> wildcardEscapeMisses = new TreeMap<String,String>();
+
   @Override
   public void setUp() throws Exception {
     super.setUp();
-    wildcardInput = new String[] { "übersetzung über*ung",
+    wildcardInput = new String[] { "*bersetzung über*ung",
         "Mötley Cr\u00fce Mötl?* Crü?", "Renée Zellweger Ren?? Zellw?ger" };
-    wildcardExpected = new String[] { "ubersetzung uber*ung", "motley crue motl?* cru?",
+    wildcardExpected = new String[] { "*bersetzung uber*ung", "motley crue motl?* cru?",
         "renee zellweger ren?? zellw?ger" };
 
     prefixInput = new String[] { "übersetzung übersetz*",
@@ -71,43 +86,138 @@
     fuzzyExpected = new String[] { "ubersetzung ubersetzung~1",
         "motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" };
 
+    wildcardEscapeHits.put("mö*tley", "moatley");
+
+    // need to have at least one genuine wildcard to trigger the wildcard analysis
+    // hence the * before the y
+    wildcardEscapeHits.put("mö\\*tl*y", "mo*tley");
+
+    // escaped backslash then true wildcard
+    wildcardEscapeHits.put("mö\\\\*tley", "mo\\atley");
+
+    // escaped wildcard then true wildcard
+    wildcardEscapeHits.put("mö\\??ley", "mo?tley");
+
+    // the first is an escaped * which should yield a miss
+    wildcardEscapeMisses.put("mö\\*tl*y", "moatley");
+
     a = new ASCIIAnalyzer();
   }
 
+  public void testSingleChunkExceptions() {
+    boolean ex = false;
+    String termStr = "the*tre";
+    Analyzer stopsAnalyzer = new MockAnalyzer
+        (random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
+    try {
+      String q = parseWithAnalyzingQueryParser(termStr, stopsAnalyzer, true);
+    } catch (ParseException e){
+      if (e.getMessage().contains("returned nothing")){
+        ex = true;
+      }
+    }
+    assertEquals("Should have returned nothing", true, ex);
+
+    ex = false;
+    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+    try{
+      qp.analyzeSingleChunk(FIELD, "", "not a single chunk");
+    } catch (ParseException e){
+      if (e.getMessage().contains("multiple terms")){
+        ex = true;
+      }
+    }
+    assertEquals("Should have produced multiple terms", true, ex);
+  }
+
+  public void testWildcardAlone() throws ParseException {
+    //seems like crazy edge case, but can be useful in concordance
+    boolean pex = false;
+    try{
+      Query q = getAnalyzedQuery("*", a, false);
+    } catch (ParseException e){
+      pex = true;
+    }
+    assertEquals("Wildcard alone with allowWildcard=false", true, pex);
+
+    pex = false;
+    try {
+      String qString = parseWithAnalyzingQueryParser("*", a, true);
+      assertEquals("Every word", "*", qString);
+    } catch (ParseException e){
+      pex = true;
+    }
+    assertEquals("Wildcard alone with allowWildcard=true", false, pex);
+  }
+
+  public void testWildCardEscapes() throws ParseException, IOException {
+
+    for (Map.Entry<String,String> entry : wildcardEscapeHits.entrySet()){
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertEquals("WildcardEscapeHits: " + entry.getKey(), true, isAHit(q, entry.getValue(), a));
+    }
+    for (Map.Entry<String,String> entry : wildcardEscapeMisses.entrySet()){
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertEquals("WildcardEscapeMisses: " + entry.getKey(), false, isAHit(q, entry.getValue(), a));
+    }
+  }
+
+  public void testWildCardQueryNoLeadingAllowed() {
+    boolean ex = false;
+    try{
+      String q = parseWithAnalyzingQueryParser(wildcardInput[0], a, false);
+    } catch (ParseException e){
+      ex = true;
+    }
+    assertEquals("Testing initial wildcard not allowed",
+        true, ex);
+  }
+
   public void testWildCardQuery() throws ParseException {
     for (int i = 0; i < wildcardInput.length; i++) {
       assertEquals("Testing wildcards with analyzer " + a.getClass() + ", input string: "
-          + wildcardInput[i], wildcardExpected[i], parseWithAnalyzingQueryParser(wildcardInput[i], a));
+          + wildcardInput[i], wildcardExpected[i], parseWithAnalyzingQueryParser(wildcardInput[i], a, true));
     }
   }
 
+
   public void testPrefixQuery() throws ParseException {
     for (int i = 0; i < prefixInput.length; i++) {
       assertEquals("Testing prefixes with analyzer " + a.getClass() + ", input string: "
-          + prefixInput[i], prefixExpected[i], parseWithAnalyzingQueryParser(prefixInput[i], a));
+          + prefixInput[i], prefixExpected[i], parseWithAnalyzingQueryParser(prefixInput[i], a, false));
     }
   }
 
   public void testRangeQuery() throws ParseException {
     for (int i = 0; i < rangeInput.length; i++) {
       assertEquals("Testing ranges with analyzer " + a.getClass() + ", input string: "
-          + rangeInput[i], rangeExpected[i], parseWithAnalyzingQueryParser(rangeInput[i], a));
+          + rangeInput[i], rangeExpected[i], parseWithAnalyzingQueryParser(rangeInput[i], a, false));
     }
   }
 
   public void testFuzzyQuery() throws ParseException {
     for (int i = 0; i < fuzzyInput.length; i++) {
       assertEquals("Testing fuzzys with analyzer " + a.getClass() + ", input string: "
-          + fuzzyInput[i], fuzzyExpected[i], parseWithAnalyzingQueryParser(fuzzyInput[i], a));
+          + fuzzyInput[i], fuzzyExpected[i], parseWithAnalyzingQueryParser(fuzzyInput[i], a, false));
     }
   }
 
-  private String parseWithAnalyzingQueryParser(String s, Analyzer a) throws ParseException {
-    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "field", a);
+
+  private String parseWithAnalyzingQueryParser(String s, Analyzer a, boolean allowLeadingWildcard) throws ParseException {
+    Query q = getAnalyzedQuery(s, a, allowLeadingWildcard);
+    return q.toString(FIELD);
+  }
+
+  private Query getAnalyzedQuery(String s, Analyzer a, boolean allowLeadingWildcard) throws ParseException {
+    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+    qp.setAllowLeadingWildcard(allowLeadingWildcard);
     org.apache.lucene.search.Query q = qp.parse(s);
-    return q.toString("field");
+    return q;
   }
-  
+
   final static class FoldingFilter extends TokenFilter {
     final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -144,31 +254,45 @@
   final static class ASCIIAnalyzer extends Analyzer {
     @Override
     public TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+      Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
       return new TokenStreamComponents(result, new FoldingFilter(result));
     }
   }
-  
+
+  // LUCENE-4176
   public void testByteTerms() throws Exception {
+    String s = "เข";
+    Analyzer analyzer = new MockBytesAnalyzer();
+    QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, analyzer);
+    Query q = qp.parse("[เข TO เข]");
+    assertEquals(true, isAHit(q, s, analyzer));
+  }
+
+  private boolean isAHit(Query q, String content, Analyzer analyzer) throws IOException{
     Directory ramDir = newDirectory();
-    Analyzer analyzer = new MockBytesAnalyzer();
     RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
     Document doc = new Document();
     FieldType fieldType = new FieldType();
     fieldType.setIndexed(true);
     fieldType.setTokenized(true);
     fieldType.setStored(true);
-    Field field = new Field("content","เข", fieldType);
+    Field field = new Field(FIELD, content, fieldType);
     doc.add(field);
     writer.addDocument(doc);
     writer.close();
     DirectoryReader ir = DirectoryReader.open(ramDir);
-    IndexSearcher is = newSearcher(ir);
-    QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "content", analyzer);
-    Query q = qp.parse("[เข TO เข]");
-    assertEquals(1, is.search(q, 10).totalHits);
+    IndexSearcher is = new IndexSearcher(ir);
+
+    int hits = is.search(q, 10).totalHits;
     ir.close();
     ramDir.close();
+    if (hits == 1){
+      return true;
+    } else {
+      return false;
+    }
   }
 }
\ No newline at end of file
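
For anyone reviewing the patch who wants to try the new behavior without running the whole test class, here is a minimal, self-contained sketch. It is not part of the patch: it assumes a Lucene 4.x build with this change applied plus lucene-analyzers-common on the classpath, and the field name, match version, and query strings are illustrative assumptions only.

    // Illustrative usage only -- not part of the patch. Assumes Lucene 4.x with
    // this change applied and lucene-analyzers-common available; "field",
    // Version.LUCENE_44, and the query strings are arbitrary examples.
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.Version;

    public class LeadingWildcardDemo {
      public static void main(String[] args) throws Exception {
        AnalyzingQueryParser qp = new AnalyzingQueryParser(
            Version.LUCENE_44, "field", new StandardAnalyzer(Version.LUCENE_44));

        // Non-wildcard chunks are passed through the analyzer (lower-cased by
        // StandardAnalyzer here), while ? and * survive untouched.
        Query q1 = qp.parse("Übersetz*ng");
        System.out.println(q1); // e.g. field:übersetz*ng (analyzer-dependent)

        // New with this patch: a leading wildcard fails fast with a clear
        // ParseException by default, and works once explicitly enabled.
        qp.setAllowLeadingWildcard(true);
        Query q2 = qp.parse("*bersetzung");
        System.out.println(q2); // e.g. field:*bersetzung
      }
    }

The design point worth noting: the old implementation walked the term character by character and re-assembled analyzed tokens around the stored wildcards, which implicitly assumed the term could not begin with a wildcard. The regex-chunking approach in getWildcardQuery analyzes each non-wildcard chunk independently via analyzeSingleChunk, so the leading-wildcard case (and escaped wildcards such as \*) falls out naturally.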