Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java	(revision 1475838)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java	(working copy)
@@ -19,156 +19,273 @@
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Map;
+import java.util.TreeMap;
 
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 
 /**
  */
+@SuppressCodecs("Lucene3x") // binary terms
 public class TestAnalyzingQueryParser extends LuceneTestCase {
+  private final static String FIELD = "field";
+
+  private Analyzer a;
-  private Analyzer a;
 
+  private String[] wildcardInput;
+  private String[] wildcardExpected;
+  private String[] prefixInput;
+  private String[] prefixExpected;
+  private String[] rangeInput;
+  private String[] rangeExpected;
+  private String[] fuzzyInput;
+  private String[] fuzzyExpected;
-  private String[] wildcardInput;
-  private String[] wildcardExpected;
-  private String[] prefixInput;
-  private String[] prefixExpected;
-  private String[] rangeInput;
-  private String[] rangeExpected;
-  private String[] fuzzyInput;
-  private String[] fuzzyExpected;
 
+  private Map<String,String> wildcardEscapeHits = new TreeMap<String,String>();
+  private Map<String,String> wildcardEscapeMisses = new TreeMap<String,String>();
 
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    wildcardInput = new String[] { "übersetzung über*ung",
-        "Mötley Cr\u00fce Mötl?* Crü?", "Renée Zellweger Ren?? Zellw?ger" };
-    wildcardExpected = new String[] { "ubersetzung uber*ung", "motley crue motl?* cru?",
-        "renee zellweger ren?? zellw?ger" };
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    wildcardInput = new String[] { "*bersetzung über*ung",
+        "Mötley Cr\u00fce Mötl?* Crü?", "Renée Zellweger Ren?? Zellw?ger" };
+    wildcardExpected = new String[] { "*bersetzung uber*ung", "motley crue motl?* cru?",
+        "renee zellweger ren?? zellw?ger" };
 
-    prefixInput = new String[] { "übersetzung übersetz*",
-        "Mötley Crüe Mötl* crü*", "René? Zellw*" };
-    prefixExpected = new String[] { "ubersetzung ubersetz*", "motley crue motl* cru*",
-        "rene? zellw*" };
+    prefixInput = new String[] { "übersetzung übersetz*",
+        "Mötley Crüe Mötl* crü*", "René? Zellw*" };
+    prefixExpected = new String[] { "ubersetzung ubersetz*", "motley crue motl* cru*",
+        "rene? zellw*" };
 
-    rangeInput = new String[] { "[aa TO bb]", "{Anaïs TO Zoé}" };
-    rangeExpected = new String[] { "[aa TO bb]", "{anais TO zoe}" };
+    rangeInput = new String[] { "[aa TO bb]", "{Anaïs TO Zoé}" };
+    rangeExpected = new String[] { "[aa TO bb]", "{anais TO zoe}" };
 
-    fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9",
-        "Mötley Crüe Mötley~0.75 Crüe~0.5",
-        "Renée Zellweger Renée~0.9 Zellweger~" };
-    fuzzyExpected = new String[] { "ubersetzung ubersetzung~1",
-        "motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" };
+    fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9",
+        "Mötley Crüe Mötley~0.75 Crüe~0.5",
+        "Renée Zellweger Renée~0.9 Zellweger~" };
+    fuzzyExpected = new String[] { "ubersetzung ubersetzung~1",
+        "motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" };
 
-    a = new ASCIIAnalyzer();
-  }
+    wildcardEscapeHits.put("mö*tley", "moatley");
 
-  public void testWildCardQuery() throws ParseException {
-    for (int i = 0; i < wildcardInput.length; i++) {
-      assertEquals("Testing wildcards with analyzer " + a.getClass() + ", input string: "
-          + wildcardInput[i], wildcardExpected[i], parseWithAnalyzingQueryParser(wildcardInput[i], a));
-    }
-  }
+    //need to have at least one genuine wildcard to trigger the wildcard analysis
+    //hence the * before the y
+    wildcardEscapeHits.put("mö\\*tl*y", "mo*tley");
 
-  public void testPrefixQuery() throws ParseException {
-    for (int i = 0; i < prefixInput.length; i++) {
-      assertEquals("Testing prefixes with analyzer " + a.getClass() + ", input string: "
-          + prefixInput[i], prefixExpected[i], parseWithAnalyzingQueryParser(prefixInput[i], a));
-    }
-  }
+    //escaped backslash then true wildcard
+    wildcardEscapeHits.put("mö\\\\*tley", "mo\\atley");
 
-  public void testRangeQuery() throws ParseException {
-    for (int i = 0; i < rangeInput.length; i++) {
-      assertEquals("Testing ranges with analyzer " + a.getClass() + ", input string: "
-          + rangeInput[i], rangeExpected[i], parseWithAnalyzingQueryParser(rangeInput[i], a));
-    }
-  }
+    //the first * is escaped and should therefore yield a miss
+    wildcardEscapeMisses.put("mö\\*tl*y", "moatley");
+
+    a = new ASCIIAnalyzer();
+  }
 
-  public void testFuzzyQuery() throws ParseException {
-    for (int i = 0; i < fuzzyInput.length; i++) {
-      assertEquals("Testing fuzzys with analyzer " + a.getClass() + ", input string: "
-          + fuzzyInput[i], fuzzyExpected[i], parseWithAnalyzingQueryParser(fuzzyInput[i], a));
-    }
-  }
+  public void testSingleChunkExceptions() {
+    boolean ex = false;
+    String chunk = "the*tre";
+
+    //redundant stop-set initialization, to make explicit that this analyzer uses stop words
+    Analyzer stopsAnalyzer = new StandardAnalyzer(TEST_VERSION_CURRENT, StandardAnalyzer.STOP_WORDS_SET);
+
+    try {
+      String q = parseWithAnalyzingQueryParser(chunk, stopsAnalyzer, true);
+    } catch (ParseException e) {
+      if (e.getMessage().contains("returned nothing")) {
+        ex = true;
+      }
+    }
+    assertEquals("Chunklet dropped", true, ex);
+    ex = false;
+
+    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+    try {
+      qp.analyzeSingleChunk(FIELD, "", "not a single chunk");
+    } catch (ParseException e) {
+      if (e.getMessage().contains("multiple terms")) {
+        ex = true;
+      }
+    }
+    assertEquals("Chunklet split", true, ex);
+  }
+
+  public void testWildcardAlone() throws ParseException {
+    //seems like a crazy edge case, but it can be useful in concordance tasks
+    boolean pex = false;
+    try {
+      Query q = getAnalyzedQuery("*", a, false);
+    } catch (ParseException e) {
+      pex = true;
+    }
+    assertEquals("Wildcard alone with allowWildcard=false", true, pex);
+
+    pex = false;
+    try {
+      String qString = parseWithAnalyzingQueryParser("*", a, true);
+      assertEquals("Every word", "*", qString);
+    } catch (ParseException e) {
+      pex = true;
+    }
+
+    assertEquals("Wildcard alone with allowWildcard=true", false, pex);
-  private String parseWithAnalyzingQueryParser(String s, Analyzer a) throws ParseException {
-    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "field", a);
-    org.apache.lucene.search.Query q = qp.parse(s);
-    return q.toString("field");
-  }
-
-  final static class FoldingFilter extends TokenFilter {
-    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  }
+  public void testWildCardEscapes() throws ParseException, IOException {
-    public FoldingFilter(TokenStream input) {
-      super(input);
-    }
+    for (Map.Entry<String,String> entry : wildcardEscapeHits.entrySet()) {
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertEquals("WildcardEscapeHits: " + entry.getKey(), true, isAHit(q, entry.getValue(), a));
+    }
+    for (Map.Entry<String,String> entry : wildcardEscapeMisses.entrySet()) {
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertEquals("WildcardEscapeMisses: " + entry.getKey(), false, isAHit(q, entry.getValue(), a));
    }
-    @Override
-    public boolean incrementToken() throws IOException {
-      if (input.incrementToken()) {
-        char term[] = termAtt.buffer();
-        for (int i = 0; i < term.length; i++)
-          switch(term[i]) {
-            case 'ü':
-              term[i] = 'u';
-              break;
-            case 'ö':
-              term[i] = 'o';
-              break;
-            case 'é':
-              term[i] = 'e';
-              break;
-            case 'ï':
-              term[i] = 'i';
-              break;
-          }
-        return true;
+  }
+  public void testWildCardQueryNoLeadingAllowed() {
+    boolean ex = false;
+    try {
+      String q = parseWithAnalyzingQueryParser(wildcardInput[0], a, false);
+
+    } catch (ParseException e) {
+      ex = true;
+    }
+    assertEquals("Testing initial wildcard not allowed",
+        true, ex);
+  }
+
+  public void testWildCardQuery() throws ParseException {
+    for (int i = 0; i < wildcardInput.length; i++) {
+      assertEquals("Testing wildcards with analyzer " + a.getClass() + ", input string: "
+          + wildcardInput[i], wildcardExpected[i], parseWithAnalyzingQueryParser(wildcardInput[i], a, true));
+    }
+  }
+
+  public void testPrefixQuery() throws ParseException {
+    for (int i = 0; i < prefixInput.length; i++) {
+      assertEquals("Testing prefixes with analyzer " + a.getClass() + ", input string: "
+          + prefixInput[i], prefixExpected[i], parseWithAnalyzingQueryParser(prefixInput[i], a, false));
+    }
+  }
+
+  public void testRangeQuery() throws ParseException {
+    for (int i = 0; i < rangeInput.length; i++) {
+      assertEquals("Testing ranges with analyzer " + a.getClass() + ", input string: "
+          + rangeInput[i], rangeExpected[i], parseWithAnalyzingQueryParser(rangeInput[i], a, false));
    }
+  }
+
+  public void testFuzzyQuery() throws ParseException {
+    for (int i = 0; i < fuzzyInput.length; i++) {
+      assertEquals("Testing fuzzy queries with analyzer " + a.getClass() + ", input string: "
+          + fuzzyInput[i], fuzzyExpected[i], parseWithAnalyzingQueryParser(fuzzyInput[i], a, false));
+    }
+  }
+
+  private String parseWithAnalyzingQueryParser(String s, Analyzer a, boolean allowLeadingWildcard) throws ParseException {
+    Query q = getAnalyzedQuery(s, a, allowLeadingWildcard);
+    return q.toString(FIELD);
+  }
+
+  private Query getAnalyzedQuery(String s, Analyzer a, boolean allowLeadingWildcard) throws ParseException {
+    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+    qp.setAllowLeadingWildcard(allowLeadingWildcard);
+    org.apache.lucene.search.Query q = qp.parse(s);
+    return q;
+  }
+
+  final static class FoldingFilter extends TokenFilter {
+    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+    public FoldingFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        char term[] = termAtt.buffer();
+        for (int i = 0; i < term.length; i++)
+          switch(term[i]) {
+            case 'ü':
+              term[i] = 'u';
+              break;
+            case 'ö':
+              term[i] = 'o';
+              break;
+            case 'é':
+              term[i] = 'e';
+              break;
+            case 'ï':
+              term[i] = 'i';
+              break;
+          }
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  final static class ASCIIAnalyzer extends Analyzer {
+    @Override
+    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
+      return new TokenStreamComponents(result, new FoldingFilter(result));
+    }
+  }
+
+  // LUCENE-4176
+  public void testByteTerms() throws Exception {
+    String s = "เข";
+    Analyzer analyzer = new MockBytesAnalyzer();
+    QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, analyzer);
+    Query q = qp.parse("[เข TO เข]");
+    assertEquals(true, isAHit(q, s, analyzer));
+  }
+
+  private boolean isAHit(Query q, String content, Analyzer analyzer) throws IOException {
+    Directory ramDir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
+    Document doc = new Document();
+    FieldType fieldType = new FieldType();
+    fieldType.setIndexed(true);
+    fieldType.setTokenized(true);
+    fieldType.setStored(true);
+    Field field = new Field(FIELD, content, fieldType);
+    doc.add(field);
+    writer.addDocument(doc);
+    writer.close();
+    DirectoryReader ir = DirectoryReader.open(ramDir);
+    IndexSearcher is = new IndexSearcher(ir);
+
+    int hits = is.search(q, 10).totalHits;
+    ir.close();
+    ramDir.close();
+    if (hits == 1) {
+      return true;
     } else {
-        return false;
+      return false;
     }
-    }
-  }
-  final static class ASCIIAnalyzer extends Analyzer {
-    @Override
-    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
-      return new TokenStreamComponents(result, new FoldingFilter(result));
-    }
-  }
-
-  // LUCENE-4176
-  public void testByteTerms() throws Exception {
-    Directory ramDir = newDirectory();
-    Analyzer analyzer = new MockBytesAnalyzer();
-    RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
-    Document doc = new Document();
-    FieldType fieldType = new FieldType();
-    fieldType.setIndexed(true);
-    fieldType.setTokenized(true);
-    fieldType.setStored(true);
-    Field field = new Field("content","เข", fieldType);
-    doc.add(field);
-    writer.addDocument(doc);
-    writer.close();
-    DirectoryReader ir = DirectoryReader.open(ramDir);
-    IndexSearcher is = newSearcher(ir);
-    QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "content", analyzer);
-    Query q = qp.parse("[เข TO เข]");
-    assertEquals(1, is.search(q, 10).totalHits);
-    ir.close();
-    ramDir.close();
-  }
+  }
 }
\ No newline at end of file
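For orientation before the parser changes below, this is roughly what the revised tests exercise from a caller's point of view. A minimal sketch, not part of the patch; the Version.LUCENE_43 constant, the field name, and the query string are illustrative assumptions against the 4.x API shown in this diff:

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.Version;

    public class AnalyzingQueryParserDemo {
      public static void main(String[] args) throws ParseException {
        // Wildcard/prefix/fuzzy terms are passed through the analyzer;
        // StandardAnalyzer lower-cases them (it does not ASCII-fold).
        AnalyzingQueryParser qp = new AnalyzingQueryParser(
            Version.LUCENE_43, "field", new StandardAnalyzer(Version.LUCENE_43));
        qp.setAllowLeadingWildcard(true); // the new code path honors this flag
        Query q = qp.parse("Übersetz* OR Zellweger~2");
        System.out.println(q.toString("field")); // e.g. übersetz* zellweger~2
      }
    }
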
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java	(revision 1475838)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java	(working copy)
@@ -1,279 +1,194 @@
 package org.apache.lucene.queryparser.analyzing;
 
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Version;
 
-/**
- * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
- * are also passed through the given analyzer, but wild card characters (like *)
- * don't get removed from the search terms.
- * <p>
- * Warning: This class should only be used with analyzers that do not use stopwords
- * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
- * will turn Häuser into hau, but H?user will
- * become h?user when using this parser and thus no match would be found (i.e.
- * using this parser will be no improvement over QueryParser in such cases).
- *
- */
 public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
+  private final static String ESCAPE = "\\";
 
-  /**
-   * Constructs a query parser.
-   * @param field the default field for query terms.
-   * @param analyzer used to find terms in the query text.
-   */
-  public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
-    super(matchVersion, field, analyzer);
-    setAnalyzeRangeTerms(true);
-  }
+  //matches an escaped backslash, or a (possibly escaped) run of wildcard characters
+  private final Pattern wildcardPattern = Pattern.compile("(\\\\)|("+ESCAPE+ESCAPE+"?[?*]+)");
+  public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
+    super(matchVersion, field, analyzer);
+    setAnalyzeRangeTerms(true);
+  }
 
-  /**
-   * Called when parser
-   * parses an input term token that contains one or more wildcard
-   * characters (like *), but is not a prefix term token (one
-   * that has just a single * character at the end).
-   * <p>
-   * Example: will be called for H?user or for H*user
-   * but not for *user.
-   * <p>
-   * Depending on analyzer and settings, a wildcard term may (most probably will)
-   * be lower-cased automatically. It will go through the default Analyzer.
-   * <p>
-   * Overrides super class, by passing terms through analyzer.
-   *
-   * @param field Name of the field query will use.
-   * @param termStr Term token that contains one or more wild card
-   *   characters (? or *), but is not simple prefix term
-   *
-   * @return Resulting {@link Query} built for the term
-   */
-  @Override
-  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
-    List<String> tlist = new ArrayList<String>();
-    List<String> wlist = new ArrayList<String>();
-    /* somewhat a hack: find/store wildcard chars
-     * in order to put them back after analyzing */
-    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
-    StringBuilder tmpBuffer = new StringBuilder();
-    char[] chars = termStr.toCharArray();
-    for (int i = 0; i < termStr.length(); i++) {
-      if (chars[i] == '?' || chars[i] == '*') {
-        if (isWithinToken) {
-          tlist.add(tmpBuffer.toString());
-          tmpBuffer.setLength(0);
-        }
-        isWithinToken = false;
-      } else {
-        if (!isWithinToken) {
-          wlist.add(tmpBuffer.toString());
-          tmpBuffer.setLength(0);
-        }
-        isWithinToken = true;
-      }
-      tmpBuffer.append(chars[i]);
-    }
-    if (isWithinToken) {
-      tlist.add(tmpBuffer.toString());
-    } else {
-      wlist.add(tmpBuffer.toString());
-    }
-
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source;
-
-    int countTokens = 0;
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      source.reset();
-    } catch (IOException e1) {
-      throw new RuntimeException(e1);
-    }
-    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-    while (true) {
-      try {
-        if (!source.incrementToken()) break;
-      } catch (IOException e) {
-        break;
-      }
-      String term = termAtt.toString();
-      if (!"".equals(term)) {
-        try {
-          tlist.set(countTokens++, term);
-        } catch (IndexOutOfBoundsException ioobe) {
-          countTokens = -1;
-        }
-      }
-    }
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (countTokens != tlist.size()) {
-      /* this means that the analyzer used either added or consumed
-       * (common for a stemmer) tokens, and we can't build a WildcardQuery */
-      throw new ParseException("Cannot build WildcardQuery with analyzer "
-          + getAnalyzer().getClass() + " - tokens added or lost");
-    }
-
-    if (tlist.size() == 0) {
-      return null;
-    } else if (tlist.size() == 1) {
-      if (wlist != null && wlist.size() == 1) {
-        /* if wlist contains one wildcard, it must be at the end, because:
-         * 1) wildcards are not allowed in 1st position of a term by QueryParser
-         * 2) if wildcard was *not* in end, there would be *two* or more tokens */
-        return super.getWildcardQuery(field, tlist.get(0)
-            + wlist.get(0).toString());
-      } else {
-        /* we should never get here! if so, this method was called
-         * with a termStr containing no wildcard ... */
-        throw new IllegalArgumentException("getWildcardQuery called without wildcard");
-      }
-    } else {
-      /* the term was tokenized, let's rebuild to one token
-       * with wildcards put back in postion */
-      StringBuilder sb = new StringBuilder();
-      for (int i = 0; i < tlist.size(); i++) {
-        sb.append( tlist.get(i));
-        if (wlist != null && wlist.size() > i) {
-          sb.append(wlist.get(i));
-        }
-      }
-      return super.getWildcardQuery(field, sb.toString());
-    }
-  }
+  /**
+   * Called when parser
+   * parses an input term token that contains one or more wildcard
+   * characters (like *), but is not a prefix term token (one
+   * that has just a single * character at the end).
+   * <p>
+   * Example: will be called for H?user or for H*user
+   *
+   * <p>
+   * Depending on analyzer and settings, a wildcard term may (most probably will)
+   * be lower-cased automatically. It will go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer.
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token that contains one or more wild card
+   *   characters (? or *), but is not simple prefix term
+   *
+   * @return Resulting {@link Query} built for the term
+   */
+  @Override
+  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
+
+    if (termStr == null) {
+      //can't imagine this would ever happen
+      throw new ParseException("Passed null value as term to getWildcardQuery");
+    }
+    if (getAllowLeadingWildcard() == false &&
+        (termStr.startsWith("*") || termStr.startsWith("?"))) {
+      throw new ParseException(
+          "'*' or '?' not allowed as first character in WildcardQuery");
+    }
+
+    Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
+    StringBuilder sb = new StringBuilder();
+    int last = 0;
+
+    while (wildcardMatcher.find()) {
+      //if escaped backslash or an escaped wildcard character, continue
+      if (wildcardMatcher.group(1) != null || wildcardMatcher.group(2).startsWith(ESCAPE)) {
+        continue;
+      }
+
+      if (wildcardMatcher.start() > 0) {
+        String chunk = termStr.substring(last, wildcardMatcher.start());
+        String analyzed = analyzeSingleChunk(field, termStr, chunk);
+        sb.append(analyzed);
+      }
+      //append the wildcard character
+      sb.append(wildcardMatcher.group(2));
+
+      last = wildcardMatcher.end();
+    }
+    if (last < termStr.length()) {
+      sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
+    }
+    return super.getWildcardQuery(field, sb.toString());
+  }
 
-  /**
-   * Called when parser parses an input term
-   * token that uses prefix notation; that is, contains a single '*' wildcard
-   * character as its last character. Since this is a special case
-   * of generic wildcard term, and such a query can be optimized easily,
-   * this usually results in a different query object.
-   * <p>
-   * Depending on analyzer and settings, a prefix term may (most probably will)
-   * be lower-cased automatically. It will go through the default Analyzer.
-   * <p>
-   * Overrides super class, by passing terms through analyzer.
-   *
-   * @param field Name of the field query will use.
-   * @param termStr Term token to use for building term for the query
-   *    (without trailing '*' character!)
-   *
-   * @return Resulting {@link Query} built for the term
-   */
-  @Override
-  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source;
-    List<String> tlist = new ArrayList<String>();
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      source.reset();
-    } catch (IOException e1) {
-      throw new RuntimeException(e1);
-    }
-    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-    while (true) {
-      try {
-        if (!source.incrementToken()) break;
-      } catch (IOException e) {
-        break;
-      }
-      tlist.add(termAtt.toString());
-    }
-
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (tlist.size() == 1) {
-      return super.getPrefixQuery(field, tlist.get(0));
-    } else {
-      /* this means that the analyzer used either added or consumed
-       * (common for a stemmer) tokens, and we can't build a PrefixQuery */
-      throw new ParseException("Cannot build PrefixQuery with analyzer "
-          + getAnalyzer().getClass()
-          + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
-    }
-  }
+  /**
+   * Called when parser parses an input term
+   * token that uses prefix notation; that is, contains a single '*' wildcard
+   * character as its last character. Since this is a special case
+   * of generic wildcard term, and such a query can be optimized easily,
+   * this usually results in a different query object.
+   * <p>
+   * Depending on analyzer and settings, a prefix term may (most probably will)
+   * be lower-cased automatically. It will go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer.
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token to use for building term for the query
+   *    (without trailing '*' character!)
+   *
+   * @return Resulting {@link Query} built for the term
+   */
+  @Override
+  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
+    String analyzed = analyzeSingleChunk(field, termStr, termStr);
+    return super.getPrefixQuery(field, analyzed);
+  }
 
-  /**
-   * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
-   * <p>
-   * Depending on analyzer and settings, a fuzzy term may (most probably will)
-   * be lower-cased automatically. It will go through the default Analyzer.
-   * <p>
-   * Overrides super class, by passing terms through analyzer.
-   *
-   * @param field Name of the field query will use.
-   * @param termStr Term token to use for building term for the query
-   *
-   * @return Resulting {@link Query} built for the term
-   */
-  @Override
-  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
-      throws ParseException {
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source = null;
-    String nextToken = null;
-    boolean multipleTokens = false;
-
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-      source.reset();
-      if (source.incrementToken()) {
-        nextToken = termAtt.toString();
-      }
-      multipleTokens = source.incrementToken();
-    } catch (IOException e) {
-      nextToken = null;
-    }
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (multipleTokens) {
-      throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
-          + " - tokens were added");
-    }
-
-    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
-  }
+  /**
+   * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
+   * <p>
+   * Depending on analyzer and settings, a fuzzy term may (most probably will)
+   * be lower-cased automatically. It will go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer.
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token to use for building term for the query
+   *
+   * @return Resulting {@link Query} built for the term
+   */
+  @Override
+  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
+      throws ParseException {
+
+    String analyzed = analyzeSingleChunk(field, termStr, termStr);
+    return super.getFuzzyQuery(field, analyzed, minSimilarity);
+  }
+
+  protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
+    //This assumes that the chunk will not be split by the analyzer.
+    //If the chunk is split, a ParseException is thrown.
+    String analyzed = null;
+    String extraChunk = null;
+    TokenStream stream = null;
+    int chunkletCount = 0;
+    boolean incremented = false;
+    try {
+      stream = getAnalyzer().tokenStream(field, new StringReader(chunk));
+      stream.reset();
+      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+      //get first and only chunklet?
+      incremented = stream.incrementToken();
+      if (incremented) {
+        analyzed = termAtt.toString();
+        chunkletCount = 1;
+      } else {
+        //nothing returned by the analyzer. Was it a stop word and did the user
+        //accidentally use an analyzer with stop words?
+        //chunkletCount stays 0
+      }
+      if (incremented) {
+        //try to increment again; this should have been only one chunk!
+        incremented = stream.incrementToken();
+        if (incremented) {
+          extraChunk = termAtt.toString();
+          chunkletCount = 2;
+          //finish off stream
+          while (stream.incrementToken()) {}
+        }
+      }
+
+      stream.end();
+    } catch (IOException e) {
+      throw new ParseException(
+          String.format(super.getLocale(), "IO error while trying to analyze single term: %s",
+              termStr));
+    } finally {
+      if (stream != null) {
+        try {
+          stream.close();
+        } catch (IOException e) {
+          throw new ParseException(String.format(super.getLocale(), "IO error while trying to close stream after analyzing: %s",
+              termStr));
+        }
+      }
+    }
+    if (chunkletCount == 0) {
+      throw new ParseException(String.format(super.getLocale(),
+          "Analyzer returned nothing for %s", chunk));
+    } else if (chunkletCount > 1) {
+      throw new ParseException(
+          String.format(super.getLocale(),
+              "Analyzer created multiple terms for what should be a single term: \"%s\" and \"%s\"",
+              analyzed, extraChunk));
+    }
+    return analyzed;
+  }
 }
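The heart of the new getWildcardQuery() above is its split/analyze/reassemble loop: the text between wildcard characters is run through the analyzer chunk by chunk, and the wildcards are spliced back in unanalyzed. A self-contained sketch of that control flow follows; the regex is copied from the patch, while the analyzeChunk() lower-casing stand-in for analyzeSingleChunk() is an assumption for illustration only:

    import java.util.Locale;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class WildcardChunkDemo {
      private static final String ESCAPE = "\\";
      // same pattern as the patch: group 1 is an escaped backslash,
      // group 2 is an optionally-escaped run of wildcard characters
      private static final Pattern WILDCARD =
          Pattern.compile("(\\\\)|(" + ESCAPE + ESCAPE + "?[?*]+)");

      // stand-in for analyzeSingleChunk(): here we only lower-case
      private static String analyzeChunk(String chunk) {
        return chunk.toLowerCase(Locale.ROOT);
      }

      public static void main(String[] args) {
        String termStr = "Mötl?*";
        Matcher m = WILDCARD.matcher(termStr);
        StringBuilder sb = new StringBuilder();
        int last = 0;
        while (m.find()) {
          // skip escaped backslashes and escaped wildcards
          if (m.group(1) != null || m.group(2).startsWith(ESCAPE)) {
            continue;
          }
          if (m.start() > 0) {
            sb.append(analyzeChunk(termStr.substring(last, m.start())));
          }
          sb.append(m.group(2)); // splice the wildcards back in, unanalyzed
          last = m.end();
        }
        if (last < termStr.length()) {
          sb.append(analyzeChunk(termStr.substring(last)));
        }
        System.out.println(sb); // prints "mötl?*"
      }
    }
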
Index: lucene/queryparser/build.xml
===================================================================
--- lucene/queryparser/build.xml	(revision 1475838)
+++ lucene/queryparser/build.xml	(working copy)
@@ -25,6 +25,8 @@
+
+
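As context for the "returned nothing" ParseException exercised by testSingleChunkExceptions() above, here is a hedged sketch of the stop-word pitfall the new code reports explicitly. The Version constant is an assumption for the 4.x line, and the printed message follows from the format string built in analyzeSingleChunk():

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.util.Version;

    public class StopWordPitfallDemo {
      public static void main(String[] args) {
        // StandardAnalyzer's default stop set contains "the", so the chunk
        // before the '*' analyzes to nothing and the parser rejects the query.
        AnalyzingQueryParser qp = new AnalyzingQueryParser(
            Version.LUCENE_43, "field", new StandardAnalyzer(Version.LUCENE_43));
        try {
          qp.parse("the*tre");
        } catch (ParseException expected) {
          System.out.println(expected.getMessage()); // "Analyzer returned nothing for the"
        }
      }
    }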