Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/span/AnalyzingSpanQueryParserTest.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/span/AnalyzingSpanQueryParserTest.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/span/AnalyzingSpanQueryParserTest.java (revision 0) @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queryparser.span; + +import java.io.IOException; +import java.io.Reader; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.span.NormMultiTerm; +import org.apache.lucene.queryparser.span.SpanQueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +/** + * Adopted/Plagiarized largely from SpanQueryParser tests. 
+ * + */ +public class AnalyzingSpanQueryParserTest extends LuceneTestCase { + + private static final String O_UMLAUT_S = "\u00F6"; + private static final char O_UMLAUT_C = '\u00F6'; + private static final String FIELD = "field"; + + private Analyzer a; + private Map wildcardEscapeHits = new TreeMap(); + private Map wildcardEscapeMisses = new TreeMap(); + + @Override + public void setUp() throws Exception { + super.setUp(); + + wildcardEscapeHits.put("m" + O_UMLAUT_S + "tley", "motley"); + + wildcardEscapeHits.put("m" + O_UMLAUT_S + "*tley", "moatley"); + + // need to have at least one genuine wildcard to trigger the wildcard + // analysis + // hence the * before the y + wildcardEscapeHits.put("m" + O_UMLAUT_S + "\\*tl*y", "mo*tley"); + + // escaped backslash then true wildcard + wildcardEscapeHits.put("m" + O_UMLAUT_S + "\\\\*tley", "mo\\atley"); + + // escaped wildcard then true wildcard + wildcardEscapeHits.put("m" + O_UMLAUT_S + "\\??ley", "mo?tley"); + + // the first is an escaped * which should yield a miss + wildcardEscapeMisses.put("m" + O_UMLAUT_S + "\\*tl*y", "moatley"); + + a = new ToyASCIIAnalyzer(); + } + + @Test + public void testWildCardEscapes() throws ParseException, IOException { + + for (Map.Entry entry : wildcardEscapeHits.entrySet()) { + Query q = getAnalyzedQuery(entry.getKey(), a, false); + assertEquals("WildcardEscapeHits: " + entry.getKey(), true, + isAHit(q, entry.getValue(), a)); + } + for (Map.Entry entry : wildcardEscapeMisses.entrySet()) { + Query q = getAnalyzedQuery(entry.getKey(), a, false); + assertEquals("WildcardEscapeMisses: " + entry.getKey(), false, + isAHit(q, entry.getValue(), a)); + } + + } + + final static class FoldingFilter extends TokenFilter { + final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + public FoldingFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char term[] = termAtt.buffer(); + for (int i = 0; i < term.length; i++) + switch (term[i]) { + case O_UMLAUT_C: + term[i] = 'o'; + break; + } + return true; + } else { + return false; + } + } + } + + final static class ToyASCIIAnalyzer extends Analyzer { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, + true); + return new TokenStreamComponents(result, new FoldingFilter(result)); + } + } + + private SpanQuery getAnalyzedQuery(String s, Analyzer a, + boolean allowLeadingWildcard) throws ParseException { + SpanQueryParser qp = new SpanQueryParser(FIELD, a); + qp.setNormMultiTerm(NormMultiTerm.ANALYZE); + qp.setAllowLeadingWildcard(allowLeadingWildcard); + SpanQuery q = qp.parse(s); + return q; + } + + private boolean isAHit(Query q, String content, Analyzer analyzer) + throws IOException { + Directory ramDir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer); + Document doc = new Document(); + FieldType fieldType = new FieldType(); + fieldType.setIndexed(true); + fieldType.setTokenized(true); + fieldType.setStored(true); + Field field = new Field(FIELD, content, fieldType); + doc.add(field); + writer.addDocument(doc); + writer.close(); + DirectoryReader ir = DirectoryReader.open(ramDir); + IndexSearcher is = new IndexSearcher(ir); + + int hits = is.search(q, 10).totalHits; + ir.close(); + ramDir.close(); + if (hits == 1) { + return true; + } else { + return false; + } + } + +} Index: 
lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserTest.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserTest.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserTest.java (revision 0) @@ -0,0 +1,806 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queryparser.span; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.io.IOException; +import java.io.Reader; + +import static org.apache.lucene.util.automaton.BasicAutomata.makeString; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.span.NormMultiTerm; +import org.apache.lucene.queryparser.span.SpanQueryParser; +import org.apache.lucene.sandbox.queries.SlowFuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.junit.AfterClass; +import 
org.junit.BeforeClass; +import org.junit.Test; + +public class SpanQueryParserTest extends LuceneTestCase { + + private static IndexReader reader; + private static IndexSearcher searcher; + private static Directory directory; + private static Analyzer stopAnalyzer; + private static Analyzer noStopAnalyzer; + private static final String FIELD = "field"; + + private static final CharacterRunAutomaton STOP_WORDS = new CharacterRunAutomaton( + BasicOperations.union(Arrays.asList(makeString("a"), makeString("an"), + makeString("and"), makeString("are"), makeString("as"), + makeString("at"), makeString("be"), makeString("but"), + makeString("by"), makeString("for"), makeString("if"), + makeString("in"), makeString("into"), makeString("is"), + makeString("it"), makeString("no"), makeString("not"), + makeString("of"), makeString("on"), makeString("or"), + makeString("such"), makeString("that"), makeString("the"), + makeString("their"), makeString("then"), makeString("there"), + makeString("these"), makeString("they"), makeString("this"), + makeString("to"), makeString("was"), makeString("will"), + makeString("with"), makeString("\u5927")))); + + @BeforeClass + public static void beforeClass() throws Exception { + + noStopAnalyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, + true); + TokenFilter filter = new MockStandardTokenizerFilter(tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + stopAnalyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, + true); + TokenFilter filter = new MockStandardTokenizerFilter(tokenizer); + filter = new MockTokenFilter(filter, STOP_WORDS); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, stopAnalyzer) + .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000)) + .setMergePolicy(newLogMergePolicy())); + String[] docs = new String[] { + "the quick brown fox ", + "jumped over the lazy brown dog and the brown green cat", + "quick green fox", + "abcdefghijk", + "over green lazy", + // longish doc for recursion test + "eheu fugaces postume postume labuntur anni nec " + + "pietas moram rugis et instanti senectae " + + "adferet indomitaeque morti", + // non-whitespace language + "\u666E \u6797 \u65AF \u987F \u5927 \u5B66" }; + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(newTextField(FIELD, docs[i], Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + searcher = new IndexSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + reader = null; + directory = null; + stopAnalyzer = null; + noStopAnalyzer = null; + } + + @Test + public void testBasic() throws Exception { + + SpanQueryParser p = new SpanQueryParser(FIELD, stopAnalyzer); + + // test null and empty + countSpansDocs(p, null, 0, 0); + countSpansDocs(p, "", 0, 0); + + countSpansDocs(p, "brown", 3, 2); + + } + + @Test + public void testNear() throws Exception { + SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer); + + boolean exc = false; + + try { + SpanQuery q = 
p.parse("\"brown \"dog\""); + } catch (ParseException e) { + exc = true; + } + assertEquals("unmatching \"", true, exc); + + exc = false; + try { + SpanQuery q = p.parse("[brown [dog]"); + } catch (ParseException e) { + exc = true; + } + assertEquals("unmatched [", true, exc); + + testOffsetForSingleSpanMatch(p, "\"brown dog\"", 1, 4, 6); + + countSpansDocs(p, "\"lazy dog\"", 0, 0); + + testOffsetForSingleSpanMatch(p, "\"lazy dog\"~2", 1, 3, 6); + + testOffsetForSingleSpanMatch(p, "\"lazy dog\"~>2", 1, 3, 6); + + testOffsetForSingleSpanMatch(p, "\"dog lazy\"~2", 1, 3, 6); + + countSpansDocs(p, "\"dog lazy\"~>2", 0, 0); + + testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~10", 1, 3, 11); + + testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~>10", 1, 3, 11); + + countSpansDocs(p, "[cat \"lazy dog\"~>2]~>10", 0, 0); + + // shows that "intervening" for multiple terms is additive + // 3 includes "over the" and "brown" + testOffsetForSingleSpanMatch(p, "[jumped lazy dog]~3", 1, 0, 6); + + // only two words separate each hit, but together, the intervening words > 2 + countSpansDocs(p, "[jumped lazy dog]~2", 0, 0); + + } + + @Test + public void testNotNear() throws Exception { + SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer); + boolean exc = false; + try { + SpanQuery q = p.parse("\"brown dog car\"!~2,2"); + } catch (ParseException e) { + exc = true; + } + assertEquals("must have 2 components", true, exc); + + countSpansDocs(p, "\"brown dog\"!~2,2", 2, 2); + + testOffsetForSingleSpanMatch(p, "\"brown (green dog)\"!~1,1", 0, 2, 3); + + countSpansDocs(p, "\"brown (cat dog)\"!~1,1", 2, 2); + + countSpansDocs(p, "\"brown (quick lazy)\"!~0,4", 3, 2); + + countSpansDocs(p, "\"brown quick\"!~1,4", 2, 1); + + testOffsetForSingleSpanMatch(p, "\"brown (quick lazy)\"!~1,4", 1, 8, 9); + + // test empty + countSpansDocs(p, "\"z y\"!~0,4", 0, 0); + + testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~1,1", 2, 0, 3); + + // traditional SpanNotQuery + testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~", 2, 0, 3); + + } + + @Test + public void testWildcard() throws Exception { + SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer); + boolean exc = false; + + try { + SpanQuery q = p.parse("*og"); + } catch (ParseException e) { + exc = true; + } + assertEquals("no leading wildcards \"", true, exc); + p.setAllowLeadingWildcard(true); + + // lowercasing as default + testOffsetForSingleSpanMatch(p, "*OG", 1, 5, 6); + + p.setNormMultiTerm(NormMultiTerm.NO_NORM); + + countSpansDocs(p, "*OG", 0, 0); + + testOffsetForSingleSpanMatch(p, "*og", 1, 5, 6); + testOffsetForSingleSpanMatch(p, "?og", 1, 5, 6); + + // brown dog and brown fox + countSpansDocs(p, "[brown ?o?]", 2, 2); + countSpansDocs(p, "[br* ?o?]", 2, 2); + } + + @Test + public void testPrefix() throws Exception { + SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer); + + // lowercasing as default + countSpansDocs(p, "BR*", 3, 2); + + countSpansDocs(p, "br*", 3, 2); + + p.setNormMultiTerm(NormMultiTerm.NO_NORM); + countSpansDocs(p, "BR*", 0, 0); + + // not actually a prefix query + countSpansDocs(p, "br?", 0, 0); + + p.setAllowLeadingWildcard(true); + countSpansDocs(p, "*", 39, 7); + + } + + @Test + public void testRegex() throws Exception { + SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer); + SpanQuery q; + boolean exc = false; + try { + q = p.parse("/brown"); + } catch (ParseException e) { + exc = true; + } + assertEquals("mismatching", true, exc); + exc = false; + + countSpansDocs(p, 
"/b[wor]+n/", 3, 2); + countSpansDocs(p, " /b[wor]+n/ ", 3, 2); + + testOffsetForSingleSpanMatch(p, " [/b[wor]+n/ fox]", 0, 2, 4); + + try { + q = p.parse("[/b[wor]+n/fox]"); + } catch (ParseException e) { + exc = true; + } + assertEquals("must have space after regex", true, exc); + exc = false; + + countSpansDocs(p, " [/b[wor]+n/ (fox dog)]", 2, 2); + + // not lower-casing or normalizing in regex!!! + countSpansDocs(p, "/B[wor]+n/", 0, 0); + + } + + @Test + public void testFuzzy() throws Exception { + SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer); + + countSpansDocs(p, "bruun~", 3, 2); + countSpansDocs(p, "bruun~2", 3, 2); + countSpansDocs(p, "abcdefgh~3", 0, 0); + + p.setFuzzyMaxEdits(3); + testOffsetForSingleSpanMatch(p, "abcdefgh~3", 3, 0, 1); + + // default lowercasing + testOffsetForSingleSpanMatch(p, "Abcdefgh~3", 3, 0, 1); + + p.setNormMultiTerm(NormMultiTerm.NO_NORM); + countSpansDocs(p, "Abcdefgh~3", 0, 0); + } + + @Test + public void testStopWords() throws Exception { + // Stop word handling has some room for improvement with SpanQuery + // These tests codify the expectations (for regular behavior, + // parse exceptions and false hits) as of this writing. + + SpanQueryParser p = new SpanQueryParser(FIELD, stopAnalyzer); + + countSpansDocs(p, "the", 0, 0); + + // these are whittled down to just a query for brown + countSpansDocs(p, "[the brown]", 3, 2); + + countSpansDocs(p, "(the brown)", 3, 2); + + countSpansDocs(p, "[brown the]!~5,5", 3, 2); + + // this should be whittled to a query for "the" + countSpansDocs(p, "[the brown]!~5,5", 0, 0); + + // this will not match because "the" is silently dropped from the query + countSpansDocs(p, "[over the lazy]", 0, 0); + + // this will get one right hit, but incorrectly match "over green lazy" + countSpansDocs(p, "[over the lazy]~1", 2, 2); + + // test throw exception + p.setThrowExceptionForStopWord(true); + boolean exc = false; + try { + countSpansDocs(p, "the", 0, 0); + } catch (ParseException e) { + exc = true; + } + assertEquals("the", true, exc); + + exc = false; + try { + countSpansDocs(p, "[the brown]", 0, 0); + } catch (ParseException e) { + exc = true; + } + assertEquals("[the brown]", true, exc); + + exc = false; + try { + countSpansDocs(p, "(the brown)", 0, 0); + } catch (ParseException e) { + exc = true; + } + assertEquals("(the brown)", true, exc); + + exc = false; + try { + countSpansDocs(p, "[the brown]!~2,2", 0, 0); + } catch (ParseException e) { + exc = true; + } + assertEquals("[the brown]!~2,2", true, exc); + + exc = false; + try { + countSpansDocs(p, "[brown the]!~2,2", 0, 0); + } catch (ParseException e) { + exc = true; + } + assertEquals("[brown the]!~2,2", true, exc); + + // add tests for surprise phrasal with stopword!!! 
chinese + + SpanQueryParser noStopsParser = new SpanQueryParser(FIELD, noStopAnalyzer); + // won't match because stop word was dropped in index + countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); + // won't match for same reason + countSpansDocs(noStopsParser, "[\u666E\u6797\u65AF\u987F\u5927\u5B66]~2", + 0, 0); + + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E \u6797 \u65AF \u987F \u5B66]~2", 6, 0, 6); + + } + + @Test + public void testNonWhiteSpaceLanguage() throws Exception { + SpanQueryParser noStopsParser = new SpanQueryParser(FIELD, noStopAnalyzer); + + testOffsetForSingleSpanMatch(noStopsParser, "\u666E", 6, 0, 1); + + // default autogenerate phrase queries = true + testOffsetForSingleSpanMatch(noStopsParser, "\u666E\u6797", 6, 0, 2); + + // this would have a hit if autogenerate phrase queries = false + countSpansDocs(noStopsParser, "\u666E\u65AF", 0, 0); + + // treat as "or", this should have two spans + countSpansDocs(noStopsParser, "\u666E \u65AF", 2, 1); + + // stop word removed at indexing time and non existent here, + // this is treated as an exact phrase and should not match + countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5B66", 0, 0); + + // this should be the same as above + countSpansDocs(noStopsParser, "[\u666E \u6797 \u65AF \u987F \u5B66]~0", 0, + 0); + + // look for the same phrase but allow for some slop; this should have one + // hit because this will skip the stop word + + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E \u6797 \u65AF \u987F \u5B66]~1", 6, 0, 6); + + // This tests the #specialHandlingForSpanNearWithOneComponent + // this is initially treated as [ [\u666E\u6797\u65AF\u987F\u5B66]~>0 ]~2 + // with the special treatment, this is rewritten as + // [\u666E \u6797 \u65AF \u987F \u5B66]~1 + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E\u6797\u65AF\u987F\u5B66]~1", 6, 0, 6); + + // this would be the English equivalent, which is technically wrong. + // I went with this method under the belief that a Chinese speaker + // is much more likely to write the above and want this behavior + // than an English speaker is likely to test this silly edge case. + testOffsetForSingleSpanMatch(noStopsParser, "[[lazy dog] ]~4", 1, 3, 6); + + noStopsParser.setAutoGeneratePhraseQueries(false); + + // characters split into 2 tokens and treated as an "or" query + countSpansDocs(noStopsParser, "\u666E\u65AF", 2, 1); + + // TODO: Not sure i like how this behaves. 
+ // this is treated as [(\u666E \u6797 \u65AF \u987F \u5B66)]~2 + // which is then simplified to just: (\u666E \u6797 \u65AF \u987F \u5B66) + // Probably better to be treated as [\u666E \u6797 \u65AF \u987F \u5B66]~2 + + testOffsetForSingleSpanMatch(noStopsParser, + "[\u666E\u6797\u65AF\u987F\u5B66]~1", 6, 0, 6); + + SpanQueryParser stopsParser = new SpanQueryParser(FIELD, stopAnalyzer); + countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); + + // now test for throwing of exception + stopsParser.setThrowExceptionForStopWord(true); + boolean exc = false; + try { + countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); + } catch (ParseException e) { + exc = true; + } + assertEquals(true, exc); + } + + @Test + public void testRecursion() throws Exception { + /* + * For easy reference of expected offsets + * + * 0: eheu 1: fugaces 2: postume 3: postume 4: labuntur 5: anni 6: nec 7: + * pietas 8: moram 9: rugis 10: et 11: instanti 12: senectae 13: adferet 14: + * indomitaeque 15: morti + */ + SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer); + + // String q = "[labunt* [pietas [rug?s senec*]!~2,0 ]~4 adferet]~5"; + // String q = "[pietas [rug?s senec*]!~2,0 ]~4"; + // countSpansDocs(p, q, 1, 1); + + // Span extents end at one more than the actual end, e.g.: + String q = "fugaces"; + testOffsetForSingleSpanMatch(p, q, 5, 1, 2); + + q = "morti"; + testOffsetForSingleSpanMatch(p, q, 5, 15, 16); + + q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~2"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // not near query for rugis senectae + q = "[labunt* [pietas [rug?s senec*]!~2 ]~4 adferet]~2"; + countSpansDocs(p, q, 0, 0); + + // not near query for rugis senectae, 0 before or 2 after + // Have to extend overall distance to 5 because hit for + // "rug?s senec*" matches only "rug?s" now + q = "[labunt* [pietas [rug?s senec*]!~2,0 ]~4 adferet]~5"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // not near query for rugis senectae, 0 before or 2 intervening + q = "[labunt* [pietas [rug?s senec*]!~0,2 ]~4 adferet]~5"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // not near query for rugis senectae, 0 before or 3 intervening + q = "[labunt* [pietas [rug?s senec*]!~0,3 ]~4 adferet]~2"; + countSpansDocs(p, q, 0, 0); + + // directionality specified + q = "[labunt* [pietas [rug?s senec*]~>2 ]~>4 adferet]~>2"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // no directionality, query order inverted + q = "[adferet [ [senec* rug?s ]~2 pietas ]~4 labunt*]~2"; + testOffsetForSingleSpanMatch(p, q, 5, 4, 14); + + // more than one word intervenes btwn rugis and senectae + q = "[labunt* [pietas [rug?s senec*]~1 ]~4 adferet]~2"; + countSpansDocs(p, q, 0, 0); + + // more than one word intervenes btwn labuntur and pietas + q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~1"; + countSpansDocs(p, q, 0, 0); + } + + private void countSpansDocs(SpanQueryParser p, String s, int spanCount, + int docCount) throws Exception { + SpanQuery q = p.parse(s); + assertEquals("spanCount: " + s, spanCount, countSpans(q)); + assertEquals("docCount: " + s, docCount, countDocs(q)); + + } + + private long countSpans(SpanQuery q) throws Exception { + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + q = (SpanQuery) q.rewrite(ctx.reader()); + Spans spans = q.getSpans(ctx, null, new HashMap()); + + long i = 0; + while (spans.next()) { + i++; + } + return i; + } + + private long countDocs(SpanQuery q) throws 
Exception { + OpenBitSet docs = new OpenBitSet(); + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + IndexReaderContext parentCtx = reader.getContext(); + q = (SpanQuery) q.rewrite(ctx.reader()); + + Set qTerms = new HashSet(); + q.extractTerms(qTerms); + Map termContexts = new HashMap(); + + for (Term t : qTerms) { + TermContext c = TermContext.build(parentCtx, t); + termContexts.put(t, c); + } + + Spans spans = q.getSpans(ctx, null, termContexts); + + while (spans.next()) { + docs.set(spans.doc()); + } + long spanDocHits = docs.cardinality(); + // double check with a regular searcher + TotalHitCountCollector coll = new TotalHitCountCollector(); + searcher.search(q, coll); + assertEquals(coll.getTotalHits(), spanDocHits); + return spanDocHits; + + } + + private void testOffsetForSingleSpanMatch(SpanQueryParser p, String s, + int trueDocID, int trueSpanStart, int trueSpanEnd) throws Exception { + SpanQuery q = p.parse(s); + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + q = (SpanQuery) q.rewrite(ctx.reader()); + Spans spans = q.getSpans(ctx, null, new HashMap()); + + int i = 0; + int spanStart = -1; + int spanEnd = -1; + int docID = -1; + while (spans.next()) { + spanStart = spans.start(); + spanEnd = spans.end(); + docID = spans.doc(); + i++; + } + assertEquals("should only be one matching span", 1, i); + assertEquals("doc id", trueDocID, docID); + assertEquals("span start", trueSpanStart, spanStart); + assertEquals("span end", trueSpanEnd, spanEnd); + } + + /** + * tests the parser's ability to correctly identify and build an individual + * single/multi-term query + */ + @Test + public void testQueryTermTypeParserBasic() throws Exception { + Analyzer analyzer = new MockAnalyzer(new Random()); + SpanQueryParser p = new SpanQueryParser(FIELD, analyzer); + SpanQuery q = p.buildAnyTermQuery("/f.*/"); + Term t = new Term(FIELD, "f.*"); + SpanQuery ex = new SpanMultiTermQueryWrapper( + new RegexpQuery(t)); + assertEquals("regexp", ex, q); + + q = p.buildAnyTermQuery("fox"); + t = new Term(FIELD, "fox"); + ex = new SpanTermQuery(t); + assertEquals("basic term", ex, q); + + p.setFuzzyMinSim(0.6f); + q = p.buildAnyTermQuery("fox~0.8"); + t = new Term(FIELD, "fox"); + ex = new SpanMultiTermQueryWrapper(new SlowFuzzyQuery(t, + 0.8f)); + assertEquals("fuzzy", ex.toString(), q.toString()); + + // test rounding for fuzzy > 1.0 + p.setFuzzyMaxEdits(4); + q = p.buildAnyTermQuery("fox~3.3"); + t = new Term(FIELD, "fox"); + ex = new SpanMultiTermQueryWrapper(new SlowFuzzyQuery(t, + 3.0f)); + assertEquals("fuzzy", ex.toString(), q.toString()); + + q = p.buildAnyTermQuery("fo*"); + t = new Term(FIELD, "fo"); + assertEquals("prefix *", new SpanMultiTermQueryWrapper( + new PrefixQuery(t)), q); + + q = p.buildAnyTermQuery("fo?"); + t = new Term(FIELD, "fo?"); + assertEquals("prefix looking ?, but actually wildcard", + new SpanMultiTermQueryWrapper(new WildcardQuery(t)), q); + + q = p.buildAnyTermQuery("f*x"); + t = new Term(FIELD, "f*x"); + assertEquals("wildcard *", new SpanMultiTermQueryWrapper( + new WildcardQuery(t)), q); + + q = p.buildAnyTermQuery("f?x"); + t = new Term(FIELD, "f?x"); + assertEquals("wildcard ?", new SpanMultiTermQueryWrapper( + new WildcardQuery(t)), q); + + q = p.buildAnyTermQuery("f?x*"); + t = new Term(FIELD, "f?x*"); + assertEquals("wild card * and ?", + new SpanMultiTermQueryWrapper(new WildcardQuery(t)), q); + + boolean exc = false; + try { + q = 
p.buildAnyTermQuery("f*x~0.8"); + } catch (ParseException e) { + if (e + .getMessage() + .equals( + "Can't have a single term in a query that is both a wildcard and a fuzzy query")) { + exc = true; + } + } + assertTrue(exc); + + } + + /** + * tests the parser's ability to correctly identify and build an individual + * single/multi-term query with escaped characters + */ + public void testQueryTermTypeParserEscapes() throws Exception { + Analyzer analyzer = new MockAnalyzer(new Random()); + SpanQueryParser p = new SpanQueryParser(FIELD, analyzer); + + SpanQuery q = p.buildAnyTermQuery("fox\\~0.8"); + Term t = new Term(FIELD, "fox\\~0.8"); + assertEquals("fuzzy escaped, actually term", new SpanTermQuery(t), q); + + q = p.buildAnyTermQuery("f\\?x*"); + t = new Term(FIELD, "f\\?x"); + assertEquals("actually prefix", new SpanMultiTermQueryWrapper( + new PrefixQuery(t)), q); + + q = p.buildAnyTermQuery("f\\?x"); + t = new Term(FIELD, "f\\?x"); + assertEquals("escaped ?", new SpanTermQuery(t), q); + + q = p.buildAnyTermQuery("f\\*x"); + t = new Term(FIELD, "f\\*x"); + assertEquals("escaped *", new SpanTermQuery(t), q); + + } + + /** + * Mocks StandardAnalyzer for tokenizing Chinese characters (at least for + * these test cases into individual tokens). + * + */ + private final static class MockStandardTokenizerFilter extends TokenFilter { + // Only designed to handle test cases. You may need to modify this + // if adding new test cases. Note that position increment is hardcoded to be + // 1!!! + private final Pattern hackCJKPattern = Pattern + .compile("([\u5900-\u9899])|([\\p{InBasic_Latin}]+)"); + private List buffer = new LinkedList(); + + private final CharTermAttribute termAtt; + private final PositionIncrementAttribute posIncrAtt; + + public MockStandardTokenizerFilter(TokenStream in) { + super(in); + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + } + + @Override + public final boolean incrementToken() throws java.io.IOException { + if (buffer.size() > 0) { + termAtt.setEmpty().append(buffer.remove(0)); + posIncrAtt.setPositionIncrement(1); + return true; + } else { + boolean next = input.incrementToken(); + if (!next) { + return false; + } + // posIncrAtt.setPositionIncrement(1); + String text = termAtt.toString(); + Matcher m = hackCJKPattern.matcher(text); + boolean hasCJK = false; + while (m.find()) { + if (m.group(1) != null) { + hasCJK = true; + buffer.add(m.group(1)); + } else if (m.group(2) != null) { + buffer.add(m.group(2)); + } + } + if (hasCJK == false) { + // don't change the position increment, the super class will handle + // stop words properly + buffer.clear(); + return true; + } + if (buffer.size() > 0) { + termAtt.setEmpty().append(buffer.remove(0)); + posIncrAtt.setPositionIncrement(1); + } + return true; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + } + } +} Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserRewriteMethodTest.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserRewriteMethodTest.java (revision 0) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserRewriteMethodTest.java (revision 0) @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queryparser.span; + +import java.io.Reader; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.queryparser.span.SpanQueryParser; +import org.apache.lucene.sandbox.queries.SlowFuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util._TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class SpanQueryParserRewriteMethodTest extends LuceneTestCase { + + private static IndexReader reader; + private static IndexSearcher searcher; + private static Directory directory; + private static Analyzer stopAnalyzer; + private static final String FIELD = "field"; + + @BeforeClass + public static void beforeClass() throws Exception { + + stopAnalyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, + true); + TokenFilter filter = new MockTokenFilter(tokenizer, + MockTokenFilter.ENGLISH_STOPSET); + return new TokenStreamComponents(tokenizer, filter); + } + }; + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, stopAnalyzer) + .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000)) + .setMergePolicy(newLogMergePolicy())); + String[] docs = new String[] { "aaaaaaaaaaaaaaaaaaaaaaa", + "abaaaaaaaaaaaaaaaaaaaaa", "aabaaaaaaaaaaaaaaaaaaaa", + "aaabaaaaaaaaaaaaaaaaaaa", "aaaabaaaaaaaaaaaaaaaaaa", + "aaaaabaaaaaaaaaaaaaaaaa", "aaaaaabaaaaaaaaaaaaaaaa", + "aaaaaaabaaaaaaaaaaaaaaa", "aaaaaaaabaaaaaaaaaaaaaa", + "aaaaaaaaabaaaaaaaaaaaaa", "aaaaaaaaaabaaaaaaaaaaaa", + "aaaaaaaaaaabaaaaaaaaaaa", 
"aaaaaaaaaaaabaaaaaaaaaa", + "aaaaaaaaaaaaabaaaaaaaaa", "aaaaaaaaaaaaaabaaaaaaaa", + "aaaaaaaaaaaaaaabaaaaaaa", "aaaaaaaaaaaaaaaabaaaaaa", }; + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(newTextField(FIELD, docs[i], Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + searcher = new IndexSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + reader = null; + directory = null; + stopAnalyzer = null; + } + + @Test + public void testBasic() throws Exception { + + SpanQueryParser p = new SpanQueryParser(FIELD, stopAnalyzer); + int maxExpansions = 5; + // this works on prefix, wildcard, fuzzy and regex + // it has no effect on max number of boolean clauses in SpanOr + p.setMultiTermRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite( + 5)); + + countSpansDocs(p, "a*", 5, 5); + countSpansDocs(p, "a*a", 5, 5); + countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa~1", 5, 5); + countSpansDocs(p, "/a.*/", 5, 5); + countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa " + "abaaaaaaaaaaaaaaaaaaaaa " + + "aabaaaaaaaaaaaaaaaaaaaa " + "aaabaaaaaaaaaaaaaaaaaaa " + + "aaaabaaaaaaaaaaaaaaaaaa " + "aaaaabaaaaaaaaaaaaaaaaa " + + "aaaaaabaaaaaaaaaaaaaaaa " + "aaaaaaabaaaaaaaaaaaaaaa " + + "aaaaaaaabaaaaaaaaaaaaaa " + "aaaaaaaaabaaaaaaaaaaaaa", 10, 10); + + // this has no effect whatsoever as of this writing. + p.setMultiTermRewriteMethod(new SpanMultiTermQueryWrapper.TopTermsSpanBooleanQueryRewrite( + maxExpansions)); + + countSpansDocs(p, "a*", 17, 17); + countSpansDocs(p, "a*a", 17, 17); + + countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa~1", 17, 17); + countSpansDocs(p, "/a.*/", 17, 17); + countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa " + "abaaaaaaaaaaaaaaaaaaaaa " + + "aabaaaaaaaaaaaaaaaaaaaa " + "aaabaaaaaaaaaaaaaaaaaaa " + + "aaaabaaaaaaaaaaaaaaaaaa " + "aaaaabaaaaaaaaaaaaaaaaa " + + "aaaaaabaaaaaaaaaaaaaaaa " + "aaaaaaabaaaaaaaaaaaaaaa " + + "aaaaaaaabaaaaaaaaaaaaaa " + "aaaaaaaaabaaaaaaaaaaaaa", 10, 10); + + } + + private void countSpansDocs(SpanQueryParser p, String s, int spanCount, + int docCount) throws Exception { + SpanQuery q = p.parse(s); + assertEquals("spanCount: " + s, spanCount, countSpans(q)); + assertEquals("docCount: " + s, docCount, countDocs(q)); + + } + + private long countSpans(SpanQuery q) throws Exception { + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + q = (SpanQuery) q.rewrite(ctx.reader()); + + Spans spans = q.getSpans(ctx, null, new HashMap()); + + long i = 0; + while (spans.next()) { + i++; + } + return i; + } + + private long countDocs(SpanQuery q) throws Exception { + OpenBitSet docs = new OpenBitSet(); + List ctxs = reader.leaves(); + assert (ctxs.size() == 1); + AtomicReaderContext ctx = ctxs.get(0); + IndexReaderContext parentCtx = reader.getContext(); + q = (SpanQuery) q.rewrite(ctx.reader()); + + Set qTerms = new HashSet(); + q.extractTerms(qTerms); + Map termContexts = new HashMap(); + + for (Term t : qTerms) { + TermContext c = TermContext.build(parentCtx, t); + termContexts.put(t, c); + } + + Spans spans = q.getSpans(ctx, null, termContexts); + + while (spans.next()) { + docs.set(spans.doc()); + } + long spanDocHits = docs.cardinality(); + // double check with a regular searcher + TotalHitCountCollector coll = new TotalHitCountCollector(); + searcher.search(q, coll); + assertEquals(coll.getTotalHits(), spanDocHits); + return spanDocHits; + + } +} Index: 
lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfoBuilder.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfoBuilder.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfoBuilder.java (revision 0) @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.queryparser.span.clauses; + +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.span.clauses.ClauseInfo.TYPE; + +/** + * Used internally by SpanQueryParserUtil to build a clause + * + */ +public class ClauseInfoBuilder { + + public static ClauseInfo build(ClauseInfo.START_OR_END startOrEnd, TYPE type, + int start, int end) throws ParseException { + if (type.equals(TYPE.OR)) { + return new ClauseInfo(startOrEnd, start, end); + } else if (type.equals(TYPE.NEAR)) { + return new SpanNearClauseInfo(startOrEnd, start, end); + } else if (type.equals(TYPE.NOT_NEAR)) { + return new SpanNotNearClauseInfo(startOrEnd, start, end); + } + throw new ParseException(String.format( + "I'm sorry, but I don't recognize this type: %s", type)); + } + + public static ClauseInfo build(ClauseInfo.START_OR_END startOrEnd, int start, + int end, int slop, boolean inOrder) { + return new SpanNearClauseInfo(startOrEnd, start, end, slop, inOrder); + } + + public static ClauseInfo build(ClauseInfo.START_OR_END startOrEnd, int start, + int end, int pre, int post) { + return new SpanNotNearClauseInfo(startOrEnd, start, end, pre, post); + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNearClauseInfo.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNearClauseInfo.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNearClauseInfo.java (revision 0) @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.queryparser.span.clauses; + +public class SpanNearClauseInfo extends ClauseInfo { + private final static TYPE type = TYPE.NEAR; + private final static int DEFAULT_SLOP = 0; + private final static boolean DEFAULT_INORDER = true; + + private final boolean inOrder; + private final int slop; + + public SpanNearClauseInfo(START_OR_END which, int start, int end) { + super(which, start, end); + this.inOrder = DEFAULT_INORDER; + this.slop = DEFAULT_SLOP; + } + + public SpanNearClauseInfo(START_OR_END which, int start, int end, int slop, + boolean inOrder) { + super(which, start, end); + this.slop = slop; + this.inOrder = inOrder; + } + + public int getSlop() { + return slop; + } + + public boolean getInOrder() { + return inOrder; + } + + public TYPE getType() { + return type; + } +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNotNearClauseInfo.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNotNearClauseInfo.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNotNearClauseInfo.java (revision 0) @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.queryparser.span.clauses; + +public class SpanNotNearClauseInfo extends ClauseInfo { + private final static int DEFAULT_BEFORE = 0; + private final static int DEFAULT_AFTER = 0; + + private final static TYPE type = TYPE.NOT_NEAR; + + private final int before; + private final int after; + + public SpanNotNearClauseInfo(START_OR_END which, int start, int end) { + super(which, start, end); + this.before = DEFAULT_BEFORE; + this.after = DEFAULT_AFTER; + } + + public SpanNotNearClauseInfo(START_OR_END which, int start, int end, int slop) { + super(which, start, end); + this.before = slop; + this.after = slop; + } + + public SpanNotNearClauseInfo(START_OR_END which, int start, int end, + int before, int after) { + super(which, start, end); + this.before = before; + this.after = after; + } + + public int getBefore() { + return before; + } + + public int getAfter() { + return after; + } + + public TYPE getType() { + return type; + } + +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfo.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfo.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfo.java (revision 0) @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queryparser.span.clauses; + +/** + * Type of clause: or, near or not near + * + */ +public class ClauseInfo { + public enum START_OR_END { + START, END + }; + + public enum TYPE { + OR, // spanor + NEAR, // spannear + NOT_NEAR // spannotnear + }; + + private final int start; + private final int end; + private final START_OR_END startOrEnd; + private final static TYPE type = TYPE.OR; + + public ClauseInfo(START_OR_END which, int start, int end) { + this.startOrEnd = which; + this.start = start; + this.end = end; + } + + public TYPE getType() { + return type; + } + + public int getEnd() { + return end; + } + + public int getStart() { + return start; + } + + public START_OR_END getStartOrEnd() { + return startOrEnd; + } + + public static boolean matchTypes(TYPE startType, TYPE endType) { + // this is effectively directional! 
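+ // i.e. a NEAR start marker can be closed by a NEAR or a NOT_NEAR end marker,
+ // but a NOT_NEAR start marker only by a NOT_NEAR end marker, and an OR start
+ // marker only by an OR end marker (see the conditions below).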
+ if (startType.equals(endType)) { + return true; + } + if (startType.equals(TYPE.NEAR) + && (endType.equals(TYPE.NEAR) || endType.equals(TYPE.NOT_NEAR))) { + return true; + } + return false; + } + + public static boolean matchOpenClose(ClauseInfo openMarker, + ClauseInfo closeMarker) { + if (openMarker.getStartOrEnd().equals(START_OR_END.START) + && closeMarker.getStartOrEnd().equals(START_OR_END.END) + && matchTypes(openMarker.getType(), closeMarker.getType())) { + return true; + } + + return false; + } +} Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserBase.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserBase.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserBase.java (revision 0) @@ -0,0 +1,899 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queryparser.span; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParserBase; +import org.apache.lucene.queryparser.span.clauses.ClauseInfo; +import org.apache.lucene.queryparser.span.clauses.SpanNearClauseInfo; +import org.apache.lucene.queryparser.span.clauses.SpanNotNearClauseInfo; +import org.apache.lucene.queryparser.span.clauses.ClauseInfo.TYPE; +import org.apache.lucene.sandbox.queries.SlowFuzzyQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; + +/** + * Following the lead of {@link QueryParserBase}, this separates most of the + * functionality for creating a {@link SpanQuery} from the actual parser. This + * should allow for easy additions of niftier jj or antlr or ?? parsers than the + * current SpanQueryParser. 
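+ * <p>
+ * A minimal usage sketch (illustrative only; the constructor and the query
+ * syntax shown are the ones exercised by the tests above):
+ * <pre>
+ * SpanQueryParser p = new SpanQueryParser("field", analyzer);
+ * SpanQuery q = p.parse("[quick fox]~3");
+ * </pre>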
+ * + */ +public abstract class SpanQueryParserBase { + + private final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); + + private NormMultiTerm normMultiTerm = NormMultiTerm.LOWERCASE; + private MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; + private boolean allowLeadingWildcard = false; + + private Analyzer analyzer; + private String field; + private int phraseSlop = 0; + private float fuzzyMinSim = 0.6f;// instead of FuzzyQuery.defaultMinSimilarity + // 2.0f; + private int fuzzyMaxEdits = FuzzyQuery.defaultMaxEdits; + private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; + private int spanNearMaxDistance = 100; + private int spanNotNearMaxDistance = 50; + private boolean throwExceptionForStopWord = false; + private Locale locale = Locale.getDefault(); + + boolean autoGeneratePhraseQueries = true; + + /** + * Initializes a span query parser. Called by the SpanQueryParser constructor + * + * @param f + * the field for query terms. + * @param a + * used to find (and normalize) terms in the query text. + */ + public void init(String f, Analyzer a) { + analyzer = a; + field = f; + } + + /** + * Parsers must implement this. + * + * @param s + * @return {@link SpanQuery} or null if nothing could be parsed + * @throws ParseException + */ + public abstract SpanQuery parse(String s) throws ParseException; + + /** + * Not currently called by parser + * + * @param s + * @return + */ + public static String escape(String s) { + return QueryParserBase.escape(s); + } + + /** + * @see #setMultiTermRewriteMethod(org.apache.lucene.search.MultiTermQuery.RewriteMethod) + * @return + */ + public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() { + return multiTermRewriteMethod; + } + + /** + * + * By default QueryParser uses + * {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} + * when creating a {@link PrefixQuery}, {@link WildcardQuery}, + * {@link TermRangeQuery}, and {@link RegexpQuery}. This implementation is + * generally preferable because it a) Runs faster b) Does not have the + * scarcity of terms unduly influence score c) avoids any + * {@link TooManyClauses} exception. + * + * To set the max number of rewrites to a number higher than the default, use: + * MultiTermQuery.TopTermsScoringBooleanQueryRewrite(x) or similar. + * + * Beware: as of this writing, + * SpanMultiTermQueryWrapper.TopTermsSpanBooleanQueryRewrite(maxExpansions) + * has no effect. + */ + public void setMultiTermRewriteMethod( + MultiTermQuery.RewriteMethod multiTermRewriteMethod) { + this.multiTermRewriteMethod = multiTermRewriteMethod; + } + + /** + * @see #setAllowLeadingWildcard(boolean) + * @return + */ + public boolean getAllowLeadingWildcard() { + return allowLeadingWildcard; + } + + /** + * Set to true to allow leading wildcard characters. + *
<p> + * When set, <code>*</code> or <code>?</code> are allowed as the first + * character of a PrefixQuery and WildcardQuery. Note that this can produce + * very slow queries on big indexes. + * <p>
+ * Default: false. + */ + public void setAllowLeadingWildcard(boolean allowLeadingWildcard) { + this.allowLeadingWildcard = allowLeadingWildcard; + } + + /** + * @see #setAnalyzer(Analyzer) + * @return + */ + public Analyzer getAnalyzer() { + return analyzer; + } + + /** + * + * @param analyzer + * the analyzer used to find (and normalize) query terms + */ + public void setAnalyzer(Analyzer analyzer) { + this.analyzer = analyzer; + } + + /** + * @see #setField(String) + * @return + */ + public String getField() { + return field; + } + + /** + * + * @param field + * the default field for query terms + */ + public void setField(String field) { + this.field = field; + } + + /** + * @see #setPhraseSlop(int) + * @return + */ + public int getPhraseSlop() { + return phraseSlop; + } + + /** + * + * @param phraseSlop + * default slop to use in phrases + */ + public void setPhraseSlop(int phraseSlop) { + this.phraseSlop = phraseSlop; + } + + /** + * @see #setFuzzyMinSim(float) + * @return + */ + public float getFuzzyMinSim() { + return fuzzyMinSim; + } + + /** + * For a fuzzy query, if the fuzzy value is < 1.0f this will be + * the minimum allowable similarity. For example, if this is set to 0.8f and a + * query of salmonella~0.6 is parsed, the resulting query will be for + * "salmonella" with a fuzziness of 0.8f. Default is 0.6f. + * + * However, if the fuzzy value is >= 1.0f, then see + * {@link #setFuzzyMaxEdits(int)}. + * + * @param fuzzyMinSim + */ + public void setFuzzyMinSim(float fuzzyMinSim) { + this.fuzzyMinSim = fuzzyMinSim; + } + + /** + * @see #setFuzzyPrefixLength(int) + * @return + */ + public int getFuzzyPrefixLength() { + return fuzzyPrefixLength; + } + + /** + * + * @param fuzzyPrefixLength + * prefix length to use in fuzzy queries. Default is 0. + */ + public void setFuzzyPrefixLength(int fuzzyPrefixLength) { + this.fuzzyPrefixLength = fuzzyPrefixLength; + } + + /** + * @see #setFuzzyMaxEdits(int) + * @return + */ + public int getFuzzyMaxEdits() { + return fuzzyMaxEdits; + } + + /** + * Maximum number of edits allowed in a fuzzy query. BEWARE: if this is + * set to anything greater than 2, you'll be out of Automaton land and into + * brute force land (vintage Lucene <= 3.x). This could cause serious + * performance problems. + * + * @param fuzzyMaxEdits + */ + public void setFuzzyMaxEdits(int fuzzyMaxEdits) { + this.fuzzyMaxEdits = fuzzyMaxEdits; + } + + /** + * @see #setLocale(Locale) + * @return + */ + public Locale getLocale() { + return locale; + } + + /** + * So far, only used in lowercasing of multiterm queries. + * + * @param locale + */ + public void setLocale(Locale locale) { + this.locale = locale; + } + + public boolean getAutoGeneratePhraseQueries() { + return autoGeneratePhraseQueries; + } + + public void setAutoGeneratePhraseQueries(boolean b) { + autoGeneratePhraseQueries = b; + } + + /** + * When the parser comes across a simple single term, it runs the term through + * the analyzer. This is called by + * {@link #buildSingleTermQuery(String, String)}. Override this for custom + * handling. In whitespace languages, the returned array will most likely have + * a length of 1. It can have a length of 0 if a stop word was passed in as + * termText. The array may contain null values if the {@link Analyzer} breaks + * the apparently single term into multiple terms and there is a stopword. To + * identify stop words, the Analyzer must have a + * {@link PositionIncrementAttribute}. If it doesn't, this will silently hide + * the nulls and not add them to the array.
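+ * For example (illustrative, assuming an analyzer that splits on hyphens and
+ * drops the stop words "and"/"the"), "dog-and-the-cat" might come back as
+ * ["dog", null, null, "cat"].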
+   *
+   * @param termText
+   *          apparently simple single term
+   * @return array of {@link String} for the parts that the analyzer broke
+   *         this into
+   */
+  protected String[] analyzeSimpleSingleTerm(String termText)
+      throws ParseException {
+    List<String> chunks = new LinkedList<String>();
+    try {
+      TokenStream ts = analyzer.tokenStream(field, termText);
+      ts.reset();
+      CharTermAttribute cAttr = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posAttr = null;
+      if (ts.hasAttribute(PositionIncrementAttribute.class)) {
+        posAttr = ts.getAttribute(PositionIncrementAttribute.class);
+      }
+      while (ts.incrementToken()) {
+        chunks.add(cAttr.toString());
+        if (posAttr != null) {
+          // add one null per position skipped by a stop word
+          for (int i = 1; i < posAttr.getPositionIncrement(); i++) {
+            chunks.add(null);
+          }
+        }
+      }
+      ts.end();
+      ts.close();
+    } catch (IOException e) {
+      throw new ParseException(
+          String.format(getLocale(),
+              "IOException while trying to parse %s : %s", termText,
+              e.getMessage()));
+    }
+
+    return chunks.toArray(new String[chunks.size()]);
+  }
+
+  protected SpanQuery buildRegexTermQuery(String field, String termText) {
+    RegexpQuery regexQuery = new RegexpQuery(new Term(field, termText));
+    regexQuery.setRewriteMethod(multiTermRewriteMethod);
+    return new SpanMultiTermQueryWrapper<RegexpQuery>(regexQuery);
+  }
+
+  /**
+   * @param field
+   *          field for the query
+   * @param termText
+   *          term to fuzz
+   * @param slop
+   *          if < 1.0f, this will be treated as the old minSim; if >= 1.0f,
+   *          this will be rounded and treated as maxEdits
+   * @return fuzzy query wrapped as a {@link SpanQuery}
+   * @throws ParseException
+   */
+  protected SpanQuery buildFuzzyTermQuery(String field, String termText,
+      float slop) throws ParseException {
+    String normalized = termText;
+
+    switch (normMultiTerm) {
+    case NO_NORM:
+      break;
+    case LOWERCASE:
+      normalized = normalized.toLowerCase(getLocale());
+      break;
+    case ANALYZE:
+      normalized = analyzeSingleChunk(field, termText, normalized);
+      break;
+    }
+
+    if (slop == 0.0f) {
+      return new SpanTermQuery(new Term(field, normalized));
+    }
+    // if the user enters 2.4 for example, round it so that there won't be an
+    // IllegalArgumentException
+    if (slop >= 1.0f) {
+      slop = (float) Math.round(slop);
+    }
+
+    // clamp the slop to the configured bounds
+    if (slop < 1.0f && slop < fuzzyMinSim) {
+      slop = fuzzyMinSim;
+    } else if (slop > 1.0f && slop > fuzzyMaxEdits) {
+      slop = fuzzyMaxEdits;
+    }
+    // SlowFuzzyQuery defaults to the Automaton if maxEdits is <= 2.
+    // We don't have to reinvent that wheel.
+    SlowFuzzyQuery fuzzyQuery = new SlowFuzzyQuery(new Term(field, normalized),
+        slop, fuzzyPrefixLength);
+    fuzzyQuery.setRewriteMethod(multiTermRewriteMethod);
+
+    return new SpanMultiTermQueryWrapper<SlowFuzzyQuery>(fuzzyQuery);
+  }
+
+  /**
+   * @param field
+   *          field for the query
+   * @param termText
+   *          should literally be the prefix. It should not end with *.
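+   *          For example (a hypothetical call), for the query
+   *          <code>appl*</code>, pass <code>appl</code>:
+   *          <pre>buildPrefixQuery("field", "appl")</pre>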
+   * @return prefix query wrapped as a {@link SpanQuery}
+   * @throws ParseException
+   */
+  protected SpanQuery buildPrefixQuery(String field, String termText)
+      throws ParseException {
+    // a leading wildcard could happen with a simple * query
+    testLeadingWildcard(termText);
+
+    String normalized = termText;
+
+    switch (normMultiTerm) {
+    case NO_NORM:
+      break;
+    case LOWERCASE:
+      normalized = normalized.toLowerCase(locale);
+      break;
+    case ANALYZE:
+      normalized = analyzeSingleChunk(field, termText, normalized);
+      break;
+    }
+
+    PrefixQuery query = new PrefixQuery(new Term(field, normalized));
+    query.setRewriteMethod(multiTermRewriteMethod);
+    return new SpanMultiTermQueryWrapper<PrefixQuery>(query);
+  }
+
+  protected SpanQuery buildWildcardQuery(String field, String termText)
+      throws ParseException {
+    testLeadingWildcard(termText);
+    String normalized = termText;
+
+    switch (normMultiTerm) {
+    case NO_NORM:
+      break;
+    case LOWERCASE:
+      normalized = normalized.toLowerCase(locale);
+      break;
+    case ANALYZE:
+      normalized = analyzeWildcard(field, termText);
+      break;
+    }
+    WildcardQuery wildcardQuery = new WildcardQuery(new Term(field, normalized));
+    wildcardQuery.setRewriteMethod(multiTermRewriteMethod);
+    return new SpanMultiTermQueryWrapper<WildcardQuery>(wildcardQuery);
+  }
+
+  /**
+   * Build what appears to be a simple single term query. If the analyzer
+   * breaks it into multiple terms, treat that as a "phrase" or as an "or"
+   * depending on the value of {@link #autoGeneratePhraseQueries}.
+   *
+   * @param field
+   *          field for the query
+   * @param termText
+   *          apparently simple single term
+   * @return the built query
+   * @throws ParseException
+   */
+  protected SpanQuery buildSingleTermQuery(String field, String termText)
+      throws ParseException {
+    String[] terms = analyzeSimpleSingleTerm(termText);
+    if (terms.length == 0) {
+      if (throwExceptionForStopWord) {
+        throw new ParseException(String.format(
+            "No terms returned after I tried to normalize what I thought was a single term: %s",
+            termText));
+      } else {
+        return getEmptyQuery();
+      }
+    } else if (terms.length == 1) {
+      return new SpanTermQuery(new Term(field, terms[0]));
+    } else {
+      // if the analyzer broke this into more than one term,
+      // treat it as a span or query or as a
+      // span near query with no slop and in order,
+      // depending on the value of autoGeneratePhraseQueries
+
+      List<SpanQuery> nonEmpties = new LinkedList<SpanQuery>();
+      for (String piece : terms) {
+        if (piece != null) {
+          nonEmpties.add(new SpanTermQuery(new Term(field, piece)));
+        } else if (throwExceptionForStopWord) {
+          throw new ParseException("Stop word found in " + termText);
+        }
+      }
+
+      if (nonEmpties.size() == 0) {
+        return getEmptyQuery();
+      }
+      if (nonEmpties.size() == 1) {
+        return nonEmpties.get(0);
+      }
+
+      SpanQuery[] queries = nonEmpties
+          .toArray(new SpanQuery[nonEmpties.size()]);
+      if (getAutoGeneratePhraseQueries()) {
+        return new SpanNearQuery(queries, 0, true);
+      } else {
+        return new SpanOrQuery(queries);
+      }
+    }
+  }
+
+  /**
+   * Does this start with a wildcard, and is that allowed?
+   *
+   * @param termText
+   *          term text to test
+   * @throws ParseException
+   */
+  private void testLeadingWildcard(String termText) throws ParseException {
+    if (!allowLeadingWildcard
+        && (termText.startsWith("*") || termText.startsWith("?"))) {
+      throw new ParseException(
+          "'*' or '?' not allowed as first character in WildcardQuery with current configuration.");
+    }
+  }
+
+  /**
+   * Builds a SpanQuery clause from components and the ClauseInfo.
+   *
+   * @param clauses
+   *          component queries
+   * @param clauseInfo
+   *          clause marker describing the query type
+   * @return the built query
+   * @throws ParseException
+   */
+  protected SpanQuery buildQuery(List<SpanQuery> clauses, ClauseInfo clauseInfo)
+      throws ParseException {
+
+    if (clauseInfo.getType().equals(TYPE.OR)) {
+      return buildSpanOrQuery(clauses);
+    } else if (clauseInfo.getType().equals(TYPE.NEAR)) {
+      SpanNearClauseInfo tmp = (SpanNearClauseInfo) clauseInfo;
+      return buildSpanNearQuery(clauses, tmp.getSlop(), tmp.getInOrder());
+    } else if (clauseInfo.getType().equals(TYPE.NOT_NEAR)) {
+      SpanNotNearClauseInfo tmp = (SpanNotNearClauseInfo) clauseInfo;
+      return buildSpanNotNearQuery(clauses, tmp.getBefore(), tmp.getAfter());
+    }
+    throw new ParseException(String.format(
+        "I don't know how to build a query for a clause of type: %s",
+        clauseInfo.getType()));
+  }
+
+  /**
+   * @param clauses
+   *          component queries
+   * @return {@link SpanOrQuery}; might be empty if clauses is null or
+   *         contains only empty queries
+   */
+  protected SpanQuery buildSpanOrQuery(List<SpanQuery> clauses) {
+    if (clauses == null || clauses.size() == 0)
+      return getEmptyQuery();
+
+    List<SpanQuery> nonEmpties = removeEmpties(clauses);
+    if (nonEmpties.size() == 0) {
+      return getEmptyQuery();
+    }
+    if (nonEmpties.size() == 1)
+      return nonEmpties.get(0);
+
+    SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]);
+    return new SpanOrQuery(arr);
+  }
+
+  /**
+   * @param clauses
+   *          component queries
+   * @return {@link SpanNearQuery}, a single clause, or an empty query if
+   *         clauses is null or empty; never null
+   */
+  protected SpanQuery buildSpanNearQuery(List<SpanQuery> clauses, int slop,
+      boolean inOrder) {
+    if (clauses == null || clauses.size() == 0)
+      return getEmptyQuery();
+
+    List<SpanQuery> nonEmpties = removeEmpties(clauses);
+
+    if (nonEmpties.size() == 0) {
+      return getEmptyQuery();
+    }
+
+    if (slop > spanNearMaxDistance) {
+      slop = spanNearMaxDistance;
+    }
+
+    if (nonEmpties.size() == 1) {
+      return specialHandlingForSpanNearWithOneComponent(nonEmpties.get(0),
+          slop, inOrder);
+    }
+
+    SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]);
+    return new SpanNearQuery(arr, slop, inOrder);
+  }
+
+  /**
+   * This is meant to "fix" two cases that might be surprising to a
+   * non-whitespace language speaker. If a user entered, e.g.,
+   * "\u5927\u5B66"~3, and {@link #autoGeneratePhraseQueries} is set to true,
+   * then the parser would treat this recursively and yield
+   * [[\u5927\u5B66]]~3. The user probably meant: find those two characters
+   * within three words of each other, not find those right next to each
+   * other and that hit has to be within three words of nothing.
+   * <p>
+   * If a user entered the same thing and {@link #autoGeneratePhraseQueries}
+   * is set to false, then the parser would treat this as [(\u5927\u5B66)]~3:
+   * find one character or the other, and then that hit has to be within
+   * three words of nothing.
+   * <p>
+   * This special handling does create incorrect handling for whitespace
+   * languages. [[quick fox]]~1 should only match on "quick fox", and it will
+   * now match on "fox green quick".
+   * <p>
+   * The current method was chosen because the former use case is probably
+   * far more common than the latter.
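+   * <p>
+   * A sketch of the rewrite (assuming "AB" analyzes to the two terms A and
+   * B; the names are illustrative):
+   * <pre>
+   *   autoGeneratePhraseQueries == true:
+   *     "AB"~3  becomes  spanNear([A, B], slop = 3)
+   *       rather than    spanNear([spanNear([A, B], 0, true)], 3)
+   *   autoGeneratePhraseQueries == false:
+   *     "AB"~3  becomes  spanNear([A, B], slop = 3)
+   *       rather than    spanNear([spanOr([A, B])], 3)
+   * </pre>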
+   *
+   * @param spanQuery
+   *          this is the sole child of a SpanNearQuery
+   * @param slop
+   *          slop from the parent
+   * @param inOrder
+   *          inOrder from the parent
+   * @return the rewritten query, or spanQuery itself if no rewrite applies
+   */
+  private SpanQuery specialHandlingForSpanNearWithOneComponent(
+      SpanQuery spanQuery, int slop, boolean inOrder) {
+
+    if (spanQuery instanceof SpanNearQuery && autoGeneratePhraseQueries) {
+      SpanNearQuery q = (SpanNearQuery) spanQuery;
+      if (q.isInOrder() && q.getSlop() == 0) {
+
+        SpanQuery[] children = q.getClauses();
+        // if the grandchildren aren't all SpanTermQueries,
+        // then this can't be the edge case for the special handling
+        for (SpanQuery c : children) {
+          if (!(c instanceof SpanTermQuery)) {
+            return spanQuery;
+          }
+        }
+        return new SpanNearQuery(children, slop, inOrder);
+      }
+    } else if (spanQuery instanceof SpanOrQuery
+        && !autoGeneratePhraseQueries) {
+      SpanOrQuery q = (SpanOrQuery) spanQuery;
+      SpanQuery[] children = q.getClauses();
+      for (SpanQuery c : children) {
+        if (!(c instanceof SpanTermQuery)) {
+          return spanQuery;
+        }
+      }
+      return new SpanNearQuery(children, slop, inOrder);
+    }
+    return spanQuery;
+  }
+
+  /**
+   * @param clauses
+   *          must contain exactly two clauses: include, then exclude
+   * @return {@link SpanNotQuery}, or a simpler query if a clause is empty
+   */
+  protected SpanQuery buildSpanNotNearQuery(List<SpanQuery> clauses, int pre,
+      int post) throws ParseException {
+    if (clauses.size() != 2) {
+      throw new ParseException(
+          String.format("SpanNotNear query must have two clauses. I count %d",
+              clauses.size()));
+    }
+    // if include is an empty query, treat this as just an empty query
+    if (isEmptyQuery(clauses.get(0))) {
+      return clauses.get(0);
+    }
+    // if exclude is an empty query, return include alone
+    if (isEmptyQuery(clauses.get(1))) {
+      return clauses.get(0);
+    }
+
+    if (pre > spanNotNearMaxDistance) {
+      pre = spanNotNearMaxDistance;
+    }
+    if (post > spanNotNearMaxDistance) {
+      post = spanNotNearMaxDistance;
+    }
+    return new SpanNotQuery(clauses.get(0), clauses.get(1), pre, post);
+  }
+
+  private List<SpanQuery> removeEmpties(List<SpanQuery> queries) {
+    List<SpanQuery> nonEmpties = new ArrayList<SpanQuery>();
+    for (SpanQuery q : queries) {
+      if (!isEmptyQuery(q)) {
+        nonEmpties.add(q);
+      }
+    }
+    return nonEmpties;
+  }
+
+  /**
+   * @see #setNormMultiTerm(NormMultiTerm)
+   * @return how this parser normalizes multiterms
+   */
+  public NormMultiTerm getNormMultiTerm() {
+    return normMultiTerm;
+  }
+
+  /**
+   * Set how the parser should analyze multiterms: {@link FuzzyQuery},
+   * {@link RegexpQuery}, {@link WildcardQuery}.
+   *

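+   * <p>
+   * For example, with a hypothetical lowercasing, umlaut-folding analyzer:
+   * <pre>
+   *   NO_NORM:   Mötley*  stays    Mötley*  (term used verbatim)
+   *   LOWERCASE: Mötley*  becomes  mötley*  (locale-based lowercasing only)
+   *   ANALYZE:   Mötley*  becomes  motley*  (chunks around wildcards are analyzed)
+   * </pre>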
+   * <p>
+   * Warning (copied from AnalyzingQueryParser):
+   * {@link NormMultiTerm#ANALYZE} should only be used with analyzers that do
+   * not use stopwords or that add tokens. Also, several stemming analyzers
+   * are inappropriate: for example, GermanAnalyzer will turn
+   * Häuser into hau, but H?user
+   * will become h?user when using this parser and thus no match
+   * would be found (i.e. using this parser will be no improvement over
+   * {@link NormMultiTerm#LOWERCASE} in such cases).
+   *
+   * @param normMultiTerm
+   *          how to normalize multiterms
+   */
+  public void setNormMultiTerm(NormMultiTerm normMultiTerm) {
+    this.normMultiTerm = normMultiTerm;
+  }
+
+  /**
+   * Sets an upper limit on the maximum distance for a not near query. If
+   * {@link #spanNotNearMaxDistance} is set to 10, and a query of
+   * "foo bar"!~20,8 is parsed, the returned query will be for a
+   * {@link SpanNotQuery} with a pre value of 10 and a post value of 8. The
+   * default is 50.
+   *
+   * @param dist
+   *          the max distance
+   */
+  public void setSpanNotNearMaxDistance(int dist) {
+    assert (dist >= 0);
+    spanNotNearMaxDistance = dist;
+  }
+
+  /**
+   * @see #setSpanNotNearMaxDistance(int)
+   */
+  public int getSpanNotNearMaxDistance() {
+    return spanNotNearMaxDistance;
+  }
+
+  /**
+   * @see #setSpanNearMaxDistance(int)
+   */
+  public int getSpanNearMaxDistance() {
+    return spanNearMaxDistance;
+  }
+
+  /**
+   * Sets an upper limit on the maximum distance for a phrase/near query. If
+   * {@link #spanNearMaxDistance} is set to 10, and a query of "foo bar"~20
+   * is parsed, the returned query will be for a {@link SpanNearQuery} with a
+   * distance of 10. The default is 100.
+   *
+   * @param dist
+   *          the max distance
+   */
+  public void setSpanNearMaxDistance(int dist) {
+    assert (dist >= 0);
+    spanNearMaxDistance = dist;
+  }
+
+  /**
+   * @see #setThrowExceptionForStopWord(boolean)
+   */
+  public boolean getThrowExceptionForStopWord() {
+    return throwExceptionForStopWord;
+  }
+
+  /**
+   * If a stopword is encountered during parsing, should the parser throw a
+   * ParseException or ignore the stopword?
+   *
+   * @param toThrowOrNotToThrow
+   *          true to throw a ParseException on a stopword
+   */
+  public void setThrowExceptionForStopWord(boolean toThrowOrNotToThrow) {
+    throwExceptionForStopWord = toThrowOrNotToThrow;
+  }
+
+  /**
+   * Returns the analyzed form for the given chunk.
+   * <p>
+   * If the analyzer produces more than one output token from the given
+   * chunk, a ParseException is thrown.
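+   * <p>
+   * For example (assuming a simple lowercasing analyzer):
+   * <pre>
+   *   analyzeSingleChunk("field", "MoAt*", "MoAt")  returns "moat"
+   * </pre>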
+   *
+   * @param field
+   *          The target field
+   * @param termStr
+   *          The full term from which the given chunk is excerpted
+   * @param chunk
+   *          The portion of the given termStr to be analyzed
+   * @return The result of analyzing the given chunk
+   * @throws ParseException
+   *           when analysis returns other than one output token
+   */
+  protected String analyzeSingleChunk(String field, String termStr, String chunk)
+      throws ParseException {
+    // plagiarized from AnalyzingQueryParser
+    String analyzed = null;
+    TokenStream stream = null;
+    try {
+      stream = getAnalyzer().tokenStream(field, chunk);
+      stream.reset();
+      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+      // get first and hopefully only output token
+      if (stream.incrementToken()) {
+        analyzed = termAtt.toString();
+
+        // try to increment again; there should only be one output token
+        StringBuilder multipleOutputs = null;
+        while (stream.incrementToken()) {
+          if (null == multipleOutputs) {
+            multipleOutputs = new StringBuilder();
+            multipleOutputs.append('"');
+            multipleOutputs.append(analyzed);
+            multipleOutputs.append('"');
+          }
+          multipleOutputs.append(',');
+          multipleOutputs.append('"');
+          multipleOutputs.append(termAtt.toString());
+          multipleOutputs.append('"');
+        }
+        stream.end();
+        stream.close();
+        if (null != multipleOutputs) {
+          throw new ParseException(String.format(getLocale(),
+              "Analyzer created multiple terms for \"%s\": %s", chunk,
+              multipleOutputs.toString()));
+        }
+      } else {
+        // nothing was returned by the analyzer. Was it a stop word, and did
+        // the user accidentally use an analyzer with stop words?
+        stream.end();
+        stream.close();
+        throw new ParseException(String.format(getLocale(),
+            "Analyzer returned nothing for \"%s\"", chunk));
+      }
+    } catch (IOException e) {
+      throw new ParseException(String.format(getLocale(),
+          "IO error while trying to analyze single term: \"%s\"", termStr));
+    }
+    return analyzed;
+  }
+
+  private String analyzeWildcard(String field, String termText)
+      throws ParseException {
+    // plagiarized from AnalyzingQueryParser
+    Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termText);
+    StringBuilder sb = new StringBuilder();
+    int last = 0;
+
+    while (wildcardMatcher.find()) {
+      // continue if the match is an escaped character
+      if (wildcardMatcher.group(1) != null) {
+        continue;
+      }
+
+      if (wildcardMatcher.start() > 0) {
+        // analyze the chunk before this wildcard character
+        String chunk = termText.substring(last, wildcardMatcher.start());
+        String analyzed = analyzeSingleChunk(field, termText, chunk);
+        sb.append(analyzed);
+      }
+      // append the wildcard character
+      sb.append(wildcardMatcher.group(2));
+
+      last = wildcardMatcher.end();
+    }
+    if (last < termText.length()) {
+      sb.append(analyzeSingleChunk(field, termText, termText.substring(last)));
+    }
+    return sb.toString();
+  }
+
+  private SpanQuery getEmptyQuery() {
+    return new SpanOrQuery(new SpanTermQuery[0]);
+  }
+
+  private boolean isEmptyQuery(SpanQuery q) {
+    return q instanceof SpanOrQuery
+        && ((SpanOrQuery) q).getClauses().length == 0;
+  }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserUtil.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserUtil.java	(revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserUtil.java	(revision 0)
@@ -0,0 +1,492 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryparser.span;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfo;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfoBuilder;
+
+/**
+ * Utility class to handle parsing.
+ */
+public class SpanQueryParserUtil {
+
+  private final static String nearOpen = "[";
+  private final static String nearClose = "]";
+  private final static String orOpen = "(";
+  private final static String orClose = ")";
+
+  private final Set<String> requiredRegexPres;
+  private final Set<String> requiredRegexPosts;
+  private final static String clauseString = "\\" + nearOpen + "\\" + nearClose
+      + "\\" + orOpen + "\\" + orClose;
+  private final static String numberString = "(?:(\\d+)(?:,(\\d+))?)?";
+  private final static Pattern CLAUSE_PATTERN = Pattern.compile("(?s)(["
+      + clauseString + "])(?:(!?~>?)" + numberString + ")?");
+
+  // group 1 captures only the escaped character so that
+  // replaceAll("$1") actually strips the backslash when unescaping
+  private final static Pattern ESCAPES_PATTERN = Pattern.compile("\\\\(.)");
+  private final static Pattern REGEX_PATTERN = Pattern.compile("(\\\\/|/)");
+  private final static Pattern WHITE_SPACE_PATTERN = Pattern.compile("\\s+");
+  private final static Pattern WHITE_SPACE_ONLY_PATTERN = Pattern
+      .compile("^\\s*$");
+
+  public SpanQueryParserUtil() {
+    requiredRegexPres = new HashSet<String>();
+    requiredRegexPres.add("(");
+    requiredRegexPres.add("[");
+    requiredRegexPres.add(" ");
+
+    requiredRegexPosts = new HashSet<String>();
+    requiredRegexPosts.add(")");
+    requiredRegexPosts.add("]");
+    requiredRegexPosts.add(" ");
+  }
+
+  /**
+   * Rewrite double quotes to [ and ].
+   *
+   * @param s
+   *          string
+   * @param escapedChars
+   *          offsets of escaped characters
+   * @return rewritten query
+   */
+  protected String rewriteDoubleQuotes(String s, Set<Integer> escapedChars) {
+    Matcher dMatcher = Pattern.compile("\"").matcher("");// "
+    int last = 0;
+    dMatcher.reset(s);
+    int i = 0;
+    StringBuilder sb = new StringBuilder();
+    while (dMatcher.find()) {
+      if (escapedChars.contains(dMatcher.start())) {
+        continue;
+      }
+      sb.append(s.substring(last, dMatcher.start()));
+      // alternate [ and ] for opening and closing quotes
+      if (i % 2 == 0) {
+        sb.append("[");
+      } else {
+        sb.append("]");
+      }
+      last = dMatcher.end();
+      i++;
+    }
+    sb.append(s.substring(last));
+    return sb.toString();
+  }
+
+  /**
+   * Get a list of all clause markers.
+   *
+   * @param s
+   *          full query string
+   * @param regexes
+   *          list of regex extents
+   * @param escapedChars
+   *          set of escaped characters
+   * @return list of start and end clause markers
+   * @throws ParseException
+   */
+  protected List<ClauseInfo> getClauseMarkers(String s,
+      List<OffsetAttribute> regexes, Set<Integer> escapedChars)
+      throws ParseException {
+    Set<Integer> inRegex = new HashSet<Integer>();
+    for (OffsetAttribute regex : regexes) {
+      for (int i = regex.startOffset(); i < regex.endOffset(); i++) {
+        inRegex.add(i);
+      }
+    }
+
+    Matcher clauseMatcher = CLAUSE_PATTERN.matcher(s);
+    List<ClauseInfo> markers = new ArrayList<ClauseInfo>();
+    while (clauseMatcher.find()) {
+      if (inRegex.contains(clauseMatcher.start())) {
+        continue;
+      }
+      if (escapedChars.contains(clauseMatcher.start())) {
+        continue;
+      }
+
+      ClauseInfo marker = buildClauseMarker(clauseMatcher);
+      markers.add(marker);
+    }
+
+    return markers;
+  }
+
+  /**
+   * Builds a ClauseInfo based on a matcher that has a match to a clause
+   * marker.
+   *
+   * @param m
+   *          matcher positioned at a clause marker match
+   * @return the built ClauseInfo
+   * @throws ParseException
+   */
+  private ClauseInfo buildClauseMarker(Matcher m) throws ParseException {
+    ClauseInfo.TYPE type = null;
+    ClauseInfo.START_OR_END startOrEnd = null;
+
+    // basic marker
+    String marker = m.group(1);
+
+    // near or not near
+    String whichNear = m.group(2);
+
+    // first bit of digits, as in the "2" in [foo bar]~2
+    String firstDigits = m.group(3);
+    // second bit of digits, as in the "3" in [foo bar]!~2,3;
+    // should only be non-null for a not near query
+    String secondDigits = m.group(4);
+    int offsetStart = m.start();
+    int offsetEnd = m.end();
+
+    if (marker.equals(nearOpen)) {
+      type = ClauseInfo.TYPE.NEAR;
+      startOrEnd = ClauseInfo.START_OR_END.START;
+    } else if (marker.equals(orOpen)) {
+      type = ClauseInfo.TYPE.OR;
+      startOrEnd = ClauseInfo.START_OR_END.START;
+    } else if (marker.equals(orClose)) {
+      type = ClauseInfo.TYPE.OR;
+      startOrEnd = ClauseInfo.START_OR_END.END;
+    } else if (marker.equals(nearClose) && whichNear == null) {
+      type = ClauseInfo.TYPE.NEAR;
+      startOrEnd = ClauseInfo.START_OR_END.END;
+    }
+    if (type != null && startOrEnd != null) {
+      return ClauseInfoBuilder.build(startOrEnd, type, offsetStart, offsetEnd);
+    }
+    // spanNotNear
+    if (whichNear.startsWith("!")) {
+      if (firstDigits == null) {
+        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
+            ClauseInfo.TYPE.NOT_NEAR, offsetStart, offsetEnd);
+      }
+
+      // there is a single slop value
+      if (firstDigits != null && secondDigits == null) {
+        int slop = 0;
+        try {
+          slop = Integer.parseInt(firstDigits);
+        } catch (NumberFormatException e) {
+          throw new ParseException(String.format(
+              "There should have been an integer here in span not near query: %s",
+              firstDigits));
+        }
+        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
+            offsetStart, offsetEnd, slop, slop);
+      }
+      // there are separate pre and post values
+      if (firstDigits != null && secondDigits != null) {
+        int pre = 0;
+        try {
+          pre = Integer.parseInt(firstDigits);
+        } catch (NumberFormatException e) {
+          throw new ParseException(String.format(
+              "There should have been an integer here in span not near query: %s",
+              firstDigits));
+        }
+        int post = 0;
+        try {
+          post = Integer.parseInt(secondDigits);
+        } catch (NumberFormatException e) {
+          throw new ParseException(String.format(
+              "There should have been an integer here in span not near query: %s",
+              secondDigits));
+        }
+
+        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
+            offsetStart, offsetEnd, pre, post);
+      }
+
+    } else {
+      boolean inOrder = false;
+      // [foo bar]~ matches "foo bar" and "bar foo"
+      if (whichNear.equals("~") && firstDigits == null) {
+        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
+            offsetStart, offsetEnd, 0, inOrder);
+      }
+
+      if (whichNear.endsWith(">")) {
+        inOrder = true;
+      }
+      // [foo bar]~>
+      if (firstDigits == null) {
+        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
+            offsetStart, offsetEnd, 0, inOrder);
+      }
+      int slop = 0;
+      try {
+        slop = Integer.parseInt(firstDigits);
+      } catch (NumberFormatException e) {
+        throw new ParseException(
+            "A span-near query should have a slop value of only one number.");
+      }
+      return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END, offsetStart,
+          offsetEnd, slop, inOrder);
+    }
+    throw new ParseException(String.format(
+        "Failed to parse: %s and its attributes", marker));
+  }
+
+  /**
+   * From a full list of clause markers for the string, find the closing
+   * clause marker for the marker at <code>start</code>.
+   *
+   * @param clauses
+   *          all clause markers
+   * @param start
+   *          index of the opening marker
+   * @return offset in the list of clause markers where the match is
+   * @throws ParseException
+   *           if the matching marker couldn't be found
+   */
+  protected int findMatching(List<ClauseInfo> clauses, int start)
+      throws ParseException {
+    ClauseInfo startMarker = clauses.get(start);
+    int depth = 0;
+    for (int i = start; i < clauses.size(); i++) {
+      if (ClauseInfo.matchOpenClose(startMarker, clauses.get(i))) {
+        depth++;
+      } else if (startMarker.getType().equals(clauses.get(i).getType())) {
+        depth--;
+      }
+
+      if (depth == 0)
+        return i;
+
+      if (depth > 0)
+        throw new ParseException("too many end markers");
+    }
+    throw new ParseException("couldn't find matching clause markers");
+  }
+
+  /**
+   * Extracts terms from a string that may contain regexes but contains no
+   * clausal boundaries. For example, you would pass "foo /bat/ bar" from the
+   * larger query [ pre [ foo /bat/ bar] post]~10. This extracts terms from
+   * the target span of a full string.
+   *
+   * @param s
+   *          full string
+   * @param targetSpanStart
+   *          start of the target span
+   * @param targetSpanEnd
+   *          end of the target span
+   * @param regexes
+   *          list of regexes within the full string
+   * @param escapedChars
+   *          set of escaped chars within the full string
+   * @return extracted term strings
+   */
+  protected List<String> extractTermStringsBasedOnWhitespace(String s,
+      int targetSpanStart, int targetSpanEnd, List<OffsetAttribute> regexes,
+      Set<Integer> escapedChars) {
+    List<String> terms = new ArrayList<String>();
+
+    // end early if start == end
+    if (targetSpanStart >= targetSpanEnd) {
+      return terms;
+    }
+    int tmpStart = targetSpanStart;
+    for (OffsetAttribute regex : regexes) {
+      if (regex.endOffset() < targetSpanStart) {
+        continue;
+      } else if (regex.startOffset() > targetSpanEnd) {
+        break;
+      }
+      if (regex.startOffset() - 1 >= 0) {
+        // extract terms before the regex
+        List<String> tmp = extractTermsBasedOnWhitespace(s, tmpStart,
+            (regex.startOffset() - 1), escapedChars);
+        terms.addAll(tmp);
+        // extract the regex itself
+        terms.add(s.substring(regex.startOffset(), regex.endOffset() + 1));
+      }
+
+      tmpStart = regex.endOffset() + 1;
+    }
+    // extract terms after the last regex
+    List<String> tmp = extractTermsBasedOnWhitespace(s, tmpStart,
+        targetSpanEnd, escapedChars);
+    terms.addAll(tmp);
+    return terms;
+  }
+
+  /**
+   * This is meant to extract terms from a string that contains no clausal
+   * boundaries and no regexes.
+   * <p>
+   * This extracts Strings from the target span of a full String. This
+   * unescapes the strings.
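+   * <p>
+   * A rough sketch (the set {7, 8} marks the offsets of the backslash and
+   * the escaped space in the input):
+   * <pre>
+   *   extractTermsBasedOnWhitespace("foo bar\ baz", 0, 12, {7, 8})
+   *     returns ["foo", "bar baz"]
+   * </pre>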
+   * <p>
+   * You still need to use an analyzer for non-whitespace languages to break
+   * up the returned strings into tokens.
+   *
+   * @param s
+   *          full string
+   * @param targetSpanStart
+   *          start of the target span
+   * @param targetSpanEnd
+   *          end of the target span
+   * @param escapedChars
+   *          set of escaped characters
+   * @return extracted term strings
+   */
+  protected List<String> extractTermsBasedOnWhitespace(String s,
+      int targetSpanStart, int targetSpanEnd, Set<Integer> escapedChars) {
+    List<String> termStrings = new ArrayList<String>();
+    // stop early if start >= end
+    if (targetSpanStart >= targetSpanEnd) {
+      return termStrings;
+    }
+    Matcher whiteSpaceSplitter = WHITE_SPACE_PATTERN.matcher(s);
+    Matcher unescaper = ESCAPES_PATTERN.matcher("");
+    Matcher whiteSpaceOnly = WHITE_SPACE_ONLY_PATTERN.matcher("");
+    int start = targetSpanStart;
+    int last = start;
+
+    whiteSpaceSplitter = whiteSpaceSplitter.region(targetSpanStart,
+        targetSpanEnd);
+    while (whiteSpaceSplitter.find()) {
+      start = whiteSpaceSplitter.end();
+      if (escapedChars.contains(whiteSpaceSplitter.start())) {
+        continue;
+      }
+      String tmp = s.substring(last, whiteSpaceSplitter.start());
+      if (!whiteSpaceOnly.reset(tmp).find()) {
+        unescaper.reset(tmp);
+        tmp = unescaper.replaceAll("$1");
+
+        termStrings.add(tmp);
+      }
+      last = whiteSpaceSplitter.end();
+    }
+    String tmp = s.substring(last, targetSpanEnd);
+    if (!whiteSpaceOnly.reset(tmp).find()) {
+      unescaper.reset(tmp);
+      tmp = unescaper.replaceAll("$1");
+      termStrings.add(tmp);
+    }
+    return termStrings;
+  }
+
+  /**
+   * Extracts the regex extents within the string.
+   *
+   * @param s
+   *          full string
+   * @param escapedChars
+   *          set of escaped characters
+   * @return list of regex extents
+   * @throws ParseException
+   */
+  protected List<OffsetAttribute> extractRegexes(String s,
+      Set<Integer> escapedChars) throws ParseException {
+    List<OffsetAttribute> offsets = new ArrayList<OffsetAttribute>();
+    Matcher m = REGEX_PATTERN.matcher(s);
+    boolean inRegex = false;
+    int start = -1;
+    while (m.find()) {
+      if (m.group(1).equals("/") && !escapedChars.contains(m.start())) {
+
+        if (inRegex && testRegexPost(s, m.start(), escapedChars)) {
+          OffsetAttribute offset = new OffsetAttributeImpl();
+          // really, we mean it, leave in the -1
+          offset.setOffset(start, m.end() - 1);
+          offsets.add(offset);
+          inRegex = false;
+        } else {
+          if (testRegexPre(s, m.start(), escapedChars)) {
+            inRegex = true;
+            start = m.start();
+          }
+        }
+      }
+    }
+    if (inRegex) {
+      throw new ParseException("Unmatched / in regex");
+    }
+
+    return offsets;
+  }
+
+  /**
+   * Test that the character before the regex looks like a regex boundary.
+   *
+   * @param s
+   *          full string
+   * @param i
+   *          offset of the opening /
+   * @param escapedChars
+   *          set of escaped characters
+   * @return true if the preceding character is a valid regex boundary
+   */
+  private boolean testRegexPre(String s, int i, Set<Integer> escapedChars) {
+    int pre = i - 1;
+    if (pre < 0) {
+      return true;
+    } else if (escapedChars.contains(pre)) {
+      return false;
+    } else if (requiredRegexPres.contains(s.substring(pre, pre + 1))) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Test that the character after the regex looks like a regex boundary.
+   *
+   * @param s
+   *          full string
+   * @param i
+   *          offset of the closing /
+   * @param escapedChars
+   *          set of escaped characters
+   * @return true if the following character is a valid regex boundary
+   */
+  private boolean testRegexPost(String s, int i, Set<Integer> escapedChars) {
+    int post = i + 1;
+    // if the term ends the string
+    if (post >= s.length() - 1) {
+      return true;
+    } else if (escapedChars.contains(post)) {
+      return false;
+    } else if (post + 1 < s.length()
+        && requiredRegexPosts.contains(s.substring(post, post + 1))) {
+      return true;
+    }
+    return false;
+  }
+
+  protected Set<Integer> getEscapedExtents(String s) {
+    Set<Integer> ints = new HashSet<Integer>();
+    Matcher m = ESCAPES_PATTERN.matcher(s);
+
+    while (m.find()) {
+      ints.add(m.start());
+      ints.add(m.start() + 1);
+    }
+    return ints;
+  }
+}
Index: 
lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParser.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParser.java (revision 0) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParser.java (revision 0) @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queryparser.span; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.span.clauses.ClauseInfo; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.spans.SpanQuery; + +/** + *

+ * Parses a query into a {@link SpanQuery} which can be used to fetch
+ * {@link Span}s or with IndexSearcher. This parser includes functionality
+ * from:
+ * <ul>
+ * <li>{@link org.apache.lucene.queryparser.classic.QueryParser classic
+ * QueryParser}: most of its syntax</li>
+ * <li>{@link org.apache.lucene.queryparser.surround.parser.QueryParser
+ * SurroundQueryParser}: recursive parsing for "near" and "not" clauses</li>
+ * <li>{@link ComplexPhraseQueryParser}: can handle "near" queries that
+ * include multiterms ({@link WildcardQuery}, {@link FuzzyQuery},
+ * {@link RegexpQuery})</li>
+ * <li>{@link AnalyzingQueryParser}: has an option to analyze multiterms</li>
+ * </ul>
+ *
+ * <p>
+ * <b>Background</b>: This parser was developed for the concordance/analytic
+ * search use case -- the user wants to see every time a span occurs (perhaps
+ * with a separate FilterQuery). While the SpanQuery that this parser
+ * generates can be used as a Query for traditional information retrieval via
+ * IndexSearcher, this syntax offers far more power than the classic syntax,
+ * and it may not be needed in the general IR use case.
+ *
+ * <p>
+ * With luck, this parser will be made obsolete by LUCENE-2878, but until
+ * then, it fills a niche.
+ *
+ * <p>
+ * One goal was to keep the syntax as close to Lucene's classic
+ * {@link QueryParser} as possible.
+ *
+ * <p>
+ * <b>Similarities and Differences</b>
+ *
+ * <p>
+ * Same as classic syntax:
+ * <ul>
+ * <li>term: test</li>
+ * <li>fuzzy: roam~0.8, roam~2</li>
+ * <li>wildcard: te?t, test*, t*st</li>
+ * <li>regex: /[mb]oat/</li>
+ * <li>phrase: "jakarta apache"</li>
+ * <li>phrase with slop: "jakarta apache"~3</li>
+ * <li>"or" clauses: jakarta apache</li>
+ * <li>grouping clauses: (jakarta apache)</li>
+ * </ul>
+ *
+ * <p>
+ * Main additions in SpanQueryParser syntax vs. classic syntax:
+ * <ul>
+ * <li>Can require "in order" for phrases with slop with the ~> operator:
+ * "jakarta apache"~>3</li>
+ * <li>Can specify "not near": "bieber fever"!~3,10 :: find "bieber" but not
+ * if "fever" appears within 3 words before or 10 words after it.</li>
+ * <li>Fully recursive phrasal queries with [ and ]; as in: [[jakarta
+ * apache]~3 lucene]~>4 :: find "jakarta" within 3 words of "apache", and
+ * that hit has to be within four words before "lucene".</li>
+ * <li>Can also use [] for single-level phrasal queries instead of "" as in:
+ * [jakarta apache]</li>
+ * <li>Can use "or" clauses in phrasal queries: "apache (lucene solr)"~3 ::
+ * find "apache" and then either "lucene" or "solr" within three words.</li>
+ * <li>Can use multiterms in phrasal queries: "jakarta~1 ap*che"~2</li>
+ * <li>Did I mention recursion: [[jakarta~1 ap*che]~2 (solr~
+ * /l[ou]+[cs][en]+/)]~10 :: find something like "jakarta" within two words
+ * of "ap*che" and that hit has to be within ten words of something like
+ * "solr" or that lucene regex.</li>
+ * </ul>
+ *
+ * <p>
+ * Limitations of SpanQueryParser compared with classic QueryParser:
+ * <ol>
+ * <li>SpanQueryParser can create a query for only one field.</li>
+ * <li>Boolean queries are not allowed. There is no AND operator; statements
+ * with more than one term are either "or'd" or handled in proximity
+ * queries.</li>
+ * <li>Boosting is not currently supported.</li>
+ * <li>{@link RangeQuery}s are not yet supported.</li>
+ * <li>This parser is not built with .jj or the antlr parser framework.
+ * Regrettably, because it is generating a {@link SpanQuery}, it can't use
+ * all of the generalizable queryparser infrastructure that was added with
+ * Lucene 4.+.</li>
+ * </ol>
+ *
+ * <p>
+ * <b>Stop word handling</b>
+ *
+ * <p>
+ * The user can choose to throw a {@link ParseException} if a stop word is
+ * encountered. If {@link SpanQueryParserBase#throwExceptionForStopWord} is
+ * set to false (default), the following should happen.
+ * <ul>
+ * <li>Term: "the" will return an empty {@link SpanOrQuery} (similar to the
+ * classic queryparser)</li>
+ * <li>SpanOr: (the apache jakarta) will drop the stop word and return a
+ * {@link SpanOrQuery} for "apache" or "jakarta"</li>
+ * <li>SpanNear: "apache and jakarta" will drop the "and" and match on only
+ * "apache jakarta"</li>
+ * </ul>
+ *
+ * <p>
+ * <b>Expert</b>: Other subtle differences between SpanQueryParser and
+ * classic QueryParser:
+ * <ul>
+ * <li>Fuzzy queries with slop > 2 are handled by SlowFuzzyQuery. The
+ * developer can set the fuzzyMaxEdits.</li>
+ * <li>Regex term queries must currently be preceded or followed by a
+ * parenthesis, a square bracket, white space, or the start or end of the
+ * string:
+ * <ul>
+ * <li>"jakarta /ap[aeiou]*che/" is allowed</li>
+ * <li>"jakarta (/ap[aeiou]*che/ /lucene?/)" is allowed</li>
+ * <li>"jakarta/ap[aeiou]*che/" is not allowed</li>
+ * </ul>
+ * </li>
+ * <li>Non-integer fuzzy values >= 1 are rounded so that an exception is not
+ * thrown.</li>
+ * </ul>
+ *
+ * <p>
+ * <b>NOTE</b>: You must add the sandbox jar to your class path to include
+ * the currently deprecated {@link SlowFuzzyQuery}.
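+ *
+ * <p>
+ * A minimal usage sketch (field name, analyzer, and query are illustrative):
+ * <pre>
+ *   SpanQueryParser parser = new SpanQueryParser("content", analyzer);
+ *   SpanQuery q = parser.parse("[[jakarta apache]~3 lucene]~>4");
+ * </pre>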

+ *
+ */
+public class SpanQueryParser extends SpanQueryParserBase {
+
+  private static final Pattern FUZZY_PATTERN = Pattern
+      .compile("(?s)^(.+)~(\\d+)?(?:\\.(\\d+))?$");
+  private static final Pattern WILDCARD_PATTERN = Pattern.compile("([?*])");
+  private static final Pattern REGEX_PATTERN = Pattern
+      .compile("(?s)^\\/(.+?)\\/$");
+  private static final Pattern ESCAPE_PATTERN = Pattern.compile("\\\\.");
+
+  /**
+   * Initialize with field and analyzer. This parser can only process a
+   * single field. It will use the analyzer for normalizing query terms and
+   * for tokenizing character runs from non-whitespace languages.
+   *
+   * @param field
+   *          field to parse queries against
+   * @param analyzer
+   *          analyzer for query terms
+   */
+  public SpanQueryParser(String field, Analyzer analyzer) {
+    init(field, analyzer);
+  }
+
+  /**
+   * Returns a {@link SpanQuery}, or null if an empty string or no parseable
+   * content is passed in.
+   */
+  public SpanQuery parse(String s) throws ParseException {
+    SpanQueryParserUtil parserUtil = new SpanQueryParserUtil();
+    // treat every query as if it were a big spanOr;
+    // there is an unsettling, yet small inefficiency to this; fix if a
+    // solution is obvious
+    StringBuilder sb = new StringBuilder();
+    sb.append("(").append(s).append(")");
+    s = sb.toString();
+    Set<Integer> escapedChars = parserUtil.getEscapedExtents(s);
+    s = parserUtil.rewriteDoubleQuotes(s, escapedChars);
+    List<OffsetAttribute> regexes = parserUtil.extractRegexes(s, escapedChars);
+    List<ClauseInfo> clauses = parserUtil.getClauseMarkers(s, regexes,
+        escapedChars);
+
+    return parse(parserUtil, getField(), s, 0, clauses, regexes, escapedChars);
+  }
+
+  private SpanQuery parse(SpanQueryParserUtil util, String field, String s,
+      int startMarkerIndex, List<ClauseInfo> clauseMarkers,
+      List<OffsetAttribute> regexes, Set<Integer> escapedChars)
+      throws ParseException {
+
+    if (s == null || s.length() == 0)
+      return null;
+
+    ClauseInfo startMarker = clauseMarkers.get(startMarkerIndex);
+    int endMarkerIndex = util.findMatching(clauseMarkers, startMarkerIndex);
+    ClauseInfo endMarker = clauseMarkers.get(endMarkerIndex);
+
+    List<SpanQuery> queryClauses = new ArrayList<SpanQuery>();
+
+    int childStartInd = startMarkerIndex + 1;
+    int childEndInd = -1;
+    int lastStartChar = startMarker.getEnd();
+
+    while (childStartInd < endMarkerIndex) {
+
+      childEndInd = util.findMatching(clauseMarkers, childStartInd);
+
+      // handle the terms before the child clause markers
+      int tmpStart = lastStartChar;
+      int tmpEnd = clauseMarkers.get(childStartInd).getStart();
+      List<SpanQuery> preTermQueries = parseBasicTerms(util, field, s,
+          tmpStart, tmpEnd, regexes, escapedChars);
+      for (SpanQuery q : preTermQueries) {
+        queryClauses = addQuery(q, queryClauses);
+      }
+      // recurse into the child clause
+      SpanQuery tmpQ = parse(util, field, s, childStartInd, clauseMarkers,
+          regexes, escapedChars);
+      queryClauses = addQuery(tmpQ, queryClauses);
+      lastStartChar = clauseMarkers.get(childEndInd).getEnd();
+      childStartInd = childEndInd + 1;
+    }
+
+    int endInd = (childEndInd > -1) ? childEndInd : startMarkerIndex;
+    int contentOffsetStart = clauseMarkers.get(endInd).getEnd();
+    int contentOffsetEnd = endMarker.getStart();
+    List<SpanQuery> postTermQueries = parseBasicTerms(util, field, s,
+        contentOffsetStart, contentOffsetEnd, regexes, escapedChars);
+    for (SpanQuery q : postTermQueries) {
+      queryClauses = addQuery(q, queryClauses);
+    }
+
+    return buildQuery(queryClauses, endMarker);
+  }
+
+  private List<SpanQuery> parseBasicTerms(SpanQueryParserUtil util,
+      String field, String s, int start, int end,
+      List<OffsetAttribute> regexes, Set<Integer> escapedChars)
+      throws ParseException {
+
+    List<String> termStrings = util.extractTermStringsBasedOnWhitespace(s,
+        start, end, regexes, escapedChars);
+
+    return convertTermStringsToSpanQueries(field, termStrings);
+  }
+
+  private List<SpanQuery> addQuery(SpanQuery q, List<SpanQuery> list) {
+    if (null != q)
+      list.add(q);
+    return list;
+  }
+
+  /**
+   * Simply convert term strings to SpanQueries.
+   *
+   * @param field
+   *          field for the queries
+   * @param strings
+   *          term strings
+   * @return list of SpanQueries
+   * @throws ParseException
+   */
+  private List<SpanQuery> convertTermStringsToSpanQueries(String field,
+      List<String> strings) throws ParseException {
+    List<SpanQuery> terms = new ArrayList<SpanQuery>();
+    for (String s : strings) {
+      SpanQuery tmpT = buildAnyTermQuery(s);
+      if (tmpT != null) {
+        terms = addQuery(tmpT, terms);
+      }
+    }
+    return terms;
+  }
+
+  /**
+   * This identifies and then builds the various span term and/or multiterm
+   * queries. Protected for testing purposes.
+   *

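+   * <p>
+   * A rough sketch of what different term strings build:
+   * <pre>
+   *   /[mb]oat/  builds a regexp query (wrapped as a span)
+   *   roam~2     builds a fuzzy query (SlowFuzzyQuery wrapped as a span)
+   *   test*      builds a prefix query (wrapped as a span)
+   *   te?t       builds a wildcard query (wrapped as a span)
+   *   test       builds a single term query (after analysis)
+   * </pre>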
+   * For {@link FuzzyQuery}, this defaults to
+   * {@link FuzzyQuery#defaultMaxEdits} if no value is specified after the ~.
+   *
+   * @param termText
+   *          term text to analyze and build
+   * @return SpanQuery or null if termText is a stop word
+   * @throws ParseException
+   */
+  protected SpanQuery buildAnyTermQuery(String termText) throws ParseException {
+    // TODO: add range query
+    // is this a regex term?
+    Matcher m = REGEX_PATTERN.matcher(termText);
+    if (m.find()) {
+      return buildRegexTermQuery(getField(), m.group(1));
+    }
+
+    Set<Integer> escapes = new HashSet<Integer>();
+    m = ESCAPE_PATTERN.matcher(termText);
+    while (m.find()) {
+      escapes.add(m.end() - 1);
+    }
+    SpanQuery q = null;
+
+    // is this a fuzzy term?
+    m = FUZZY_PATTERN.matcher(termText);
+    if (m.find()) {
+      String term = m.group(1);
+      // only if the ~ is not actually an escaped fuzzy marker
+      if (!escapes.contains(m.end(1))) {
+
+        String slopString = m.group(2);
+        String decimalComponent = m.group(3);
+        float slop = (float) FuzzyQuery.defaultMaxEdits;
+        if (slopString != null) {
+          if (decimalComponent == null || decimalComponent.length() == 0) {
+            decimalComponent = "0";
+          }
+          try {
+            slop = Float.parseFloat(slopString + "." + decimalComponent);
+          } catch (NumberFormatException e) {
+            // shouldn't ever happen; if it does, fall back to the default
+            // value of slop
+          }
+        }
+        // if the user enters 2.4 for example, round it so that there won't
+        // be an IllegalArgumentException
+        if (slop >= 1.0f) {
+          slop = (float) Math.round(slop);
+        }
+        q = buildFuzzyTermQuery(getField(), term, slop);
+      }
+    }
+
+    // is this a wildcard term?
+    m = WILDCARD_PATTERN.matcher(termText);
+    Set<Integer> ws = new HashSet<Integer>();
+    while (m.find()) {
+      if (!escapes.contains(m.start())) {
+        ws.add(m.start());
+      }
+    }
+    if (ws.size() > 0) {
+      if (q != null) {
+        throw new ParseException(
+            "Can't have a single term in a query that is both a wildcard and a fuzzy query");
+      }
+
+      if (ws.size() == 1 // there's only one unescaped wildcard character
+          && ws.contains(termText.length() - 1) // it is the last character
+          && termText.indexOf("*") == termText.length() - 1 // it is * not ?
+      ) {
+        // snip the final * and treat this as a prefix query
+        q = buildPrefixQuery(getField(),
+            termText.substring(0, termText.length() - 1));
+      } else {
+        q = buildWildcardQuery(getField(), termText);
+      }
+    }
+
+    // if you've found anything, return it
+    if (q != null) {
+      return q;
+    }
+    // treat as a basic single term query
+    return buildSingleTermQuery(getField(), termText);
+  }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/NormMultiTerm.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/NormMultiTerm.java	(revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/NormMultiTerm.java	(revision 0)
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span;
+
+/**
+ * Options for handling a multiterm: wildcard, prefix, fuzzy (not regexp!).
+ */
+public enum NormMultiTerm {
+  ANALYZE, LOWERCASE, NO_NORM
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/package.html
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/package.html	(revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/package.html	(revision 0)
@@ -0,0 +1,24 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+SpanQueryParser generates a SpanQuery, which can be used to grab spans or in
+traditional document retrieval with IndexSearcher.
+</body>
+</html>