Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/span/AnalyzingSpanQueryParserTest.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/span/AnalyzingSpanQueryParserTest.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/span/AnalyzingSpanQueryParserTest.java (revision 0)
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.span.NormMultiTerm;
+import org.apache.lucene.queryparser.span.SpanQueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/**
+ * Adapted largely from the SpanQueryParser tests.
+ *
+ */
+public class AnalyzingSpanQueryParserTest extends LuceneTestCase {
+
+  private static final String O_UMLAUT_S = "\u00F6";
+  private static final char O_UMLAUT_C = '\u00F6';
+  private static final String FIELD = "field";
+
+  private Analyzer a;
+  // query string -> document content the query is expected to match
+  private Map<String, String> wildcardEscapeHits = new TreeMap<String, String>();
+  // query string -> document content the query must NOT match
+  private Map<String, String> wildcardEscapeMisses = new TreeMap<String, String>();
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+
+    wildcardEscapeHits.put("m" + O_UMLAUT_S + "tley", "motley");
+
+    wildcardEscapeHits.put("m" + O_UMLAUT_S + "*tley", "moatley");
+
+    // need to have at least one genuine wildcard to trigger the wildcard
+    // analysis, hence the * before the y
+    wildcardEscapeHits.put("m" + O_UMLAUT_S + "\\*tl*y", "mo*tley");
+
+    // escaped backslash then true wildcard
+    wildcardEscapeHits.put("m" + O_UMLAUT_S + "\\\\*tley", "mo\\atley");
+
+    // escaped wildcard then true wildcard
+    wildcardEscapeHits.put("m" + O_UMLAUT_S + "\\??ley", "mo?tley");
+
+    // the first is an escaped * which should yield a miss
+    wildcardEscapeMisses.put("m" + O_UMLAUT_S + "\\*tl*y", "moatley");
+
+    a = new ToyASCIIAnalyzer();
+  }
+
+  @Test
+  public void testWildCardEscapes() throws ParseException, IOException {
+    for (Map.Entry<String, String> entry : wildcardEscapeHits.entrySet()) {
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertTrue("WildcardEscapeHits: " + entry.getKey(),
+          isAHit(q, entry.getValue(), a));
+    }
+    for (Map.Entry<String, String> entry : wildcardEscapeMisses.entrySet()) {
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertFalse("WildcardEscapeMisses: " + entry.getKey(),
+          isAHit(q, entry.getValue(), a));
+    }
+  }
+
+  /** Folds o-umlaut characters to plain ascii 'o' in each token. */
+  final static class FoldingFilter extends TokenFilter {
+    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+    public FoldingFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        // fold in place on the term buffer
+        char[] term = termAtt.buffer();
+        for (int i = 0; i < term.length; i++) {
+          if (term[i] == O_UMLAUT_C) {
+            term[i] = 'o';
+          }
+        }
+        return true;
+      }
+      return false;
+    }
+  }
+
+  /** Whitespace tokenizer + {@link FoldingFilter}. */
+  final static class ToyASCIIAnalyzer extends Analyzer {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE,
+          true);
+      return new TokenStreamComponents(result, new FoldingFilter(result));
+    }
+  }
+
+  /** Parses s with multiterm analysis turned on. */
+  private SpanQuery getAnalyzedQuery(String s, Analyzer a,
+      boolean allowLeadingWildcard) throws ParseException {
+    SpanQueryParser qp = new SpanQueryParser(FIELD, a);
+    qp.setNormMultiTerm(NormMultiTerm.ANALYZE);
+    qp.setAllowLeadingWildcard(allowLeadingWildcard);
+    return qp.parse(s);
+  }
+
+  /**
+   * Indexes content as the only document in a fresh directory and returns
+   * whether q matches exactly that one document.
+   */
+  private boolean isAHit(Query q, String content, Analyzer analyzer)
+      throws IOException {
+    Directory ramDir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
+    Document doc = new Document();
+    FieldType fieldType = new FieldType();
+    fieldType.setIndexed(true);
+    fieldType.setTokenized(true);
+    fieldType.setStored(true);
+    doc.add(new Field(FIELD, content, fieldType));
+    writer.addDocument(doc);
+    writer.close();
+    DirectoryReader ir = DirectoryReader.open(ramDir);
+    // close reader/dir even if search throws
+    try {
+      IndexSearcher is = new IndexSearcher(ir);
+      return is.search(q, 10).totalHits == 1;
+    } finally {
+      ir.close();
+      ramDir.close();
+    }
+  }
+}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserTest.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserTest.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserTest.java (revision 0)
@@ -0,0 +1,806 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.io.IOException;
+import java.io.Reader;
+
+import static org.apache.lucene.util.automaton.BasicAutomata.makeString;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.span.NormMultiTerm;
+import org.apache.lucene.queryparser.span.SpanQueryParser;
+import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TotalHitCountCollector;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class SpanQueryParserTest extends LuceneTestCase {
+
+  // shared read-only fixtures, built once in beforeClass()
+  private static IndexReader reader;
+  private static IndexSearcher searcher;
+  private static Directory directory;
+  private static Analyzer stopAnalyzer;
+  private static Analyzer noStopAnalyzer;
+  private static final String FIELD = "field";
+
+  // stop word set for stopAnalyzer; note \u5927 is included so the
+  // non-whitespace-language tests also exercise stop word removal
+  private static final CharacterRunAutomaton STOP_WORDS = new CharacterRunAutomaton(
+      BasicOperations.union(Arrays.asList(makeString("a"), makeString("an"),
+          makeString("and"), makeString("are"), makeString("as"),
+          makeString("at"), makeString("be"), makeString("but"),
+          makeString("by"), makeString("for"), makeString("if"),
+          makeString("in"), makeString("into"), makeString("is"),
+          makeString("it"), makeString("no"), makeString("not"),
+          makeString("of"), makeString("on"), makeString("or"),
+          makeString("such"), makeString("that"), makeString("the"),
+          makeString("their"), makeString("then"), makeString("there"),
+          makeString("these"), makeString("they"), makeString("this"),
+          makeString("to"), makeString("was"), makeString("will"),
+          makeString("with"), makeString("\u5927"))));
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    // analyzer that keeps every token
+    noStopAnalyzer = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String fieldName,
+          Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE,
+            true);
+        return new TokenStreamComponents(tokenizer,
+            new MockStandardTokenizerFilter(tokenizer));
+      }
+    };
+
+    // same pipeline, with a stop filter appended
+    stopAnalyzer = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String fieldName,
+          Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE,
+            true);
+        TokenFilter filter = new MockStandardTokenizerFilter(tokenizer);
+        return new TokenStreamComponents(tokenizer,
+            new MockTokenFilter(filter, STOP_WORDS));
+      }
+    };
+
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, stopAnalyzer)
+            .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000))
+            .setMergePolicy(newLogMergePolicy()));
+    String[] docs = new String[] {
+        "the quick brown fox ",
+        "jumped over the lazy brown dog and the brown green cat",
+        "quick green fox",
+        "abcdefghijk",
+        "over green lazy",
+        // longish doc for recursion test
+        "eheu fugaces postume postume labuntur anni nec "
+            + "pietas moram rugis et instanti senectae "
+            + "adferet indomitaeque morti",
+        // non-whitespace language
+        "\u666E \u6797 \u65AF \u987F \u5927 \u5B66" };
+    for (String content : docs) {
+      Document doc = new Document();
+      doc.add(newTextField(FIELD, content, Field.Store.YES));
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    searcher = new IndexSearcher(reader);
+    writer.close();
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    // release index resources and null out all statics so they can be GC'd
+    reader.close();
+    directory.close();
+    searcher = null;
+    reader = null;
+    directory = null;
+    stopAnalyzer = null;
+    noStopAnalyzer = null;
+  }
+
+  @Test
+  public void testBasic() throws Exception {
+    SpanQueryParser parser = new SpanQueryParser(FIELD, stopAnalyzer);
+
+    // null and empty query strings match nothing
+    countSpansDocs(parser, null, 0, 0);
+    countSpansDocs(parser, "", 0, 0);
+
+    // "brown" occurs three times across two documents
+    countSpansDocs(parser, "brown", 3, 2);
+  }
+
+  @Test
+  public void testNear() throws Exception {
+    SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer);
+
+    boolean exc = false;
+    try {
+      p.parse("\"brown \"dog\"");
+    } catch (ParseException e) {
+      exc = true;
+    }
+    assertTrue("unmatching \"", exc);
+
+    exc = false;
+    try {
+      p.parse("[brown [dog]");
+    } catch (ParseException e) {
+      exc = true;
+    }
+    assertTrue("unmatched [", exc);
+
+    testOffsetForSingleSpanMatch(p, "\"brown dog\"", 1, 4, 6);
+
+    countSpansDocs(p, "\"lazy dog\"", 0, 0);
+
+    testOffsetForSingleSpanMatch(p, "\"lazy dog\"~2", 1, 3, 6);
+
+    testOffsetForSingleSpanMatch(p, "\"lazy dog\"~>2", 1, 3, 6);
+
+    testOffsetForSingleSpanMatch(p, "\"dog lazy\"~2", 1, 3, 6);
+
+    countSpansDocs(p, "\"dog lazy\"~>2", 0, 0);
+
+    testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~10", 1, 3, 11);
+
+    testOffsetForSingleSpanMatch(p, "[\"lazy dog\"~>2 cat]~>10", 1, 3, 11);
+
+    countSpansDocs(p, "[cat \"lazy dog\"~>2]~>10", 0, 0);
+
+    // shows that "intervening" for multiple terms is additive
+    // 3 includes "over the" and "brown"
+    testOffsetForSingleSpanMatch(p, "[jumped lazy dog]~3", 1, 0, 6);
+
+    // only two words separate each hit, but together, the intervening words > 2
+    countSpansDocs(p, "[jumped lazy dog]~2", 0, 0);
+  }
+
+  @Test
+  public void testNotNear() throws Exception {
+    SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer);
+    boolean exc = false;
+    try {
+      p.parse("\"brown dog car\"!~2,2");
+    } catch (ParseException e) {
+      exc = true;
+    }
+    assertTrue("must have 2 components", exc);
+
+    countSpansDocs(p, "\"brown dog\"!~2,2", 2, 2);
+
+    testOffsetForSingleSpanMatch(p, "\"brown (green dog)\"!~1,1", 0, 2, 3);
+
+    countSpansDocs(p, "\"brown (cat dog)\"!~1,1", 2, 2);
+
+    countSpansDocs(p, "\"brown (quick lazy)\"!~0,4", 3, 2);
+
+    countSpansDocs(p, "\"brown quick\"!~1,4", 2, 1);
+
+    testOffsetForSingleSpanMatch(p, "\"brown (quick lazy)\"!~1,4", 1, 8, 9);
+
+    // test empty
+    countSpansDocs(p, "\"z y\"!~0,4", 0, 0);
+
+    testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~1,1", 2, 0, 3);
+
+    // traditional SpanNotQuery
+    testOffsetForSingleSpanMatch(p, "[[quick fox]~3 brown]!~", 2, 0, 3);
+  }
+
+  @Test
+  public void testWildcard() throws Exception {
+    SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer);
+
+    boolean exc = false;
+    try {
+      p.parse("*og");
+    } catch (ParseException e) {
+      exc = true;
+    }
+    assertTrue("no leading wildcards \"", exc);
+    p.setAllowLeadingWildcard(true);
+
+    // lowercasing as default
+    testOffsetForSingleSpanMatch(p, "*OG", 1, 5, 6);
+
+    p.setNormMultiTerm(NormMultiTerm.NO_NORM);
+
+    countSpansDocs(p, "*OG", 0, 0);
+
+    testOffsetForSingleSpanMatch(p, "*og", 1, 5, 6);
+    testOffsetForSingleSpanMatch(p, "?og", 1, 5, 6);
+
+    // brown dog and brown fox
+    countSpansDocs(p, "[brown ?o?]", 2, 2);
+    countSpansDocs(p, "[br* ?o?]", 2, 2);
+  }
+
+  @Test
+  public void testPrefix() throws Exception {
+    SpanQueryParser parser = new SpanQueryParser(FIELD, noStopAnalyzer);
+
+    // prefix queries are lowercased by default
+    countSpansDocs(parser, "BR*", 3, 2);
+    countSpansDocs(parser, "br*", 3, 2);
+
+    // without normalization, the uppercase prefix no longer matches
+    parser.setNormMultiTerm(NormMultiTerm.NO_NORM);
+    countSpansDocs(parser, "BR*", 0, 0);
+
+    // not actually a prefix query
+    countSpansDocs(parser, "br?", 0, 0);
+
+    // bare * matches every token in every document
+    parser.setAllowLeadingWildcard(true);
+    countSpansDocs(parser, "*", 39, 7);
+  }
+
+  @Test
+  public void testRegex() throws Exception {
+    SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer);
+    boolean exc = false;
+    try {
+      p.parse("/brown");
+    } catch (ParseException e) {
+      exc = true;
+    }
+    assertTrue("mismatching", exc);
+    exc = false;
+
+    countSpansDocs(p, "/b[wor]+n/", 3, 2);
+    countSpansDocs(p, " /b[wor]+n/ ", 3, 2);
+
+    testOffsetForSingleSpanMatch(p, " [/b[wor]+n/ fox]", 0, 2, 4);
+
+    try {
+      p.parse("[/b[wor]+n/fox]");
+    } catch (ParseException e) {
+      exc = true;
+    }
+    assertTrue("must have space after regex", exc);
+
+    countSpansDocs(p, " [/b[wor]+n/ (fox dog)]", 2, 2);
+
+    // not lower-casing or normalizing in regex!!!
+    countSpansDocs(p, "/B[wor]+n/", 0, 0);
+  }
+
+  @Test
+  public void testFuzzy() throws Exception {
+    SpanQueryParser parser = new SpanQueryParser(FIELD, noStopAnalyzer);
+
+    // with default max edits, 3 edits is too far
+    countSpansDocs(parser, "bruun~", 3, 2);
+    countSpansDocs(parser, "bruun~2", 3, 2);
+    countSpansDocs(parser, "abcdefgh~3", 0, 0);
+
+    // raising the limit lets the 3-edit fuzzy term match
+    parser.setFuzzyMaxEdits(3);
+    testOffsetForSingleSpanMatch(parser, "abcdefgh~3", 3, 0, 1);
+
+    // default lowercasing
+    testOffsetForSingleSpanMatch(parser, "Abcdefgh~3", 3, 0, 1);
+
+    parser.setNormMultiTerm(NormMultiTerm.NO_NORM);
+    countSpansDocs(parser, "Abcdefgh~3", 0, 0);
+  }
+
+  @Test
+  public void testStopWords() throws Exception {
+    // Stop word handling has some room for improvement with SpanQuery.
+    // These tests codify the expectations (for regular behavior,
+    // parse exceptions and false hits) as of this writing.
+    SpanQueryParser p = new SpanQueryParser(FIELD, stopAnalyzer);
+
+    countSpansDocs(p, "the", 0, 0);
+
+    // these are whittled down to just a query for brown
+    countSpansDocs(p, "[the brown]", 3, 2);
+    countSpansDocs(p, "(the brown)", 3, 2);
+    countSpansDocs(p, "[brown the]!~5,5", 3, 2);
+
+    // this should be whittled to a query for "the"
+    countSpansDocs(p, "[the brown]!~5,5", 0, 0);
+
+    // this will not match because "the" is silently dropped from the query
+    countSpansDocs(p, "[over the lazy]", 0, 0);
+
+    // this will get one right hit, but incorrectly match "over green lazy"
+    countSpansDocs(p, "[over the lazy]~1", 2, 2);
+
+    // once the parser is set to throw, every query containing a stop word
+    // must raise a ParseException
+    p.setThrowExceptionForStopWord(true);
+    assertParseException(p, "the");
+    assertParseException(p, "[the brown]");
+    assertParseException(p, "(the brown)");
+    assertParseException(p, "[the brown]!~2,2");
+    assertParseException(p, "[brown the]!~2,2");
+
+    // add tests for surprise phrasal with stopword!!! chinese
+    SpanQueryParser noStopsParser = new SpanQueryParser(FIELD, noStopAnalyzer);
+    // won't match because stop word was dropped in index
+    countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0);
+    // won't match for same reason
+    countSpansDocs(noStopsParser, "[\u666E\u6797\u65AF\u987F\u5927\u5B66]~2",
+        0, 0);
+
+    testOffsetForSingleSpanMatch(noStopsParser,
+        "[\u666E \u6797 \u65AF \u987F \u5B66]~2", 6, 0, 6);
+  }
+
+  /** Asserts that parsing s throws a ParseException. */
+  private void assertParseException(SpanQueryParser p, String s) {
+    try {
+      p.parse(s);
+      fail("expected ParseException for: " + s);
+    } catch (ParseException expected) {
+      // expected
+    }
+  }
+
+  /**
+   * Exercises phrase/autogeneration behavior for text without whitespace
+   * between tokens (fixture doc 6 contains the CJK test string).
+   */
+  @Test
+  public void testNonWhiteSpaceLanguage() throws Exception {
+    SpanQueryParser noStopsParser = new SpanQueryParser(FIELD, noStopAnalyzer);
+
+    testOffsetForSingleSpanMatch(noStopsParser, "\u666E", 6, 0, 1);
+
+    // default autogenerate phrase queries = true
+    testOffsetForSingleSpanMatch(noStopsParser, "\u666E\u6797", 6, 0, 2);
+
+    // this would have a hit if autogenerate phrase queries = false
+    countSpansDocs(noStopsParser, "\u666E\u65AF", 0, 0);
+
+    // treat as "or", this should have two spans
+    countSpansDocs(noStopsParser, "\u666E \u65AF", 2, 1);
+
+    // stop word removed at indexing time and non existent here,
+    // this is treated as an exact phrase and should not match
+    countSpansDocs(noStopsParser, "\u666E\u6797\u65AF\u987F\u5B66", 0, 0);
+
+    // this should be the same as above
+    countSpansDocs(noStopsParser, "[\u666E \u6797 \u65AF \u987F \u5B66]~0", 0,
+        0);
+
+    // look for the same phrase but allow for some slop; this should have one
+    // hit because this will skip the stop word
+    testOffsetForSingleSpanMatch(noStopsParser,
+        "[\u666E \u6797 \u65AF \u987F \u5B66]~1", 6, 0, 6);
+
+    // This tests the #specialHandlingForSpanNearWithOneComponent
+    // this is initially treated as [ [\u666E\u6797\u65AF\u987F\u5B66]~>0 ]~2
+    // with the special treatment, this is rewritten as
+    // [\u666E \u6797 \u65AF \u987F \u5B66]~1
+    testOffsetForSingleSpanMatch(noStopsParser,
+        "[\u666E\u6797\u65AF\u987F\u5B66]~1", 6, 0, 6);
+
+    // this would be the English equivalent, which is technically wrong.
+    // I went with this method under the belief that a Chinese speaker
+    // is much more likely to write the above and want this behavior
+    // than an English speaker is likely to test this silly edge case.
+    testOffsetForSingleSpanMatch(noStopsParser, "[[lazy dog] ]~4", 1, 3, 6);
+
+    noStopsParser.setAutoGeneratePhraseQueries(false);
+
+    // characters split into 2 tokens and treated as an "or" query
+    countSpansDocs(noStopsParser, "\u666E\u65AF", 2, 1);
+
+    // TODO: Not sure i like how this behaves.
+    // this is treated as [(\u666E \u6797 \u65AF \u987F \u5B66)]~2
+    // which is then simplified to just: (\u666E \u6797 \u65AF \u987F \u5B66)
+    // Probably better to be treated as [\u666E \u6797 \u65AF \u987F \u5B66]~2
+    testOffsetForSingleSpanMatch(noStopsParser,
+        "[\u666E\u6797\u65AF\u987F\u5B66]~1", 6, 0, 6);
+
+    SpanQueryParser stopsParser = new SpanQueryParser(FIELD, stopAnalyzer);
+    countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0);
+
+    // now test for throwing of exception
+    stopsParser.setThrowExceptionForStopWord(true);
+    boolean exc = false;
+    try {
+      countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0);
+    } catch (ParseException e) {
+      exc = true;
+    }
+    assertEquals(true, exc);
+  }
+
+  @Test
+  public void testRecursion() throws Exception {
+    /*
+     * For easy reference of expected offsets
+     *
+     * 0: eheu 1: fugaces 2: postume 3: postume 4: labuntur 5: anni 6: nec 7:
+     * pietas 8: moram 9: rugis 10: et 11: instanti 12: senectae 13: adferet 14:
+     * indomitaeque 15: morti
+     */
+    SpanQueryParser p = new SpanQueryParser(FIELD, noStopAnalyzer);
+
+    // Span extents end at one more than the actual end, e.g.:
+    String q = "fugaces";
+    testOffsetForSingleSpanMatch(p, q, 5, 1, 2);
+
+    q = "morti";
+    testOffsetForSingleSpanMatch(p, q, 5, 15, 16);
+
+    q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~2";
+    testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+    // not near query for rugis senectae
+    q = "[labunt* [pietas [rug?s senec*]!~2 ]~4 adferet]~2";
+    countSpansDocs(p, q, 0, 0);
+
+    // not near query for rugis senectae, 0 before or 2 after
+    // Have to extend overall distance to 5 because hit for
+    // "rug?s senec*" matches only "rug?s" now
+    q = "[labunt* [pietas [rug?s senec*]!~2,0 ]~4 adferet]~5";
+    testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+    // not near query for rugis senectae, 0 before or 2 intervening
+    q = "[labunt* [pietas [rug?s senec*]!~0,2 ]~4 adferet]~5";
+    testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+    // not near query for rugis senectae, 0 before or 3 intervening
+    q = "[labunt* [pietas [rug?s senec*]!~0,3 ]~4 adferet]~2";
+    countSpansDocs(p, q, 0, 0);
+
+    // directionality specified
+    q = "[labunt* [pietas [rug?s senec*]~>2 ]~>4 adferet]~>2";
+    testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+    // no directionality, query order inverted
+    q = "[adferet [ [senec* rug?s ]~2 pietas ]~4 labunt*]~2";
+    testOffsetForSingleSpanMatch(p, q, 5, 4, 14);
+
+    // more than one word intervenes btwn rugis and senectae
+    q = "[labunt* [pietas [rug?s senec*]~1 ]~4 adferet]~2";
+    countSpansDocs(p, q, 0, 0);
+
+    // more than one word intervenes btwn labuntur and pietas
+    q = "[labunt* [pietas [rug?s senec*]~2 ]~4 adferet]~1";
+    countSpansDocs(p, q, 0, 0);
+  }
+
+  /** Parses s and verifies both the expected span count and doc count. */
+  private void countSpansDocs(SpanQueryParser p, String s, int spanCount,
+      int docCount) throws Exception {
+    SpanQuery query = p.parse(s);
+    assertEquals("spanCount: " + s, spanCount, countSpans(query));
+    assertEquals("docCount: " + s, docCount, countDocs(query));
+  }
+
+  /** Rewrites q against the single leaf reader and counts matching spans. */
+  private long countSpans(SpanQuery q) throws Exception {
+    List<AtomicReaderContext> ctxs = reader.leaves();
+    assert ctxs.size() == 1;
+    AtomicReaderContext ctx = ctxs.get(0);
+    q = (SpanQuery) q.rewrite(ctx.reader());
+    Spans spans = q.getSpans(ctx, null, new HashMap<Term, TermContext>());
+
+    long count = 0;
+    while (spans.next()) {
+      count++;
+    }
+    return count;
+  }
+
+  /**
+   * Rewrites q, walks its spans and counts distinct matching documents;
+   * cross-checks the result against a regular searcher.
+   */
+  private long countDocs(SpanQuery q) throws Exception {
+    OpenBitSet docs = new OpenBitSet();
+    List<AtomicReaderContext> ctxs = reader.leaves();
+    assert ctxs.size() == 1;
+    AtomicReaderContext ctx = ctxs.get(0);
+    IndexReaderContext parentCtx = reader.getContext();
+    q = (SpanQuery) q.rewrite(ctx.reader());
+
+    Set<Term> qTerms = new HashSet<Term>();
+    q.extractTerms(qTerms);
+    Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
+    for (Term t : qTerms) {
+      termContexts.put(t, TermContext.build(parentCtx, t));
+    }
+
+    Spans spans = q.getSpans(ctx, null, termContexts);
+    while (spans.next()) {
+      docs.set(spans.doc());
+    }
+    long spanDocHits = docs.cardinality();
+
+    // double check with a regular searcher
+    TotalHitCountCollector coll = new TotalHitCountCollector();
+    searcher.search(q, coll);
+    assertEquals(coll.getTotalHits(), spanDocHits);
+    return spanDocHits;
+  }
+
+  /** Asserts that q matches exactly one span and checks its doc/start/end. */
+  private void testOffsetForSingleSpanMatch(SpanQueryParser p, String s,
+      int trueDocID, int trueSpanStart, int trueSpanEnd) throws Exception {
+    SpanQuery q = p.parse(s);
+    List<AtomicReaderContext> ctxs = reader.leaves();
+    assert ctxs.size() == 1;
+    AtomicReaderContext ctx = ctxs.get(0);
+    q = (SpanQuery) q.rewrite(ctx.reader());
+    Spans spans = q.getSpans(ctx, null, new HashMap<Term, TermContext>());
+
+    int hits = 0;
+    int spanStart = -1;
+    int spanEnd = -1;
+    int docID = -1;
+    while (spans.next()) {
+      spanStart = spans.start();
+      spanEnd = spans.end();
+      docID = spans.doc();
+      hits++;
+    }
+    assertEquals("should only be one matching span", 1, hits);
+    assertEquals("doc id", trueDocID, docID);
+    assertEquals("span start", trueSpanStart, spanStart);
+    assertEquals("span end", trueSpanEnd, spanEnd);
+  }
+
+  /**
+   * tests the parser's ability to correctly identify and build an individual
+   * single/multi-term query
+   */
+  @Test
+  public void testQueryTermTypeParserBasic() throws Exception {
+    // use the test's random() so failures are reproducible from the seed
+    Analyzer analyzer = new MockAnalyzer(random());
+    SpanQueryParser p = new SpanQueryParser(FIELD, analyzer);
+    SpanQuery q = p.buildAnyTermQuery("/f.*/");
+    Term t = new Term(FIELD, "f.*");
+    SpanQuery ex = new SpanMultiTermQueryWrapper<RegexpQuery>(
+        new RegexpQuery(t));
+    assertEquals("regexp", ex, q);
+
+    q = p.buildAnyTermQuery("fox");
+    t = new Term(FIELD, "fox");
+    ex = new SpanTermQuery(t);
+    assertEquals("basic term", ex, q);
+
+    p.setFuzzyMinSim(0.6f);
+    q = p.buildAnyTermQuery("fox~0.8");
+    t = new Term(FIELD, "fox");
+    ex = new SpanMultiTermQueryWrapper<SlowFuzzyQuery>(new SlowFuzzyQuery(t,
+        0.8f));
+    assertEquals("fuzzy", ex.toString(), q.toString());
+
+    // test rounding for fuzzy > 1.0
+    p.setFuzzyMaxEdits(4);
+    q = p.buildAnyTermQuery("fox~3.3");
+    t = new Term(FIELD, "fox");
+    ex = new SpanMultiTermQueryWrapper<SlowFuzzyQuery>(new SlowFuzzyQuery(t,
+        3.0f));
+    assertEquals("fuzzy", ex.toString(), q.toString());
+
+    q = p.buildAnyTermQuery("fo*");
+    t = new Term(FIELD, "fo");
+    assertEquals("prefix *",
+        new SpanMultiTermQueryWrapper<PrefixQuery>(new PrefixQuery(t)), q);
+
+    q = p.buildAnyTermQuery("fo?");
+    t = new Term(FIELD, "fo?");
+    assertEquals("prefix looking ?, but actually wildcard",
+        new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(t)), q);
+
+    q = p.buildAnyTermQuery("f*x");
+    t = new Term(FIELD, "f*x");
+    assertEquals("wildcard *",
+        new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(t)), q);
+
+    q = p.buildAnyTermQuery("f?x");
+    t = new Term(FIELD, "f?x");
+    assertEquals("wildcard ?",
+        new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(t)), q);
+
+    q = p.buildAnyTermQuery("f?x*");
+    t = new Term(FIELD, "f?x*");
+    assertEquals("wild card * and ?",
+        new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(t)), q);
+
+    boolean exc = false;
+    try {
+      p.buildAnyTermQuery("f*x~0.8");
+    } catch (ParseException e) {
+      if (e.getMessage().equals(
+          "Can't have a single term in a query that is both a wildcard and a fuzzy query")) {
+        exc = true;
+      }
+    }
+    assertTrue(exc);
+  }
+
+  /**
+   * tests the parser's ability to correctly identify and build an individual
+   * single/multi-term query with escaped characters
+   */
+  @Test
+  public void testQueryTermTypeParserEscapes() throws Exception {
+    // use the test's random() so failures are reproducible from the seed
+    Analyzer analyzer = new MockAnalyzer(random());
+    SpanQueryParser p = new SpanQueryParser(FIELD, analyzer);
+
+    SpanQuery q = p.buildAnyTermQuery("fox\\~0.8");
+    Term t = new Term(FIELD, "fox\\~0.8");
+    assertEquals("fuzzy escaped, actually term", new SpanTermQuery(t), q);
+
+    q = p.buildAnyTermQuery("f\\?x*");
+    t = new Term(FIELD, "f\\?x");
+    assertEquals("actually prefix",
+        new SpanMultiTermQueryWrapper<PrefixQuery>(new PrefixQuery(t)), q);
+
+    q = p.buildAnyTermQuery("f\\?x");
+    t = new Term(FIELD, "f\\?x");
+    assertEquals("escaped ?", new SpanTermQuery(t), q);
+
+    q = p.buildAnyTermQuery("f\\*x");
+    t = new Term(FIELD, "f\\*x");
+    assertEquals("escaped *", new SpanTermQuery(t), q);
+  }
+
+  /**
+   * Mocks StandardAnalyzer's behavior of splitting Chinese characters
+   * (at least for these test cases) into individual tokens.
+   */
+  private final static class MockStandardTokenizerFilter extends TokenFilter {
+    // Only designed to handle test cases. You may need to modify this
+    // if adding new test cases. Note that position increment is hardcoded
+    // to be 1!!!
+    private final Pattern hackCJKPattern = Pattern
+        .compile("([\u5900-\u9899])|([\\p{InBasic_Latin}]+)");
+    // pending tokens split out of the current input token
+    private final List<String> buffer = new LinkedList<String>();
+
+    private final CharTermAttribute termAtt;
+    private final PositionIncrementAttribute posIncrAtt;
+
+    public MockStandardTokenizerFilter(TokenStream in) {
+      super(in);
+      termAtt = addAttribute(CharTermAttribute.class);
+      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      if (!buffer.isEmpty()) {
+        // emit the next buffered token from a previously split input token
+        termAtt.setEmpty().append(buffer.remove(0));
+        posIncrAtt.setPositionIncrement(1);
+        return true;
+      }
+      if (!input.incrementToken()) {
+        return false;
+      }
+      String text = termAtt.toString();
+      Matcher m = hackCJKPattern.matcher(text);
+      boolean hasCJK = false;
+      while (m.find()) {
+        if (m.group(1) != null) {
+          hasCJK = true;
+          buffer.add(m.group(1));
+        } else if (m.group(2) != null) {
+          buffer.add(m.group(2));
+        }
+      }
+      if (!hasCJK) {
+        // don't change the position increment; the super class will handle
+        // stop words properly
+        buffer.clear();
+        return true;
+      }
+      if (!buffer.isEmpty()) {
+        termAtt.setEmpty().append(buffer.remove(0));
+        posIncrAtt.setPositionIncrement(1);
+      }
+      return true;
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      // drop any tokens left over from a previous pass through the stream
+      buffer.clear();
+    }
+  }
+}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserRewriteMethodTest.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserRewriteMethodTest.java (revision 0)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/span/SpanQueryParserRewriteMethodTest.java (revision 0)
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span;
+
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.queryparser.span.SpanQueryParser;
+import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.TotalHitCountCollector;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util._TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class SpanQueryParserRewriteMethodTest extends LuceneTestCase {
+
+ private static IndexReader reader;
+ private static IndexSearcher searcher;
+ private static Directory directory;
+ private static Analyzer stopAnalyzer;
+ private static final String FIELD = "field";
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+
+ stopAnalyzer = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE,
+ true);
+ TokenFilter filter = new MockTokenFilter(tokenizer,
+ MockTokenFilter.ENGLISH_STOPSET);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+ directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, stopAnalyzer)
+ .setMaxBufferedDocs(_TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+ String[] docs = new String[] { "aaaaaaaaaaaaaaaaaaaaaaa",
+ "abaaaaaaaaaaaaaaaaaaaaa", "aabaaaaaaaaaaaaaaaaaaaa",
+ "aaabaaaaaaaaaaaaaaaaaaa", "aaaabaaaaaaaaaaaaaaaaaa",
+ "aaaaabaaaaaaaaaaaaaaaaa", "aaaaaabaaaaaaaaaaaaaaaa",
+ "aaaaaaabaaaaaaaaaaaaaaa", "aaaaaaaabaaaaaaaaaaaaaa",
+ "aaaaaaaaabaaaaaaaaaaaaa", "aaaaaaaaaabaaaaaaaaaaaa",
+ "aaaaaaaaaaabaaaaaaaaaaa", "aaaaaaaaaaaabaaaaaaaaaa",
+ "aaaaaaaaaaaaabaaaaaaaaa", "aaaaaaaaaaaaaabaaaaaaaa",
+ "aaaaaaaaaaaaaaabaaaaaaa", "aaaaaaaaaaaaaaaabaaaaaa", };
+ for (int i = 0; i < docs.length; i++) {
+ Document doc = new Document();
+ doc.add(newTextField(FIELD, docs[i], Field.Store.YES));
+ writer.addDocument(doc);
+ }
+ reader = writer.getReader();
+ searcher = new IndexSearcher(reader);
+ writer.close();
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+ reader.close();
+ directory.close();
+ reader = null;
+ directory = null;
+ stopAnalyzer = null;
+ }
+
+ @Test
+ public void testBasic() throws Exception {
+
+ SpanQueryParser p = new SpanQueryParser(FIELD, stopAnalyzer);
+ int maxExpansions = 5;
+ // this works on prefix, wildcard, fuzzy and regex
+ // it has no effect on max number of boolean clauses in SpanOr
+ p.setMultiTermRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(
+ 5));
+
+ countSpansDocs(p, "a*", 5, 5);
+ countSpansDocs(p, "a*a", 5, 5);
+ countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa~1", 5, 5);
+ countSpansDocs(p, "/a.*/", 5, 5);
+ countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa " + "abaaaaaaaaaaaaaaaaaaaaa "
+ + "aabaaaaaaaaaaaaaaaaaaaa " + "aaabaaaaaaaaaaaaaaaaaaa "
+ + "aaaabaaaaaaaaaaaaaaaaaa " + "aaaaabaaaaaaaaaaaaaaaaa "
+ + "aaaaaabaaaaaaaaaaaaaaaa " + "aaaaaaabaaaaaaaaaaaaaaa "
+ + "aaaaaaaabaaaaaaaaaaaaaa " + "aaaaaaaaabaaaaaaaaaaaaa", 10, 10);
+
+ // this has no effect whatsoever as of this writing.
+ p.setMultiTermRewriteMethod(new SpanMultiTermQueryWrapper.TopTermsSpanBooleanQueryRewrite(
+ maxExpansions));
+
+ countSpansDocs(p, "a*", 17, 17);
+ countSpansDocs(p, "a*a", 17, 17);
+
+ countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa~1", 17, 17);
+ countSpansDocs(p, "/a.*/", 17, 17);
+ countSpansDocs(p, "aaaaaaaaaaaaaaaaaaaaaaa " + "abaaaaaaaaaaaaaaaaaaaaa "
+ + "aabaaaaaaaaaaaaaaaaaaaa " + "aaabaaaaaaaaaaaaaaaaaaa "
+ + "aaaabaaaaaaaaaaaaaaaaaa " + "aaaaabaaaaaaaaaaaaaaaaa "
+ + "aaaaaabaaaaaaaaaaaaaaaa " + "aaaaaaabaaaaaaaaaaaaaaa "
+ + "aaaaaaaabaaaaaaaaaaaaaa " + "aaaaaaaaabaaaaaaaaaaaaa", 10, 10);
+
+ }
+
+ private void countSpansDocs(SpanQueryParser p, String s, int spanCount,
+ int docCount) throws Exception {
+ SpanQuery q = p.parse(s);
+ assertEquals("spanCount: " + s, spanCount, countSpans(q));
+ assertEquals("docCount: " + s, docCount, countDocs(q));
+
+ }
+
+ private long countSpans(SpanQuery q) throws Exception {
+ List ctxs = reader.leaves();
+ assert (ctxs.size() == 1);
+ AtomicReaderContext ctx = ctxs.get(0);
+ q = (SpanQuery) q.rewrite(ctx.reader());
+
+ Spans spans = q.getSpans(ctx, null, new HashMap());
+
+ long i = 0;
+ while (spans.next()) {
+ i++;
+ }
+ return i;
+ }
+
+ private long countDocs(SpanQuery q) throws Exception {
+ OpenBitSet docs = new OpenBitSet();
+ List ctxs = reader.leaves();
+ assert (ctxs.size() == 1);
+ AtomicReaderContext ctx = ctxs.get(0);
+ IndexReaderContext parentCtx = reader.getContext();
+ q = (SpanQuery) q.rewrite(ctx.reader());
+
+ Set qTerms = new HashSet();
+ q.extractTerms(qTerms);
+ Map termContexts = new HashMap();
+
+ for (Term t : qTerms) {
+ TermContext c = TermContext.build(parentCtx, t);
+ termContexts.put(t, c);
+ }
+
+ Spans spans = q.getSpans(ctx, null, termContexts);
+
+ while (spans.next()) {
+ docs.set(spans.doc());
+ }
+ long spanDocHits = docs.cardinality();
+ // double check with a regular searcher
+ TotalHitCountCollector coll = new TotalHitCountCollector();
+ searcher.search(q, coll);
+ assertEquals(coll.getTotalHits(), spanDocHits);
+ return spanDocHits;
+
+ }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfoBuilder.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfoBuilder.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfoBuilder.java (revision 0)
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryparser.span.clauses;
+
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfo.TYPE;
+
+/**
+ * Used internally by SpanQueryParserUtil to build a clause
+ *
+ */
+public class ClauseInfoBuilder {
+
+ public static ClauseInfo build(ClauseInfo.START_OR_END startOrEnd, TYPE type,
+ int start, int end) throws ParseException {
+ if (type.equals(TYPE.OR)) {
+ return new ClauseInfo(startOrEnd, start, end);
+ } else if (type.equals(TYPE.NEAR)) {
+ return new SpanNearClauseInfo(startOrEnd, start, end);
+ } else if (type.equals(TYPE.NOT_NEAR)) {
+ return new SpanNotNearClauseInfo(startOrEnd, start, end);
+ }
+ throw new ParseException(String.format(
+ "I'm sorry, but I don't recognize this type: %s", type));
+ }
+
+ public static ClauseInfo build(ClauseInfo.START_OR_END startOrEnd, int start,
+ int end, int slop, boolean inOrder) {
+ return new SpanNearClauseInfo(startOrEnd, start, end, slop, inOrder);
+ }
+
+ public static ClauseInfo build(ClauseInfo.START_OR_END startOrEnd, int start,
+ int end, int pre, int post) {
+ return new SpanNotNearClauseInfo(startOrEnd, start, end, pre, post);
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNearClauseInfo.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNearClauseInfo.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNearClauseInfo.java (revision 0)
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryparser.span.clauses;
+
+public class SpanNearClauseInfo extends ClauseInfo {
+ private final static TYPE type = TYPE.NEAR;
+ private final static int DEFAULT_SLOP = 0;
+ private final static boolean DEFAULT_INORDER = true;
+
+ private final boolean inOrder;
+ private final int slop;
+
+ public SpanNearClauseInfo(START_OR_END which, int start, int end) {
+ super(which, start, end);
+ this.inOrder = DEFAULT_INORDER;
+ this.slop = DEFAULT_SLOP;
+ }
+
+ public SpanNearClauseInfo(START_OR_END which, int start, int end, int slop,
+ boolean inOrder) {
+ super(which, start, end);
+ this.slop = slop;
+ this.inOrder = inOrder;
+ }
+
+ public int getSlop() {
+ return slop;
+ }
+
+ public boolean getInOrder() {
+ return inOrder;
+ }
+
+ public TYPE getType() {
+ return type;
+ }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNotNearClauseInfo.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNotNearClauseInfo.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/SpanNotNearClauseInfo.java (revision 0)
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryparser.span.clauses;
+
+public class SpanNotNearClauseInfo extends ClauseInfo {
+ private final static int DEFAULT_BEFORE = 0;
+ private final static int DEFAULT_AFTER = 0;
+
+ private final static TYPE type = TYPE.NOT_NEAR;
+
+ private final int before;
+ private final int after;
+
+ public SpanNotNearClauseInfo(START_OR_END which, int start, int end) {
+ super(which, start, end);
+ this.before = DEFAULT_BEFORE;
+ this.after = DEFAULT_AFTER;
+ }
+
+ public SpanNotNearClauseInfo(START_OR_END which, int start, int end, int slop) {
+ super(which, start, end);
+ this.before = slop;
+ this.after = slop;
+ }
+
+ public SpanNotNearClauseInfo(START_OR_END which, int start, int end,
+ int before, int after) {
+ super(which, start, end);
+ this.before = before;
+ this.after = after;
+ }
+
+ public int getBefore() {
+ return before;
+ }
+
+ public int getAfter() {
+ return after;
+ }
+
+ public TYPE getType() {
+ return type;
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfo.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfo.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/clauses/ClauseInfo.java (revision 0)
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span.clauses;
+
+/**
+ * Type of clause: or, near or not near
+ *
+ */
/**
 * A clause marker recorded during parsing: whether it opens or closes a
 * clause, its character offsets, and the clause type (or, near, not near).
 * This base class represents an OR clause; subclasses override
 * {@link #getType()}.
 */
public class ClauseInfo {
  /** Whether this marker opens or closes a clause. */
  public enum START_OR_END {
    START, END
  };

  /** Type of clause: or, near or not near. */
  public enum TYPE {
    OR, // spanor
    NEAR, // spannear
    NOT_NEAR // spannotnear
  };

  private final int start;
  private final int end;
  private final START_OR_END startOrEnd;
  private final static TYPE type = TYPE.OR;

  public ClauseInfo(START_OR_END which, int start, int end) {
    this.startOrEnd = which;
    this.start = start;
    this.end = end;
  }

  public TYPE getType() {
    return type;
  }

  public int getEnd() {
    return end;
  }

  public int getStart() {
    return start;
  }

  public START_OR_END getStartOrEnd() {
    return startOrEnd;
  }

  /**
   * Whether a clause opened with {@code startType} may be closed by a marker
   * of {@code endType}. This is effectively directional: a NEAR opener may be
   * closed by a NOT_NEAR closer, but not the other way around.
   */
  public static boolean matchTypes(TYPE startType, TYPE endType) {
    return startType.equals(endType)
        || (startType.equals(TYPE.NEAR) && endType.equals(TYPE.NOT_NEAR));
  }

  /**
   * Whether {@code openMarker} and {@code closeMarker} form a valid
   * open/close pair: open must be a START, close an END, with compatible
   * types per {@link #matchTypes(TYPE, TYPE)}.
   */
  public static boolean matchOpenClose(ClauseInfo openMarker,
      ClauseInfo closeMarker) {
    return openMarker.getStartOrEnd().equals(START_OR_END.START)
        && closeMarker.getStartOrEnd().equals(START_OR_END.END)
        && matchTypes(openMarker.getType(), closeMarker.getType());
  }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserBase.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserBase.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserBase.java (revision 0)
@@ -0,0 +1,899 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfo;
+import org.apache.lucene.queryparser.span.clauses.SpanNearClauseInfo;
+import org.apache.lucene.queryparser.span.clauses.SpanNotNearClauseInfo;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfo.TYPE;
+import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+/**
+ * Following the lead of {@link QueryParserBase}, this separates most of the
+ * functionality for creating a {@link SpanQuery} from the actual parser. This
+ * should allow for easy additions of niftier jj or antlr or ?? parsers than the
+ * current SpanQueryParser.
+ *
+ */
+public abstract class SpanQueryParserBase {
+
  // Matches either an escaped character or a run of ?/* wildcard characters;
  // used when analyzing wildcard terms.
  private final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");

  // How to normalize multiterm (prefix/wildcard/fuzzy/regex) terms.
  private NormMultiTerm normMultiTerm = NormMultiTerm.LOWERCASE;
  private MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
  private boolean allowLeadingWildcard = false;

  private Analyzer analyzer;
  private String field;
  private int phraseSlop = 0;
  private float fuzzyMinSim = 0.6f;// instead of FuzzyQuery.defaultMinSimilarity
  // 2.0f;
  private int fuzzyMaxEdits = FuzzyQuery.defaultMaxEdits;
  private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
  // Caps applied to user-supplied slop values in near / not-near clauses.
  private int spanNearMaxDistance = 100;
  private int spanNotNearMaxDistance = 50;
  // If true, hitting a stop word in a term raises ParseException instead of
  // silently dropping it.
  private boolean throwExceptionForStopWord = false;
  private Locale locale = Locale.getDefault();

  boolean autoGeneratePhraseQueries = true;
+
  /**
   * Initializes a span query parser. Called by the SpanQueryParser constructor.
   *
   * @param f
   *          the field for query terms.
   * @param a
   *          used to find (and normalize) terms in the query text.
   */
  public void init(String f, Analyzer a) {
    analyzer = a;
    field = f;
  }

  /**
   * Parsers must implement this.
   *
   * @param s
   *          the query string to parse
   * @return {@link SpanQuery} or null if nothing could be parsed
   * @throws ParseException
   *           if the string cannot be parsed
   */
  public abstract SpanQuery parse(String s) throws ParseException;

  /**
   * Escapes query syntax characters. Not currently called by the parser;
   * delegates to {@link QueryParserBase#escape(String)}.
   *
   * @param s
   *          raw string to escape
   * @return the escaped string
   */
  public static String escape(String s) {
    return QueryParserBase.escape(s);
  }
+
  /**
   * @see #setMultiTermRewriteMethod(org.apache.lucene.search.MultiTermQuery.RewriteMethod)
   * @return the rewrite method used for multiterm queries
   */
  public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() {
    return multiTermRewriteMethod;
  }

  /**
   *
   * By default QueryParser uses
   * {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
   * when creating a {@link PrefixQuery}, {@link WildcardQuery},
   * {@link TermRangeQuery}, and {@link RegexpQuery}. This implementation is
   * generally preferable because it a) Runs faster b) Does not have the
   * scarcity of terms unduly influence score c) avoids any
   * {@link TooManyClauses} exception.
   *
   * To set the max number of rewrites to a number higher than the default, use:
   * MultiTermQuery.TopTermsScoringBooleanQueryRewrite(x) or similar.
   *
   * Beware: as of this writing,
   * SpanMultiTermQueryWrapper.TopTermsSpanBooleanQueryRewrite(maxExpansions)
   * has no effect.
   */
  public void setMultiTermRewriteMethod(
      MultiTermQuery.RewriteMethod multiTermRewriteMethod) {
    this.multiTermRewriteMethod = multiTermRewriteMethod;
  }

  /**
   * @see #setAllowLeadingWildcard(boolean)
   * @return whether a leading wildcard character is allowed
   */
  public boolean getAllowLeadingWildcard() {
    return allowLeadingWildcard;
  }

  /**
   * Set to true to allow leading wildcard characters.
   *
   * When set, * or ? are allowed as the first
   * character of a PrefixQuery and WildcardQuery. Note that this can produce
   * very slow queries on big indexes.
   *
   * Default: false.
   */
  public void setAllowLeadingWildcard(boolean allowLeadingWildcard) {
    this.allowLeadingWildcard = allowLeadingWildcard;
  }

  /**
   * @see #setAnalyzer(Analyzer)
   * @return the analyzer used to normalize query terms
   */
  public Analyzer getAnalyzer() {
    return analyzer;
  }

  /**
   *
   * @param analyzer
   *          set analyzer to use in parser
   */
  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  /**
   * @see #setField(String)
   * @return the default field queried
   */
  public String getField() {
    return field;
  }

  /**
   *
   * @param field
   *          which field to parse
   */
  public void setField(String field) {
    this.field = field;
  }

  /**
   * @see #setPhraseSlop(int)
   * @return the default slop for phrases
   */
  public int getPhraseSlop() {
    return phraseSlop;
  }

  /**
   *
   * @param phraseSlop
   *          default slop to use in phrases
   */
  public void setPhraseSlop(int phraseSlop) {
    this.phraseSlop = phraseSlop;
  }
+
  /**
   * @see #setFuzzyMinSim(float)
   * @return the minimum similarity for fuzzy queries with a value below 1.0f
   */
  public float getFuzzyMinSim() {
    return fuzzyMinSim;
  }

  /**
   * For a fuzzy query, if the fuzzy value is &lt; 1.0f this will be
   * the minimum allowable similarity. For example, if this is set to 0.8f and a
   * query of salmonella~0.6 is parsed, the resulting query will be for
   * "salmonella" with a fuzziness of 0.8f. Default is 0.6f.
   *
   * However, if the fuzzy value is &gt;= 1.0f, then @see
   * #setFuzzyMaxEdits(int).
   *
   * @param fuzzyMinSim
   *          minimum similarity for fuzzy queries
   */
  public void setFuzzyMinSim(float fuzzyMinSim) {
    this.fuzzyMinSim = fuzzyMinSim;
  }

  /**
   * @see #setFuzzyPrefixLength(int)
   * @return the prefix length used in fuzzy queries
   */
  public int getFuzzyPrefixLength() {
    return fuzzyPrefixLength;
  }

  /**
   *
   * @param fuzzyPrefixLength
   *          prefix length to use in fuzzy queries. Default is 0.
   */
  public void setFuzzyPrefixLength(int fuzzyPrefixLength) {
    this.fuzzyPrefixLength = fuzzyPrefixLength;
  }

  /**
   * @see #setFuzzyMaxEdits(int)
   * @return the maximum number of edits allowed in a fuzzy query
   */
  public int getFuzzyMaxEdits() {
    return fuzzyMaxEdits;
  }

  /**
   * Maximum number of edits allowed in a fuzzy query. BEWARE: if this is
   * set to anything greater than 2, you'll be out of Automaton land and into
   * brute force land (vintage Lucene &lt;=3.x). This could wreak serious
   * performance problems.
   *
   * @param fuzzyMaxEdits
   *          maximum number of edits for fuzzy queries
   */
  public void setFuzzyMaxEdits(int fuzzyMaxEdits) {
    this.fuzzyMaxEdits = fuzzyMaxEdits;
  }

  /**
   * @see #setLocale(Locale)
   * @return the locale used for lowercasing
   */
  public Locale getLocale() {
    return locale;
  }

  /**
   * So far, only used in lowercasing of multiterm queries.
   *
   * @param locale
   *          locale used for lowercasing
   */
  public void setLocale(Locale locale) {
    this.locale = locale;
  }

  // Whether an apparently single term that the analyzer splits into several
  // is treated as an ordered phrase (true) or an OR of its parts (false).
  public boolean getAutoGeneratePhraseQueries() {
    return autoGeneratePhraseQueries;
  }

  public void setAutoGeneratePhraseQueries(boolean b) {
    autoGeneratePhraseQueries = b;
  }
+
+ /**
+ * When the parser comes across a simple single term, it runs the term through
+ * the analyzer. This is called by
+ * {@link #buildSingleTermQuery(String, String). Override this for custom
+ * handling. In whitespace languages, the returned array will most likely have
+ * a length of 1. It can have a length of 0 if a stop word was passed in as
+ * termText; The array may contain null values if the {@link Analyzer} breaks
+ * the apparently single term into multiple terms and there is a stopword. To
+ * identify stop words, the Analyzer must have a
+ * {@link PositionIncrementAttribute}. If it doesn't, this will silently hide
+ * the nulls and not add them to the array.
+ *
+ * @param termText
+ * apparently simple single term
+ * @return array of {@link String} for the parts that the analyzer broke this
+ * into.
+ *
+ */
+ protected String[] analyzeSimpleSingleTerm(String termText)
+ throws ParseException {
+ List chunks = new LinkedList();
+ try {
+ TokenStream ts = analyzer.tokenStream(field, termText);
+ ts.reset();
+ CharTermAttribute cAttr = ts.getAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posAttr = null;
+ if (ts.hasAttribute(PositionIncrementAttribute.class)) {
+ posAttr = ts.getAttribute(PositionIncrementAttribute.class);
+ }
+ while (ts.incrementToken()) {
+ chunks.add(cAttr.toString());
+ if (posAttr != null) {
+ for (int i = 1; i < posAttr.getPositionIncrement(); i++) {
+ chunks.add(null);
+ }
+ }
+ }
+ ts.end();
+ ts.close();
+ } catch (IOException e) {
+ throw new ParseException(
+ String.format(getLocale(),
+ "IOException while trying to parse %s : %s", termText,
+ e.getMessage()));
+ }
+
+ return chunks.toArray(new String[chunks.size()]);
+ }
+
+ protected SpanQuery buildRegexTermQuery(String field, String termText) {
+ RegexpQuery regexQuery = new RegexpQuery(new Term(field, termText));
+ regexQuery.setRewriteMethod(multiTermRewriteMethod);
+ return new SpanMultiTermQueryWrapper(regexQuery);
+ }
+
+ /**
+ *
+ * @param field
+ * @param termText
+ * @param slop
+ * if < 1.0f, this will be treated as the old minSim, if >= 1.0f,
+ * this will be rounded and treated as maxEdits
+ * @return
+ * @throws ParseException
+ */
+ protected SpanQuery buildFuzzyTermQuery(String field, String termText,
+ float slop) throws ParseException {
+ String normalized = termText;
+
+ switch (normMultiTerm) {
+ case NO_NORM:
+ break;
+ case LOWERCASE:
+ normalized = normalized.toLowerCase(getLocale());
+ break;
+ case ANALYZE:
+ normalized = analyzeSingleChunk(field, termText, normalized);
+ break;
+ }
+
+ if (slop == 0.0f) {
+ return new SpanTermQuery(new Term(field, normalized));
+ }
+ // if the user enters 2.4 for example, round it so that there won't be an
+ // illegalparameter exception
+ if (slop >= 1.0f) {
+ slop = (float) Math.round(slop);
+ }
+
+ // set the max slop
+ if (slop < 1.0f && slop < fuzzyMinSim) {
+ slop = fuzzyMinSim;
+ } else if (slop > 1.0f && slop > fuzzyMaxEdits) {
+ slop = fuzzyMaxEdits;
+ }
+ // SlowFuzzyQuery defaults to the Automaton if maxEdits is <= 2.
+ // We don't have to reinvent that wheel.
+ SlowFuzzyQuery fuzzyQuery = new SlowFuzzyQuery(new Term(field, normalized),
+ slop, fuzzyPrefixLength);
+ fuzzyQuery.setRewriteMethod(multiTermRewriteMethod);
+
+ return new SpanMultiTermQueryWrapper(fuzzyQuery);
+
+ }
+
+ /**
+ * @param field
+ * @param termText
+ * should literally be the prefix. It should not end with *.
+ * @return
+ * @throws ParseException
+ */
+ protected SpanQuery buildPrefixQuery(String field, String termText)
+ throws ParseException {
+ // could happen with a simple * query
+ testLeadingWildcard(termText);
+
+ String normalized = termText;
+
+ switch (normMultiTerm) {
+ case NO_NORM:
+ break;
+ case LOWERCASE:
+ normalized = normalized.toLowerCase(locale);
+ break;
+ case ANALYZE:
+ normalized = analyzeSingleChunk(field, termText, normalized);
+ break;
+ }
+
+ PrefixQuery query = new PrefixQuery(new Term(field, normalized));
+ query.setRewriteMethod(multiTermRewriteMethod);
+ return new SpanMultiTermQueryWrapper(query);
+
+ }
+
+ protected SpanQuery buildWildcardQuery(String field, String termText)
+ throws ParseException {
+ testLeadingWildcard(termText);
+ String normalized = termText;
+
+ switch (normMultiTerm) {
+ case NO_NORM:
+ break;
+ case LOWERCASE:
+ normalized = normalized.toLowerCase(locale);
+ break;
+ case ANALYZE:
+ normalized = analyzeWildcard(field, termText);
+ break;
+ }
+ WildcardQuery wildcardQuery = new WildcardQuery(new Term(field, normalized));
+ wildcardQuery.setRewriteMethod(multiTermRewriteMethod);
+ return new SpanMultiTermQueryWrapper(wildcardQuery);
+
+ }
+
+ /**
+ * build what appears to be a simple single term query. If the analyzer breaks
+ * it into multiple terms, treat that as a "phrase" or as an "or" depending on
+ * the value of {@link #autoGeneratePhraseQueries}.
+ *
+ * @param field
+ * @param termText
+ * @return
+ * @throws ParseException
+ */
+ protected SpanQuery buildSingleTermQuery(String field, String termText)
+ throws ParseException {
+ String[] terms = analyzeSimpleSingleTerm(termText);
+ if (terms.length == 0) {
+ if (throwExceptionForStopWord) {
+ throw new ParseException(
+ String
+ .format(
+ "No terms returned after I tried to normalize what I thought was a single term: %s",
+ termText));
+ } else {
+ return getEmptyQuery();
+ }
+ } else if (terms.length == 1) {
+ return new SpanTermQuery(new Term(field, terms[0]));
+ } else {
+ // if the analyzer broke this into more than one term,
+ // treat it as a boolean query or as a
+ // span near query with no slop and in order
+ // depending on the value of autoGeneratePhraseQueries
+
+ List nonEmpties = new LinkedList();
+ for (String piece : terms) {
+ if (piece != null) {
+ nonEmpties.add(new SpanTermQuery(new Term(field, piece)));
+ } else if (piece == null && throwExceptionForStopWord) {
+ throw new ParseException("Stop word found in " + termText);
+ }
+ }
+
+ if (nonEmpties.size() == 0) {
+ return getEmptyQuery();
+ }
+ if (nonEmpties.size() == 1) {
+ return nonEmpties.get(0);
+ }
+
+ SpanQuery[] queries = nonEmpties
+ .toArray(new SpanQuery[nonEmpties.size()]);
+ if (getAutoGeneratePhraseQueries() == true) {
+ return new SpanNearQuery(queries, 0, true);
+ } else {
+ return new SpanOrQuery(queries);
+ }
+ }
+ }
+
+ /**
+ * Does this start with a wildcard and is that allowed?
+ *
+ * @param termText
+ * @throws ParseException
+ */
+ private void testLeadingWildcard(String termText) throws ParseException {
+ if (allowLeadingWildcard == false
+ && (termText.startsWith("*") || termText.startsWith("?"))) {
+ throw new ParseException(
+ "'*' or '?' not allowed as first character in WildcardQuery with current configuration.");
+ }
+ }
+
+ /**
+ * Will build a SpanQuery clause from components and the ClauseInfo
+ *
+ * @param clauses
+ * @param clauseInfo
+ * @return
+ * @throws ParseException
+ */
+ protected SpanQuery buildQuery(List clauses, ClauseInfo clauseInfo)
+ throws ParseException {
+
+ if (clauseInfo.getType().equals(TYPE.OR)) {
+ return buildSpanOrQuery(clauses);
+ } else if (clauseInfo.getType().equals(TYPE.NEAR)) {
+ SpanNearClauseInfo tmp = (SpanNearClauseInfo) clauseInfo;
+ return buildSpanNearQuery(clauses, tmp.getSlop(), tmp.getInOrder());
+ } else if (clauseInfo.getType().equals(TYPE.NOT_NEAR)) {
+ SpanNotNearClauseInfo tmp = (SpanNotNearClauseInfo) clauseInfo;
+ return buildSpanNotNearQuery(clauses, tmp.getBefore(), tmp.getAfter());
+ }
+ throw new ParseException(
+ String
+ .format("I don't know how to build a query for a clause of type: %s"
+ + clauseInfo.getType()));
+ }
+
+ /**
+ *
+ * @param clauses
+ * @return {@link SpanOrQuery} might be empty if clauses is null or contains
+ * only empty queries
+ */
+ protected SpanQuery buildSpanOrQuery(List clauses) {
+ if (clauses == null || clauses.size() == 0)
+ return getEmptyQuery();
+
+ List nonEmpties = removeEmpties(clauses);
+ if (nonEmpties.size() == 0) {
+ return getEmptyQuery();
+ }
+ if (nonEmpties.size() == 1)
+ return nonEmpties.get(0);
+
+ SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]);
+ return new SpanOrQuery(arr);
+
+ }
+
+ /**
+ *
+ * @param clauses
+ * @return {@link SpanOrQuery} or null if clauses is null or empty
+ */
+ protected SpanQuery buildSpanNearQuery(List clauses, int slop,
+ boolean inOrder) {
+ if (clauses == null || clauses.size() == 0)
+ return getEmptyQuery();
+
+ List nonEmpties = removeEmpties(clauses);
+
+ if (nonEmpties.size() == 0) {
+ return getEmptyQuery();
+ }
+
+ if (slop > spanNearMaxDistance) {
+ slop = spanNearMaxDistance;
+ }
+
+ if (nonEmpties.size() == 1) {
+ return specialHandlingForSpanNearWithOneComponent(nonEmpties.get(0),
+ slop, inOrder);
+ }
+
+ SpanQuery[] arr = nonEmpties.toArray(new SpanQuery[nonEmpties.size()]);
+ return new SpanNearQuery(arr, slop, inOrder);
+ }
+
+ /**
+ * This is meant to "fix" two cases that might be surprising to a
+ * non-whitespace language speaker. If a user entered, e.g. "\u5927\u5B66"~3,
+ * and {@link #autoGeneratePhraseQueries} is set to true, then the parser
+ * would treat this recursively and yield [[\u5927\u5B66]]~3. The user
+ * probably meant find those two characters within three words of each other,
+ * not find those right next to each other and that hit has to be within three
+ * words of nothing.
+ *
+ * If a user entered the same thing and {@link #autoGeneratePhraseQueries} is
+ * set to false, then the parser would treat this as [(\u5927\u5B66)]~3: find
+ * one character or the other and then that hit has to be within three words
+ * of nothing.
+ *
+ * This special handling does create incorrect handling for whitespace
+ * languages. [[quick fox]]~1 should only match on "quick fox" and it will now
+ * match on "fox green quick".
+ *
+ * The current method was chosen because the former use case is probably far
+ * more common than the latter.
+ *
+ * @param spanQuery
+ * this is the sole child of a SpanNearQuery
+ * @param slop
+ * slop from the parent
+ * @param inOrder
+ * inOrder from the parent
+ * @return
+ */
+ private SpanQuery specialHandlingForSpanNearWithOneComponent(
+ SpanQuery spanQuery, int slop, boolean inOrder) {
+
+ if (spanQuery instanceof SpanNearQuery && autoGeneratePhraseQueries == true) {
+ SpanNearQuery q = (SpanNearQuery) spanQuery;
+ if (q.isInOrder() && q.getSlop() == 0) {
+
+ SpanQuery[] children = q.getClauses();
+ // if the grandchildren aren't all SpanTermQueries
+ // then this can't be the edge case for the special handling
+ for (SpanQuery c : children) {
+ if (!(c instanceof SpanTermQuery)) {
+ return spanQuery;
+ }
+ }
+ return new SpanNearQuery(children, slop, inOrder);
+ }
+ } else if (spanQuery instanceof SpanOrQuery
+ && autoGeneratePhraseQueries == false) {
+ SpanOrQuery q = (SpanOrQuery) spanQuery;
+ SpanQuery[] children = q.getClauses();
+ for (SpanQuery c : children) {
+ if (!(c instanceof SpanTermQuery)) {
+ return spanQuery;
+ }
+ }
+ return new SpanNearQuery(children, slop, inOrder);
+ }
+ return spanQuery;
+ }
+
+ /**
+ *
+ * @param clauses
+ * @return {@link SpanOrQuery}
+ */
+ protected SpanQuery buildSpanNotNearQuery(List clauses, int pre,
+ int post) throws ParseException {
+ if (clauses.size() != 2) {
+ throw new ParseException(
+ String.format("SpanNotNear query must have two clauses. I count %d",
+ clauses.size()));
+ }
+ // if include is an empty query, treat this as just an empty query
+ if (isEmptyQuery(clauses.get(0))) {
+ return clauses.get(0);
+ }
+ // if exclude is an empty query, return include alone
+ if (isEmptyQuery(clauses.get(1))) {
+ return clauses.get(0);
+ }
+
+ if (pre > spanNotNearMaxDistance) {
+ pre = spanNotNearMaxDistance;
+ }
+ if (post > spanNotNearMaxDistance) {
+ post = spanNotNearMaxDistance;
+ }
+ return new SpanNotQuery(clauses.get(0), clauses.get(1), pre, post);
+ }
+
+ private List removeEmpties(List queries) {
+
+ List nonEmpties = new ArrayList();
+ for (SpanQuery q : queries) {
+ if (!isEmptyQuery(q)) {
+ nonEmpties.add(q);
+ }
+ }
+ return nonEmpties;
+ }
+
  /**
   * @see #setNormMultiTerm(NormMultiTerm)
   * @return how the parser normalizes/analyzes multiterm queries
   */
  public NormMultiTerm getNormMultiTerm() {
    return normMultiTerm;
  }
+
  /**
   * Set how the parser should analyze multiterms: {@link FuzzyQuery},
   * {@link RegexpQuery}, {@link WildcardQuery}.
   *
   *
   * Warning (copied from AnalyzingQueryParser):
   * {@link NormMultiTerm#ANALYZE} should only be used with analyzers that do
   * not use stopwords or that add tokens. Also, several stemming analyzers are
   * inappropriate: for example, GermanAnalyzer will turn
   * Häuser into hau, but H?user
   * will become h?user when using this parser and thus no match
   * would be found (i.e. using this parser will be no improvement over
   * {@link NormMultiTerm#LOWERCASE} in such cases).
   *
   * @param normMultiTerm normalization strategy for multiterm queries
   */
  public void setNormMultiTerm(NormMultiTerm normMultiTerm) {
    this.normMultiTerm = normMultiTerm;
  }
+
  /**
   * Sets an upper limit on the maximum distance for a not near query. If
   * {@link #spanNotNearMaxDistance} is set to 10, and a query of "foo bar"~20,8
   * is parsed, the returned query will be for a {@link SpanNotQuery} with a pre
   * value of 10 and a post value of 8. The default is 50.
   *
   * @param dist
   *          the max distance; must be &gt;= 0 (enforced only via assert)
   */
  public void setSpanNotNearMaxDistance(int dist) {
    assert (dist >= 0);
    spanNotNearMaxDistance = dist;
  }
+
  /**
   * @see #setSpanNotNearMaxDistance(int)
   * @return the maximum distance allowed for a not near query
   */
  public int getSpanNotNearMaxDistance() {
    return spanNotNearMaxDistance;
  }
+
  /**
   * @see #setSpanNearMaxDistance(int)
   * @return the maximum distance allowed for a phrase/near query
   */
  public int getSpanNearMaxDistance() {
    return spanNearMaxDistance;
  }
+
  /**
   * Sets an upper limit on the maximum distance for a phrase/near query. If
   * {@link #spanNearMaxDistance} is set to 10, and a query of "foo bar"~20 is
   * parsed, the returned query will be for a {@link SpanNearQuery} with a
   * distance of 10. The default is 100.
   *
   * @param dist
   *          the max distance; must be &gt;= 0 (enforced only via assert)
   */
  public void setSpanNearMaxDistance(int dist) {
    assert (dist >= 0);
    spanNearMaxDistance = dist;
  }
+
  /**
   * @see #setThrowExceptionForStopWord(boolean)
   * @return whether a stop word in the query triggers a ParseException
   */
  public boolean getThrowExceptionForStopWord() {
    return throwExceptionForStopWord;
  }
+
  /**
   * If a stopword is encountered during parsing, should the parser throw a
   * ParseException or silently ignore the stopword?
   *
   * @param toThrowOrNotToThrow
   *          true to throw a ParseException when a stop word is encountered
   */
  public void setThrowExceptionForStopWord(boolean toThrowOrNotToThrow) {
    throwExceptionForStopWord = toThrowOrNotToThrow;
  }
+
+ /**
+ * Returns the analyzed form for the given chunk
+ *
+ * If the analyzer produces more than one output token from the given chunk, a
+ * ParseException is thrown.
+ *
+ * @param field
+ * The target field
+ * @param termStr
+ * The full term from which the given chunk is excerpted
+ * @param chunk
+ * The portion of the given termStr to be analyzed
+ * @return The result of analyzing the given chunk
+ * @throws ParseException
+ * when analysis returns other than one output token
+ */
+ protected String analyzeSingleChunk(String field, String termStr, String chunk)
+ throws ParseException {
+ // plagiarized from AnalyzingQueryParser
+ String analyzed = null;
+ TokenStream stream = null;
+ try {
+ stream = getAnalyzer().tokenStream(field, chunk);
+ stream.reset();
+ CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+ // get first and hopefully only output token
+ if (stream.incrementToken()) {
+ analyzed = termAtt.toString();
+
+ // try to increment again, there should only be one output token
+ StringBuilder multipleOutputs = null;
+ while (stream.incrementToken()) {
+ if (null == multipleOutputs) {
+ multipleOutputs = new StringBuilder();
+ multipleOutputs.append('"');
+ multipleOutputs.append(analyzed);
+ multipleOutputs.append('"');
+ }
+ multipleOutputs.append(',');
+ multipleOutputs.append('"');
+ multipleOutputs.append(termAtt.toString());
+ multipleOutputs.append('"');
+ }
+ stream.end();
+ stream.close();
+ if (null != multipleOutputs) {
+ throw new ParseException(String.format(getLocale(),
+ "Analyzer created multiple terms for \"%s\": %s", chunk,
+ multipleOutputs.toString()));
+ }
+ } else {
+ // nothing returned by analyzer. Was it a stop word and the user
+ // accidentally
+ // used an analyzer with stop words?
+ stream.end();
+ stream.close();
+ throw new ParseException(String.format(getLocale(),
+ "Analyzer returned nothing for \"%s\"", chunk));
+ }
+ } catch (IOException e) {
+ throw new ParseException(String.format(getLocale(),
+ "IO error while trying to analyze single term: \"%s\"", termStr));
+ }
+ return analyzed;
+ }
+
  /**
   * Analyzes a term containing wildcards: each non-wildcard chunk is run
   * through {@link #analyzeSingleChunk(String, String, String)} and the
   * wildcard characters themselves are passed through unanalyzed.
   *
   * @param field target field, passed to the analyzer
   * @param termText term text containing wildcard characters
   * @return concatenation of analyzed chunks and raw wildcard characters
   * @throws ParseException if any chunk analyzes to zero or multiple tokens
   */
  private String analyzeWildcard(String field, String termText)
      throws ParseException {
    // plagiarized from AnalyzingQueryParser
    Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termText);
    StringBuilder sb = new StringBuilder();
    int last = 0;

    while (wildcardMatcher.find()) {
      // continue if escaped char
      if (wildcardMatcher.group(1) != null) {
        continue;
      }

      if (wildcardMatcher.start() > 0) {
        // analyze the chunk between the previous wildcard and this one
        String chunk = termText.substring(last, wildcardMatcher.start());
        String analyzed = analyzeSingleChunk(field, termText, chunk);
        sb.append(analyzed);
      }
      // append the wildcard character
      sb.append(wildcardMatcher.group(2));

      last = wildcardMatcher.end();
    }
    // analyze any trailing chunk after the final wildcard
    if (last < termText.length()) {
      sb.append(analyzeSingleChunk(field, termText, termText.substring(last)));
    }
    return sb.toString();
  }
+
+ private SpanQuery getEmptyQuery() {
+ SpanQuery q = new SpanOrQuery(new SpanTermQuery[0]);
+ return q;
+ }
+
+ private boolean isEmptyQuery(SpanQuery q) {
+ if (q instanceof SpanOrQuery && ((SpanOrQuery) q).getClauses().length == 0) {
+ return true;
+ }
+ return false;
+ }
+
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserUtil.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserUtil.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParserUtil.java (revision 0)
@@ -0,0 +1,492 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryparser.span;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfo;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfoBuilder;
+
+/**
+ * Utility class to handle parsing
+ *
+ */
+
+public class SpanQueryParserUtil {
+
+ private final static String nearOpen = "[";
+ private final static String nearClose = "]";
+ private final static String orOpen = "(";
+ private final static String orClose = ")";
+
+ private final Set requiredRegexPres;
+ private final Set requiredRegexPosts;
+ private final static String clauseString = "\\" + nearOpen + "\\" + nearClose
+ + "\\" + orOpen + "\\" + orClose;
+ private final static String numberString = "(?:(\\d+)(?:,(\\d+))?)?";
+ private final static Pattern CLAUSE_PATTERN = Pattern.compile("(?s)(["
+ + clauseString + "])(?:(!?~>?)" + numberString + ")?");
+
+ private final static Pattern ESCAPES_PATTERN = Pattern.compile("(\\\\.)");
+ private final static Pattern REGEX_PATTERN = Pattern.compile("(\\\\/|/)");
+ private final static Pattern WHITE_SPACE_PATTERN = Pattern.compile("\\s+");
+ private final static Pattern WHITE_SPACE_ONLY_PATTERN = Pattern
+ .compile("^\\s*$");
+
+ public SpanQueryParserUtil() {
+ requiredRegexPres = new HashSet();
+ requiredRegexPres.add("(");
+ requiredRegexPres.add("[");
+ requiredRegexPres.add(" ");
+
+ requiredRegexPosts = new HashSet();
+ requiredRegexPosts.add(")");
+ requiredRegexPosts.add("]");
+ requiredRegexPosts.add(" ");
+ }
+
+ /**
+ * rewrite double quotes to [ ]
+ *
+ * @param s
+ * string
+ * @param escapedChars
+ * escaped characters
+ * @return rewritten query
+ */
+ protected String rewriteDoubleQuotes(String s, Set escapedChars) {
+ Matcher dMatcher = Pattern.compile("\"").matcher("");// "
+ int last = 0;
+ dMatcher.reset(s);
+ int i = 0;
+ StringBuilder sb = new StringBuilder();
+ while (dMatcher.find()) {
+ if (escapedChars.contains(dMatcher.start())) {
+ continue;
+ }
+ sb.append(s.substring(last, dMatcher.start()));
+ if (i % 2 == 0) {
+ sb.append("[");
+ } else {
+ sb.append("]");
+ }
+ last = dMatcher.end();
+ i++;
+ }
+ sb.append(s.substring(last));
+ return sb.toString();
+ }
+
+ /**
+ * get a list of all clause markers
+ *
+ * @param s
+ * full query string
+ * @param regexes
+ * list of regex extents
+ * @param escapedChars
+ * set of escaped characters
+ * @return list of start and end clause markers
+ * @throws ParseException
+ */
+ protected List getClauseMarkers(String s,
+ List regexes, Set escapedChars)
+ throws ParseException {
+ Set inRegex = new HashSet();
+ for (OffsetAttribute regex : regexes) {
+ for (int i = regex.startOffset(); i < regex.endOffset(); i++) {
+ inRegex.add(i);
+ }
+ }
+
+ Matcher clauseMatcher = CLAUSE_PATTERN.matcher(s);
+ List markers = new ArrayList();
+ while (clauseMatcher.find()) {
+ if (inRegex.contains(clauseMatcher.start())) {
+
+ continue;
+ }
+ if (escapedChars.contains(clauseMatcher.start())) {
+
+ continue;
+ }
+
+ ClauseInfo marker = buildClauseMarker(clauseMatcher);
+ markers.add(marker);
+ }
+
+ return markers;
+ }
+
  /**
   * Builds a ClauseInfo from a matcher positioned on a successful
   * {@link #CLAUSE_PATTERN} match (find() must have returned true).
   *
   * @param m matcher whose current match is a clause marker
   * @return marker describing the clause boundary and its distance modifiers
   * @throws ParseException if the distance modifiers cannot be parsed
   */
  private ClauseInfo buildClauseMarker(Matcher m) throws ParseException {
    ClauseInfo.TYPE type = null;
    ClauseInfo.START_OR_END startOrEnd = null;

    // basic marker
    String marker = m.group(1);

    // near or not near
    String whichNear = m.group(2);

    // first bit of digits as in the "2" in [foo bar]~2"
    String firstDigits = m.group(3);
    // second bit of digits as in the "3" in [foo bar]!~2,3"
    // should only be non-null for not near query
    String secondDigits = m.group(4);
    int offsetStart = m.start();
    int offsetEnd = m.end();

    if (marker.equals(nearOpen)) {
      type = ClauseInfo.TYPE.NEAR;
      startOrEnd = ClauseInfo.START_OR_END.START;
    } else if (marker.equals(orOpen)) {
      type = ClauseInfo.TYPE.OR;
      startOrEnd = ClauseInfo.START_OR_END.START;
    } else if (marker.equals(orClose)) {
      // NOTE(review): an or-close with trailing modifiers, e.g. ")~2",
      // silently drops the modifiers -- confirm this is intended
      type = ClauseInfo.TYPE.OR;
      startOrEnd = ClauseInfo.START_OR_END.END;
    } else if (marker.equals(nearClose) && whichNear == null) {
      type = ClauseInfo.TYPE.NEAR;
      startOrEnd = ClauseInfo.START_OR_END.END;
    }
    if (type != null && startOrEnd != null) {
      return ClauseInfoBuilder.build(startOrEnd, type, offsetStart, offsetEnd);
    }
    // only a near-close ("]") followed by modifiers reaches this point, so
    // whichNear is non-null from here on
    // spanNotNear
    if (whichNear.startsWith("!")) {
      if (firstDigits == null) {
        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
            ClauseInfo.TYPE.NOT_NEAR, offsetStart, offsetEnd);
      }

      // there is a single slop value: use it for both pre and post
      if (firstDigits != null && secondDigits == null) {
        int slop = 0;
        try {
          slop = Integer.parseInt(firstDigits);
        } catch (NumberFormatException e) {
          throw new ParseException(
              String
                  .format(
                      "There should have been an integer here in span not near query: %s",
                      firstDigits));
        }
        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
            offsetStart, offsetEnd, slop, slop);
      }
      // two values: separate pre and post distances, e.g. ]!~2,3
      if (firstDigits != null && secondDigits != null) {
        int pre = 0;
        try {
          pre = Integer.parseInt(firstDigits);
        } catch (NumberFormatException e) {
          throw new ParseException(
              String
                  .format(
                      "There should have been an integer here in span not near query: %s",
                      firstDigits));
        }
        int post = 0;
        try {
          post = Integer.parseInt(secondDigits);
        } catch (NumberFormatException e) {
          throw new ParseException(
              String
                  .format(
                      "There should have been an integer here in span not near query: %s",
                      secondDigits));
        }

        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
            offsetStart, offsetEnd, pre, post);
      }

    } else {
      boolean inOrder = false;
      // [foo bar]~ matches "foo bar" and "bar foo"
      if (whichNear.equals("~") && firstDigits == null) {
        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
            offsetStart, offsetEnd, 0, inOrder);
      }

      // a trailing '>' (i.e. "~>") requires the terms to match in order
      if (whichNear.endsWith(">")) {
        inOrder = true;
      }
      // [foo bar]~>
      if (firstDigits == null) {
        return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END,
            offsetStart, offsetEnd, 0, inOrder);
      }
      int slop = 0;
      try {
        slop = Integer.parseInt(firstDigits);
      } catch (NumberFormatException e) {
        throw new ParseException(
            "A span-near query should have a slop value of only one number.");
      }
      return ClauseInfoBuilder.build(ClauseInfo.START_OR_END.END, offsetStart,
          offsetEnd, slop, inOrder);
    }
    // unreachable in practice (both branches above return on every path), but
    // required to satisfy the compiler
    throw new ParseException(String.format(
        "Failed to parse: %s and its attributes", marker));

  }
+
+ /**
+ * From a full list of clause markers for the string, find the closing clause
+ * marker for the marker at {start}
+ *
+ * @param clauses
+ * @param start
+ * @return offset in the list of clausemarkers where the match is
+ * @throws ParseException
+ * if the matching marker couldn't be found
+ */
+ protected int findMatching(List clauses, int start)
+ throws ParseException {
+ ClauseInfo startMarker = clauses.get(start);
+ int depth = 0;
+ for (int i = start; i < clauses.size(); i++) {
+ if (ClauseInfo.matchOpenClose(startMarker, clauses.get(i))) {
+ depth++;
+ } else if (startMarker.getType().equals(clauses.get(i).getType())) {
+ depth--;
+ }
+
+ if (depth == 0)
+ return i;
+
+ if (depth > 0)
+ throw new ParseException("too many end markers");
+
+ }
+ throw new ParseException("couldn't find matching clause markers");
+ }
+
+ /**
+ * Extracts terms from a string that may contain regexes but contains no
+ * clausal boundaries. For example, you would pass "foo /bat/ bar" from the
+ * larger query [ pre [ foo /bat/ bar] post]~10 This extracts terms from the
+ * target span of a full string.
+ *
+ * @param s
+ * @param targetSpanStart
+ * @param targetSpanEnd
+ * @param regexes
+ * list of regexes within the full string
+ * @param escapedChars
+ * set of escaped chars within the full string
+ * @return
+ */
+ protected List extractTermStringsBasedOnWhitespace(String s,
+ int targetSpanStart, int targetSpanEnd, List regexes,
+ Set escapedChars) {
+ // This is meant to extract terms from a string that may contain regexes
+ // but contains no clausal boundaries.
+ // This extracts Terms from the target span of a full String.
+ List terms = new ArrayList();
+
+ // end early if start == end
+ if (targetSpanStart >= targetSpanEnd) {
+ return terms;
+ }
+ int tmpStart = targetSpanStart;
+ for (OffsetAttribute regex : regexes) {
+ if (regex.endOffset() < targetSpanStart) {
+ continue;
+ } else if (regex.startOffset() > targetSpanEnd) {
+ break;
+ }
+ if (regex.startOffset() - 1 >= 0) {
+ // extract terms before the regex
+ List tmp = extractTermsBasedOnWhitespace(s, tmpStart,
+ (regex.startOffset() - 1), escapedChars);
+ terms.addAll(tmp);
+ // extract regex
+ terms.add(s.substring(regex.startOffset(), regex.endOffset() + 1));
+ }
+
+ tmpStart = regex.endOffset() + 1;
+ }
+ // extract terms after regex
+ List tmp = extractTermsBasedOnWhitespace(s, tmpStart,
+ targetSpanEnd, escapedChars);
+ terms.addAll(tmp);
+ return terms;
+ }
+
+ /**
+ * This is meant to extract terms from a string that contains no clausal
+ * boundaries and no regexes.
+ *
+ * This extracts Strings from the target span of a full String. This unescapes
+ * the strings.
+ *
+ * You still need to use an analyzer for non-whitespace languages to break up
+ * the returned strings into tokens
+ *
+ * @param s
+ * @param targetSpanStart
+ * @param targetSpanEnd
+ * @param escapedChars
+ * @return
+ */
+
+ protected List extractTermsBasedOnWhitespace(String s,
+ int targetSpanStart, int targetSpanEnd, Set escapedChars) {
+ List termStrings = new ArrayList();
+ // stop early if the start and end are ==
+ if (targetSpanStart >= targetSpanEnd) {
+ return termStrings;
+ }
+ Matcher whiteSpaceSplitter = WHITE_SPACE_PATTERN.matcher(s);
+ Matcher unescaper = ESCAPES_PATTERN.matcher("");
+ Matcher whiteSpaceOnly = WHITE_SPACE_ONLY_PATTERN.matcher("");
+ int start = targetSpanStart;
+ int last = start;
+
+ whiteSpaceSplitter = whiteSpaceSplitter.region(targetSpanStart,
+ targetSpanEnd);
+ while (whiteSpaceSplitter.find()) {
+ start = whiteSpaceSplitter.end();
+ if (escapedChars.contains(whiteSpaceSplitter.start())) {
+ continue;
+ }
+ String tmp = s.substring(last, whiteSpaceSplitter.start());
+ if (!whiteSpaceOnly.reset(tmp).find()) {
+ unescaper.reset(tmp);
+ tmp = unescaper.replaceAll("$1");
+
+ termStrings.add(tmp);
+ }
+ last = whiteSpaceSplitter.end();
+ }
+ String tmp = s.substring(last, targetSpanEnd);
+ if (!whiteSpaceOnly.reset(tmp).find()) {
+ unescaper.reset(tmp);
+ tmp = unescaper.replaceAll("$1");
+ termStrings.add(tmp);
+ }
+ return termStrings;
+ }
+
+ /**
+ * Extracts the regex extents within the string
+ *
+ * @param s
+ * @param escapedChars
+ * @return
+ * @throws ParseException
+ */
+ protected List extractRegexes(String s,
+ Set escapedChars) throws ParseException {
+ List offsets = new ArrayList();
+ Matcher m = REGEX_PATTERN.matcher(s);
+ boolean inRegex = false;
+ int start = -1;
+ while (m.find()) {
+ if (m.group(1).equals("/") && !escapedChars.contains(m.start())) {
+
+ if (inRegex == true && testRegexPost(s, m.start(), escapedChars)) {
+ OffsetAttribute offset = new OffsetAttributeImpl();
+ // really, we mean it, leave in the -1
+ offset.setOffset(start, m.end() - 1);
+ offsets.add(offset);
+ inRegex = false;
+ } else {
+ if (testRegexPre(s, m.start(), escapedChars)) {
+ inRegex = true;
+ start = m.start();
+ }
+ }
+ }
+ }
+ if (inRegex == true) {
+ throw new ParseException("Unmatched / in regex");
+ }
+
+ return offsets;
+ }
+
+ /**
+ * test that the character before the regex looks like a regex boundary
+ *
+ * @param s
+ * @param i
+ * @param escapedChars
+ * @return
+ */
+ private boolean testRegexPre(String s, int i, Set escapedChars) {
+ int pre = i - 1;
+ if (pre < 0) {
+ return true;
+ } else if (escapedChars.contains(pre)) {
+ return false;
+ } else if (requiredRegexPres.contains(s.substring(pre, pre + 1))) {
+ return true;
+ }
+ return false;
+ }
+
  /**
   * Tests that the character after a candidate regex-closing '/' looks like a
   * regex boundary (end of string, a clause closer, or whitespace).
   *
   * @param s full string
   * @param i offset of the '/'
   * @param escapedChars offsets of escaped characters within s
   * @return true if the following character permits a regex to end here
   */
  private boolean testRegexPost(String s, int i, Set escapedChars) {
    int post = i + 1;
    // if term ends string
    // NOTE(review): post >= s.length() - 1 also accepts the case where exactly
    // one character follows the '/', without inspecting that character --
    // confirm the intended boundary is not post >= s.length()
    if (post >= s.length() - 1) {
      return true;
    } else if (escapedChars.contains(post)) {
      return false;
    } else if (post + 1 < s.length()
        // NOTE(review): given the first branch, post + 1 < s.length() is
        // always true here and looks redundant -- verify
        && requiredRegexPosts.contains(s.substring(post, post + 1))) {
      return true;
    }
    return false;
  }
+
+ protected Set getEscapedExtents(String s) {
+ Set ints = new HashSet();
+ Matcher m = ESCAPES_PATTERN.matcher(s);
+
+ while (m.find()) {
+ ints.add(m.start());
+ ints.add(m.start() + 1);
+ }
+ return ints;
+ }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParser.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParser.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/SpanQueryParser.java (revision 0)
@@ -0,0 +1,394 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.span.clauses.ClauseInfo;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+
+/**
+ *
+ * Parses a query into a {@link SpanQuery} which can be used to fetch
+ * {@link Span}s or with IndexSearcher. This parser includes functionality from:
+ *
+ * - {@link org.apache.lucene.queryparser.classic.QueryParser classic
+ * QueryParser}: most of its syntax
+ * - {@link org.apache.lucene.queryparser.surround.parser.QueryParser
+ * SurroundQueryParser}: recursive parsing for "near" and "not" clauses.
+ * - {@link ComplexPhraseQueryParser}: can handle "near" queries that include
+ * multiterms ({@link WildcardQuery}, {@link FuzzyQuery}, {@link RegexpQuery}).
+ * - {@link AnalyzingQueryParser}: has an option to analyze multiterms.
+ *
+ *
+ *
+ *
+ *
+ * Background This parser was developed for the concordance/analytic
+ * search use case -- the user wants to see every time a span occurs (perhaps
+ * with a separate FilterQuery). While the SpanQuery that this parser generates
+ * can be used as a Query for traditional information retrieval via
+ * IndexSearcher, this syntax offers far more power than the classic syntax,
+ * and it may not be needed in the general IR use case.
+ *
+ *
+ *
+ *
+ * With luck, this parser will be made obsolete with Lucene-2878, but until
+ * then, this parser fills a niche.
+ *
+ *
+ * One goal was to keep the syntax as close to Lucene's classic
+ * {@link QueryParser} as possible.
+ *
+ *
+ * Similarities and Differences
+ *
+ *
+ *
+ * Same as classic syntax:
+ *
+ * - term: test
+ * - fuzzy: roam~0.8, roam~2
+ * - wildcard: te?t, test*, t*st
+ * - regex: /[mb]oat/
+ * - phrase: "jakarta apache"
+ * - phrase with slop: "jakarta apache"~3
+ * - "or" clauses: jakarta apache
+ * - grouping clauses: (jakarta apache)
+ *
+ *
+ *
+ * Main additions in SpanQueryParser syntax vs. classic syntax:
+ *
+ * - Can require "in order" for phrases with slop with the ~> operator:
+ * "jakarta apache"~>3
+ * - Can specify "not near" "bieber fever"!~3,10 :: find
+ * "bieber" but not if "fever" appears within 3 words before
+ * or 10 words after it.
+ * - Fully recursive phrasal queries with [ and ]; as in: [[jakarta apache]~3
+ * lucene]~>4 :: find "jakarta" within 3 words of "apache",
+ * and that hit has to be within four words before "lucene".
+ * - Can also use [] for single level phrasal queries instead of ""
+ * as in: [jakarta apache]
+ * - Can use "or" clauses in phrasal queries: "apache (lucene
+ * solr)"~3 :: find "apache" and then either "lucene"
+ * or "solr" within three words.
+ * - Can use multiterms in phrasal queries: "jakarta~1 ap*che"~2
+ * - Did I mention recursion: [[jakarta~1 ap*che]~2 (solr~
+ * /l[ou]+[cs][en]+/)]~10 :: Find something like "jakarta" within two
+ * words of "ap*che" and that hit has to be within ten words of
+ * something like "solr" or that lucene regex.
+ *
+ *
+ *
+ *
+ *
+ * Limitations of SpanQueryParser compared with classic QueryParser:
+ *
+ * - SpanQueryParser can create a query for only one field.
+ * - Boolean queries are not allowed. There is no AND operator; statements
+ * with more than one term are either "or'd" or handled in proximity
+ * queries
+ * - Boosting is not currently supported
+ * - {@link RangeQuery}s are not yet supported.
+ * - This parser is not built with .jj or the antlr parser framework.
+ * Regrettably, because it is generating a {@link SpanQuery}, it can't use all
+ * of the generalizable queryparser infrastructure that was added with Lucene
+ * 4.+.
+ *
+ *
+ *
+ * Stop word handling
+ *
+ *
+ * The user can choose to throw a {@link ParseException} if a stop word is
+ * encountered. If {@link SpanQueryParserBase#throwExceptionForStopWord} is set
+ * to false (default), the following should happen.
+ *
+ *
+ *
+ * - Term: "the" will return an empty {@link BooleanSpanQuery} (similar to
+ * classic queryparser)
+ * - SpanOr: (the apache jakarta) will drop the stop word and return a
+ * {@link SpanOrQuery} for "apache" or "jakarta"
+ *
+ * - SpanNear: "apache and jakarta" will drop the "and" and match on only
+ * "apache jakarta"
+ *
+ *
+ *
+ *
+ * Expert: Other subtle differences between SpanQueryParser and classic
+ * QueryParser.
+ *
+ * - Fuzzy queries with slop > 2 are handled by SlowFuzzyQuery. The developer
+ * can set the fuzzyMaxEdits.
+ * - Regex term queries must currently be preceded or followed by a
+ * parenthesis, a square bracket, white space or the start or end of the string.
+ *
+ * - "jakarta /ap[aeiou]*che/" is allowed
+ * - "jakarta (/ap[aeiou]*che/ /lucene?/)" is allowed
+ * - "jakarta/ap[aeiou]*che/" is not allowed
+ *
+ *
+ * - Fuzzy queries with edit distance >=1 are rounded so that an exception is
+ * not thrown.
+ *
+ *
+ *
+ * NOTE You must add the sandbox jar to your class path to include the
+ * currently deprecated {@link SlowFuzzyQuery}.
+ *
+ *
+ */
+public class SpanQueryParser extends SpanQueryParserBase {
+
+ // a term followed by ~ and an optional integer edit distance with an
+ // optional decimal component, e.g. roam~2 or roam~0.8
+ private static final Pattern FUZZY_PATTERN = Pattern
+ .compile("(?s)^(.+)~(\\d+)?(?:\\.(\\d+))?$");
+ // any single ? or * wildcard character
+ private static final Pattern WILDCARD_PATTERN = Pattern.compile("([?*])");
+ // an entire term delimited by forward slashes, e.g. /[mb]oat/
+ private static final Pattern REGEX_PATTERN = Pattern
+ .compile("(?s)^\\/(.+?)\\/$");
+ // a backslash plus the single character it escapes
+ private static final Pattern ESCAPE_PATTERN = Pattern.compile("\\\\.");
+
+ /**
+ * Initialize with field and analyzer. This parser can only process a single
+ * field. It will use the analyzer for normalizing query terms and for
+ * tokenizing character runs from non-whitespace languages.
+ *
+ * @param field default (and only) field queries are built against
+ * @param analyzer analyzer used to normalize and tokenize query terms
+ */
+ public SpanQueryParser(String field, Analyzer analyzer) {
+ init(field, analyzer);
+ }
+
+ /**
+ * returns {@link SpanQuery} or null if an empty string or no parseable
+ * content is passed in.
+ *
+ * @param s raw query string
+ * @return the parsed query, or null for empty/unparseable input
+ * @throws ParseException if the query is syntactically invalid
+ */
+ public SpanQuery parse(String s) throws ParseException {
+ SpanQueryParserUtil parserUtil = new SpanQueryParserUtil();
+ // treat every query as if it were a big spanOr
+ // there is an unsettling, yet small inefficiency to this; fix if solution
+ // is obvious
+
+ StringBuilder sb = new StringBuilder();
+ sb.append("(").append(s).append(")");
+ s = sb.toString();
+ // offsets covered by backslash escapes; computed once and threaded
+ // through every later parsing step
+ Set<Integer> escapedChars = parserUtil.getEscapedExtents(s);
+ // rewrite "..." phrases -- presumably into the bracketed clause syntax;
+ // confirm in SpanQueryParserUtil
+ s = parserUtil.rewriteDoubleQuotes(s, escapedChars);
+ // pull out /regex/ spans up front so their contents are not re-parsed
+ List<OffsetAttribute> regexes = parserUtil.extractRegexes(s, escapedChars);
+ List<ClauseInfo> clauses = parserUtil.getClauseMarkers(s, regexes,
+ escapedChars);
+
+ return parse(parserUtil, getField(), s, 0, clauses, regexes, escapedChars);
+ }
+
+ /**
+ * Recursively parses the clause that opens at {@code startMarkerIndex}:
+ * nested sub-clauses become child queries, and the text between markers
+ * becomes term-level queries.
+ *
+ * @param util shared parsing utility
+ * @param field field to build queries against
+ * @param s full (rewritten) query string
+ * @param startMarkerIndex index into clauseMarkers of this clause's opener
+ * @param clauseMarkers all clause markers found in s, in order
+ * @param regexes offsets of extracted /regex/ terms
+ * @param escapedChars offsets covered by backslash escapes
+ * @return the query for this clause, or null for empty input
+ * @throws ParseException if the clause cannot be parsed
+ */
+ private SpanQuery parse(SpanQueryParserUtil util, String field, String s,
+ int startMarkerIndex, List<ClauseInfo> clauseMarkers,
+ List<OffsetAttribute> regexes, Set<Integer> escapedChars)
+ throws ParseException {
+
+ if (s == null || s.length() == 0)
+ return null;
+
+ ClauseInfo startMarker = clauseMarkers.get(startMarkerIndex);
+ // the marker that closes the clause opened at startMarkerIndex
+ int endMarkerIndex = util.findMatching(clauseMarkers, startMarkerIndex);
+ ClauseInfo endMarker = clauseMarkers.get(endMarkerIndex);
+
+ List<SpanQuery> queryClauses = new ArrayList<SpanQuery>();
+
+ int childStartInd = startMarkerIndex + 1;
+ int childEndInd = -1;
+ // character offset where not-yet-consumed clause text begins
+ int lastStartChar = startMarker.getEnd();
+
+ // walk each immediate child clause between this clause's two markers
+ while (childStartInd < endMarkerIndex) {
+
+ childEndInd = util.findMatching(clauseMarkers, childStartInd);
+
+ // handle the stuff before the clauseMarkers
+ int tmpStart = lastStartChar;
+ int tmpEnd = clauseMarkers.get(childStartInd).getStart();
+ List<SpanQuery> preTermQueries = parseBasicTerms(util, field, s,
+ tmpStart, tmpEnd, regexes, escapedChars);
+ for (SpanQuery q : preTermQueries) {
+ queryClauses = addQuery(q, queryClauses);
+ }
+ // recurse into the child clause itself
+ SpanQuery tmpQ = parse(util, field, s, childStartInd, clauseMarkers,
+ regexes, escapedChars);
+ queryClauses = addQuery(tmpQ, queryClauses);
+ lastStartChar = clauseMarkers.get(childEndInd).getEnd();
+ // skip past the whole child, including any clauses nested inside it
+ childStartInd = childEndInd + 1;
+
+ }
+
+ // terms after the last child clause (or the whole clause if no children)
+ int endInd = (childEndInd > -1) ? childEndInd : startMarkerIndex;
+ int contentOffsetStart = clauseMarkers.get(endInd).getEnd();
+ int contentOffsetEnd = endMarker.getStart();
+ List<SpanQuery> postTermQueries = parseBasicTerms(util, field, s,
+ contentOffsetStart, contentOffsetEnd, regexes, escapedChars);
+ for (SpanQuery q : postTermQueries) {
+ queryClauses = addQuery(q, queryClauses);
+ }
+
+ // endMarker presumably carries the clause type and slop (near, not-near,
+ // or) -- see buildQuery in SpanQueryParserBase
+ return buildQuery(queryClauses, endMarker);
+ }
+
+ /**
+ * Extracts whitespace-delimited term strings from the region of {@code s}
+ * between {@code start} and {@code end} and converts each into a
+ * {@link SpanQuery}. (Exact boundary semantics are determined by
+ * {@code extractTermStringsBasedOnWhitespace}.)
+ *
+ * @param util shared parsing utility
+ * @param field field to build queries against
+ * @param s full (rewritten) query string
+ * @param start start offset of the region to scan
+ * @param end end offset of the region to scan
+ * @param regexes offsets of extracted /regex/ terms
+ * @param escapedChars offsets covered by backslash escapes
+ * @return queries for the terms found in the region; may be empty
+ * @throws ParseException if a term cannot be parsed
+ */
+ private List<SpanQuery> parseBasicTerms(SpanQueryParserUtil util,
+ String field, String s, int start, int end,
+ List<OffsetAttribute> regexes, Set<Integer> escapedChars)
+ throws ParseException {
+
+ List<String> termStrings = util.extractTermStringsBasedOnWhitespace(s,
+ start, end, regexes, escapedChars);
+
+ return convertTermStringsToSpanQueries(field, termStrings);
+ }
+
+ /**
+ * Adds {@code q} to {@code list} unless it is null; returns the same list.
+ */
+ private List<SpanQuery> addQuery(SpanQuery q, List<SpanQuery> list) {
+ if (null != q)
+ list.add(q);
+ return list;
+ }
+
+ /**
+ * Simply convert termStrs to SpanQueries.
+ *
+ * <p>
+ * NOTE(review): the {@code field} parameter is currently unused;
+ * {@link #buildAnyTermQuery(String)} reads {@code getField()} directly.
+ *
+ * @param field field to build queries against (currently unused)
+ * @param strings raw term strings
+ * @return one query per parseable term; stop words are skipped
+ * @throws ParseException if a term is malformed
+ */
+ private List<SpanQuery> convertTermStringsToSpanQueries(String field,
+ List<String> strings) throws ParseException {
+ List<SpanQuery> terms = new ArrayList<SpanQuery>();
+ for (String s : strings) {
+ // null means the term was a stop word; addQuery drops nulls anyway,
+ // so the extra check merely documents the intent
+ SpanQuery tmpT = buildAnyTermQuery(s);
+ if (tmpT != null) {
+ terms = addQuery(tmpT, terms);
+ }
+ }
+ return terms;
+ }
+
+ /**
+ * This identifies and then builds the various span term and/or multiterm
+ * queries. Protected for testing purposes.
+ *
+ * <p>
+ * For {@link FuzzyQuery}, this defaults to {@link FuzzyQuery#defaultMaxEdits}
+ * if no value is specified after the ~.
+ *
+ * @param termText single term, possibly carrying regex/fuzzy/wildcard markup
+ * @return SpanQuery or null if termText is a stop word
+ * @throws ParseException if termText mixes wildcard and fuzzy markup
+ */
+ protected SpanQuery buildAnyTermQuery(String termText) throws ParseException {
+ // TODO: add range query
+ // is this a regex term?
+ Matcher m = REGEX_PATTERN.matcher(termText);
+ if (m.find()) {
+ return buildRegexTermQuery(getField(), m.group(1));
+ }
+
+ // offsets of characters that are backslash-escaped within this term
+ Set<Integer> escapes = new HashSet<Integer>();
+ m = ESCAPE_PATTERN.matcher(termText);
+ while (m.find()) {
+ escapes.add(m.end() - 1);
+ }
+ SpanQuery q = null;
+
+ // is this a fuzzy term?
+ m = FUZZY_PATTERN.matcher(termText);
+ if (m.find()) {
+ String term = m.group(1);
+ // m.end(1) is the offset of the ~; skip if that ~ is escaped
+ if (!escapes.contains(m.end(1))) {
+
+ String slopString = m.group(2);
+ String decimalComponent = m.group(3);
+ float slop = (float) FuzzyQuery.defaultMaxEdits;
+ if (slopString != null) {
+ if (decimalComponent == null || decimalComponent.length() == 0) {
+ decimalComponent = "0";
+ }
+ try {
+ slop = Float.parseFloat(slopString + "." + decimalComponent);
+ } catch (NumberFormatException e) {
+ // shouldn't ever happen: both parts matched \d+ in
+ // FUZZY_PATTERN. If it does, fall back to the default slop.
+ }
+
+ }
+ // if the user enters 2.4 for example, round it so that FuzzyQuery
+ // won't throw an IllegalArgumentException
+ if (slop >= 1.0f) {
+ slop = (float) Math.round(slop);
+ }
+ q = buildFuzzyTermQuery(getField(), term, slop);
+ }
+ }
+
+ // is this a wildcard term? collect offsets of unescaped ? and *
+ m = WILDCARD_PATTERN.matcher(termText);
+ Set<Integer> ws = new HashSet<Integer>();
+ while (m.find()) {
+ if (!escapes.contains(m.start())) {
+ ws.add(m.start());
+ }
+ }
+ if (ws.size() > 0) {
+ if (q != null) {
+ throw new ParseException(
+ "Can't have a single term in a query that is both a wildcard and a fuzzy query");
+ }
+
+ if (ws.size() == 1 // there's only one unescaped wildcard character
+ && ws.contains(termText.length() - 1) // it is the last character
+ && termText.indexOf("*") == termText.length() - 1 // it is * not ?
+ ) {
+ // snip final *
+ q = buildPrefixQuery(getField(),
+ termText.substring(0, termText.length() - 1));
+ } else {
+ q = buildWildcardQuery(getField(), termText);
+ }
+ }
+
+ // if you've found anything, return it
+ if (q != null) {
+ return q;
+ }
+ // treat as basic single term query
+ return buildSingleTermQuery(getField(), termText);
+ }
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/NormMultiTerm.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/NormMultiTerm.java (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/NormMultiTerm.java (revision 0)
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queryparser.span;
+
+/**
+ *
+ * options for handling a multiterm: wildcard, prefix, fuzzy (not regexp!)
+ *
+ */
+public enum NormMultiTerm {
+ // ANALYZE: run the multiterm text through the analyzer; LOWERCASE: only
+ // lowercase it; NO_NORM: use the text exactly as typed. Names are
+ // suggestive -- confirm exact semantics at the use site in the parser base.
+ ANALYZE, LOWERCASE, NO_NORM
+}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/span/package.html
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/span/package.html (revision 0)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/span/package.html (revision 0)
@@ -0,0 +1,24 @@
+
+
+
+
+SpanQueryParser generates a SpanQuery, which can be used to grab spans or in
+traditional document retrieval with IndexSearcher.
+
+
+