Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (revision 1576127) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (working copy) @@ -368,11 +368,17 @@ public void testStopWords() throws Exception { // Stop word handling has some room for improvement with SpanQuery - // These tests codify the expectations (for regular behavior, - // parse exceptions and false hits) as of this writing. SpanOnlyParser p = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, stopAnalyzer); + // this matches both "over the lazy" and "over green lazy" + countSpansDocs(p, "[over the lazy]", 2, 2); + + //this matches "over" or "lazy" + countSpansDocs(p, "(over the lazy)", 4, 2); + countSpansDocs(p, "(over the)", 2, 2); + countSpansDocs(p, "(the and and the)", 0, 0); + countSpansDocs(p, "the", 0, 0); // these are whittled down to just a query for brown @@ -380,37 +386,27 @@ countSpansDocs(p, "(the brown)", 3, 2); - testException(p, "[brown the]!~5,5"); + countSpansDocs(p, "[brown the]!~5,5", 3, 2); + + + //this tests that slop is really converted to 2 because of stop word + countSpansDocs(p, "[over the brown]~1", 1, 1); - // this will not match because "the" is silently dropped from the query - countSpansDocs(p, "[over the lazy]", 0, 0); + //this tests that slop is really converted to 2, not 3 because of stop word + countSpansDocs(p, "[over the dog]~1", 0, 0); - // this will get one right hit, but incorrectly match "over green lazy" - countSpansDocs(p, "[over the lazy]~1", 2, 2); + //this tests that slop is really converted to 3 because of stop words + countSpansDocs(p, "[over the the dog]~1", 1, 1); - // test throw exception - p.setThrowExceptionForEmptyTerm(true); - p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE); + //this tests that 
slop is not augmented for stops before first non-stop + //and after last non-stop + countSpansDocs(p, "[the the the the brown (dog cat) the the the the]", 1, 1); - String[] stopExs = new String[]{ - "the", - "[the brown]", - "the brown", - "(the brown)", - "\"the brown\"", - "\"the\"", - "[the brown]!~2,2", - "[brown the]!~2,2", - "the*ter", - "the?ter" - }; + //ditto + countSpansDocs(p, "[the the the the jumped the cat the the the the]~1", 0, 0); + countSpansDocs(p, "[the the the the over the brown the the the the]~1", 1, 1); - for (String ex : stopExs) { - testException(p, ex); - } - // add tests for surprise phrasal with stopword!!! chinese - SpanOnlyParser noStopsParser = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, noStopAnalyzer); noStopsParser.setAutoGeneratePhraseQueries(true); // won't match because stop word was dropped in index @@ -421,6 +417,7 @@ testOffsetForSingleSpanMatch(noStopsParser, "[\u666E \u6797 \u65AF \u987F \u5B66]~2", 6, 0, 6); + } public void testNonWhiteSpaceLanguage() throws Exception { @@ -483,19 +480,9 @@ SpanOnlyParser stopsParser = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, stopAnalyzer); stopsParser.setAutoGeneratePhraseQueries(true); countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); - - // now test for throwing of exception - stopsParser.setThrowExceptionForEmptyTerm(true); - boolean exc = false; - try { - countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0); - } catch (ParseException e) { - exc = true; - } - assertEquals(true, exc); } - public void testQuotedSingleTerm() throws Exception{ + public void testQuotedSingleTerm() throws Exception { SpanOnlyParser p = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, noStopAnalyzer); String[] quoteds = new String[]{ Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java =================================================================== --- 
lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (revision 1576127) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (working copy) @@ -22,8 +22,9 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; @@ -57,19 +58,16 @@ @BeforeClass public static void beforeClass() throws Exception { - analyzer = new Analyzer() { - @Override - public TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, - false); - return new TokenStreamComponents(tokenizer, tokenizer); - } - }; + + analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true); + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer) .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) .setMergePolicy(newLogMergePolicy())); + String[] f1Docs = new String[] { "quick brown AND fox",//0 "quick brown AND dog", //1 @@ -312,4 +310,61 @@ q = parser.parse("abc\\\\*d"); assertTrue(q.toString(), q instanceof SpanMultiTermQueryWrapper); } + + public void testStops() throws Exception { + Analyzer stopsAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, + MockTokenFilter.ENGLISH_STOPSET); + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, stopsAnalyzer) + .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) + .setMergePolicy(newLogMergePolicy())); + String[] docs = new String[] { + "ab the the cd the the 
the ef the gh", + "ab cd", + "ab the ef" + }; + + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(newTextField(FIELD1, docs[i], Field.Store.YES)); + w.addDocument(doc); + } + IndexReader r = w.getReader(); + IndexSearcher s = newSearcher(r); + w.close(); + SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, stopsAnalyzer); + assertHits( "-ab +the +cd", p, s, 0); + assertHits( "+ab +the +cd", p, s, 2); + assertHits( "+the", p, s, 0); + assertHits( "ab AND CD", p, s, 2); + assertHits( "ab AND the", p, s, 3); + assertHits( "ab OR the", p, s, 3); + assertHits( "(ab the cd)~2", p, s, 2); + assertHits( "(ab the cd)~3", p, s, 0); + assertHits( "ab AND (the OR cd)", p, s, 2); + assertHits( "ab AND (the AND cd)", p, s, 2); + assertHits( "cd OR (the OR ef)", p, s, 3); + assertHits( "cd AND (the AND ef)", p, s, 1); + //do we want this behavior? + assertHits( "-the", p, s, 0); + + assertHits ("\"ab cd\"", p, s, 1); + assertHits ("\"ab a a cd\"", p, s, 2); + assertHits ("\"ab a cd\"~1", p, s, 2); + assertHits ("\"ab a cd\"~>1", p, s, 2); + assertHits ("\"cd a a ab\"", p, s, 0); + assertHits ("\"cd a ab\"~1", p, s, 2); + + r.close(); + dir.close(); + } + + private void assertHits(String qString, SpanQueryParser p, IndexSearcher s, int expected) throws Exception { + Query q = p.parse(qString); + TopScoreDocCollector results = TopScoreDocCollector.create(1000, true); + s.search(q, results); + ScoreDoc[] scoreDocs = results.topDocs().scoreDocs; + assertEquals(qString, expected, scoreDocs.length); + } } Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (revision 1576127) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (working copy) @@ -96,9 +96,6 @@ private int spanNearMaxDistance = 
100; private int spanNotNearMaxDistance = 50; - //if a full term is analyzed and the analyzer returns nothing, - //should a ParseException be thrown or should I just ignore the full token. - private boolean throwExceptionForEmptyTerm = false; private boolean lowercaseRegex = false; //////// @@ -654,9 +651,6 @@ BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef(); if (numTokens == 0) { - if (throwExceptionForEmptyTerm) { - throw new ParseException("Couldn't find any content term in: "+ termText); - } return null; } else if (numTokens == 1) { try { @@ -688,9 +682,7 @@ for (SpanQuery piece : queries) { if (piece != null) { nonEmpties.add(piece); - } else if (piece == null && throwExceptionForEmptyTerm) { - throw new ParseException("Stop word found in " + termText); - } + } } if (nonEmpties.size() == 0) { @@ -842,7 +834,34 @@ if (clauses == null || clauses.size() == 0) return getEmptySpanQuery(); - List nonEmpties = removeEmpties(clauses); + List nonEmpties = new LinkedList(); + //find first non-null and last non-null entry + int start = 0; + int end = clauses.size(); + for (int i = 0; i < clauses.size(); i++) { + if (! isEmptyQuery(clauses.get(i))) { + start = i; + break; + } + } + for (int i = clauses.size()-1; i >= 0; i--) { + if (! 
isEmptyQuery(clauses.get(i))) { + end = i+1; + break; + } + } + + //now count the stop words that occur + //between the first and last non-null + int numIntermedStops = 0; + for (int i = start; i < end; i++) { + SpanQuery clause = clauses.get(i); + if (!isEmptyQuery(clause)){ + nonEmpties.add(clause); + } else { + numIntermedStops++; + } + } if (nonEmpties.size() == 0) { return getEmptySpanQuery(); @@ -858,7 +877,13 @@ if (slop == UNSPECIFIED_SLOP) { slop = getPhraseSlop(); - } else if (spanNearMaxDistance > -1 && slop > spanNearMaxDistance) { + } + + //adjust slop to handle intermediate stops that + //were removed + slop += numIntermedStops; + + if (spanNearMaxDistance > -1 && slop > spanNearMaxDistance) { slop = spanNearMaxDistance; } @@ -933,9 +958,7 @@ for (SpanQuery q : queries) { if (!isEmptyQuery(q)) { nonEmpties.add(q); - } else if (throwExceptionForEmptyTerm) { - throw new ParseException("Stop word or unparseable term found"); - } + } } return nonEmpties; } @@ -1015,31 +1038,6 @@ this.spanNotNearMaxDistance = spanNotNearMaxDistance; } - /** - * If the a term passes through the analyzer and nothing comes out, - * throw an exception or silently ignore the missing term. This can - * happen with stop words or with other strings that the analyzer - * ignores. - * - *

- * This is applied only at the full term level. - *

- * Currently, a parseException is thrown no matter the setting on this - * whenever an analyzer can't return a value for a multiterm query. - * - * @return throw exception if analyzer yields empty term - */ - public boolean getThrowExceptionForEmptyTerm() { - return throwExceptionForEmptyTerm; - } - - /** - * @see #getThrowExceptionForEmptyTerm() - */ - public void setThrowExceptionForEmptyTerm(boolean throwExceptionForEmptyTerm) { - this.throwExceptionForEmptyTerm = throwExceptionForEmptyTerm; - } - protected static boolean isCharEscaped(String s, int i) { int j = i; int esc = 0; Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (revision 1576127) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (working copy) @@ -33,7 +33,7 @@ import org.apache.lucene.util.Version; /** - *

This parser leverages the power of SpanQuery and can combine them with + * This parser leverages the power of SpanQuery and can combine them with * traditional boolean logic and multiple field information. * This parser includes functionality from: *

    @@ -45,14 +45,11 @@ *
  • {@link org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser}: has an option to analyze multiterms.
  • *
* - *

- * - *

+ *

* Background * This parser is designed to expose as much of the sophistication as is available within the Query/SpanQuery components. * The basic approach of this parser is to build BooleanQueries comprised of SpanQueries. The parser recursively works * through boolean/fielded chunks and then recursively works through SpanQueries. - *

* *

* Goals for this parser: @@ -63,8 +60,7 @@ *

  • Make analysis of multiterms a fundamental part of the parser * {@link AnalyzingQueryParserBase}.
  • * - *

    - *

    Similarities and Differences

    + *

    Similarities and Differences * *

    Same as classic syntax: *

      @@ -81,7 +77,6 @@ *
    • required/not required operators: +lucene +apache -jakarta
    • *
    • boolean with field:(author:hatcher AND author:gospodnetic) AND title:lucene
    • *
    - *

    *

    Main additions in SpanQueryParser syntax vs. classic: *

      *
    • Can require "in order" for phrases with slop with the ~> operator: "jakarta apache"~>3
    • @@ -104,7 +99,6 @@ *
    • Can require at least x number of hits at boolean level: "apache AND (lucene solr tika)~2
    • *
    • Can have a negative query: -jakarta will return all documents that do not contain jakarta
    • *
    - *

    *

    * Trivial additions: *

      @@ -116,7 +110,6 @@ *

      Analysis * You can specify different analyzers * to handle whole term versus multiterm components. - *

      * *

      * Using quotes for a single term @@ -125,37 +118,42 @@ * Remember to use quotes or use escapes for anything with backslashes or hyphens: * 12/02/04 (is broken into a term "12", a regex "/02/" and a term "04") * '12/02/04' is treated a a single token. - *

      + * + * *

      Stop word handling - *

      - *

      The user can choose to throw a {@link org.apache.lucene.queryparser.classic.ParseException} if a stop word is encountered. - * If SpanQueryParserBase.throwExceptionForEmptyTerm is set to false (default), the following should happen. - *

      + *

      The parser tries to replicate the behavior of the Classic QueryParser. Stop words + * are generally ignored. + *

      However, in a "near" query, extra slop is added for each stop word that + * occurs after the first non-stop word and before the last non-stop word (that is, leading and trailing stop words + * do not add to the slop). + * For example, "walked the dog" is converted to "walked dog"~>1 behind the scenes. Like the Classic + * QueryParser, this will lead to false positives for any word between "walked" and "dog". Unlike the + * Classic QueryParser, this will also lead to false positives for "walked dog". *

      + * Examples + *

      *

        *
      • Term: "the" will return an empty SpanQuery (similar to classic queryparser)
      • *
      • BooleanOr: (the apache jakarta) will drop the stop word and return a * {@link org.apache.lucene.search.spans.SpanOrQuery} for "apache" * or "jakarta" - *
      • SpanNear: "apache and jakarta" will drop the "and" and match on only "apache jakarta"
      • - *

      + *
    •   SpanNear: "apache and jakarta" will drop the "and", add one to the slop and match on + * any occurrence of "apache" followed by "jakarta" with zero or one words intervening.
    • + *
    + * *

    A parse exception is currently always thrown if the parser analyzes a multiterm, and a subcomponent of the * multiterm has a stopword: the*tre - *

    *

    Expert: Other subtle differences between SpanQueryParser and classic QueryParser. *

      *
    • Fuzzy queries with slop > 2 are handled by SlowFuzzyQuery. The developer can set the minFuzzySim to limit * the maximum edit distance (i.e. turn off SlowFuzzyQuery by setting fuzzyMinSim = 2.0f.
    • *
    • Fuzzy queries with edit distance >=1 are rounded so that an exception is not thrown.
    • *
    - *

    *

    Truly Expert: there are a few other very subtle differences that are documented in comments * in the sourcecode in the header of SpanQueryParser. - *

    *

    * NOTE You must add the sandbox jar to your class path to include * the currently deprecated {@link org.apache.lucene.sandbox.queries.SlowFuzzyQuery}. - *

    *

    Limitations of SpanQueryParser compared with classic QueryParser: *

      *
    1. There is some learning curve to figure out the subtle differences in syntax between @@ -170,8 +168,8 @@ * Regrettably, because it is generating a {@link org.apache.lucene.search.spans.SpanQuery}, * it can't use all of the generalizable queryparser infrastructure that was added with Lucene 4.+.
    2. *
    - *

    */ + public class SpanQueryParser extends AbstractSpanQueryParser { /* Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java =================================================================== --- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java (revision 1576127) +++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/AbstractSpanQueryParser.java (working copy) @@ -78,13 +78,8 @@ } else { throw new ParseException("Can't process field, boolean operators or a match all docs query in a pure span."); } - if (q != null) { - queries.add(q); - } + queries.add(q); } - if (queries == null || queries.size() == 0) { - return getEmptySpanQuery(); - } return buildSpanQueryClause(queries, parentClause); } @@ -126,6 +121,13 @@ private SpanQuery buildSpanQueryClause(List queries, SQPClause clause) throws ParseException { + //queries can be null + //queries can contain null elements + + if (queries == null) { + return getEmptySpanQuery(); + } + SpanQuery q = null; if (clause instanceof SQPOrClause) { q = buildSpanOrQuery(queries);