Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (revision 1576127)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestSpanOnlyQueryParser.java (working copy)
@@ -368,11 +368,17 @@
public void testStopWords() throws Exception {
// Stop word handling has some room for improvement with SpanQuery
- // These tests codify the expectations (for regular behavior,
- // parse exceptions and false hits) as of this writing.
SpanOnlyParser p = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, stopAnalyzer);
+ // this matches both "over the lazy" and "over green lazy"
+ countSpansDocs(p, "[over the lazy]", 2, 2);
+
+ //this matches "over" or "lazy"
+ countSpansDocs(p, "(over the lazy)", 4, 2);
+ countSpansDocs(p, "(over the)", 2, 2);
+ countSpansDocs(p, "(the and and the)", 0, 0);
+
countSpansDocs(p, "the", 0, 0);
// these are whittled down to just a query for brown
@@ -380,37 +386,27 @@
countSpansDocs(p, "(the brown)", 3, 2);
- testException(p, "[brown the]!~5,5");
+ countSpansDocs(p, "[brown the]!~5,5", 3, 2);
+
+
+ //this tests that slop is really converted to 2 because of stop word
+ countSpansDocs(p, "[over the brown]~1", 1, 1);
- // this will not match because "the" is silently dropped from the query
- countSpansDocs(p, "[over the lazy]", 0, 0);
+ //this tests that slop is really converted to 2, not 3 because of stop word
+ countSpansDocs(p, "[over the dog]~1", 0, 0);
- // this will get one right hit, but incorrectly match "over green lazy"
- countSpansDocs(p, "[over the lazy]~1", 2, 2);
+ //this tests that slop is really converted to 3 because of stop words
+ countSpansDocs(p, "[over the the dog]~1", 1, 1);
- // test throw exception
- p.setThrowExceptionForEmptyTerm(true);
- p.setNormMultiTerms(NORM_MULTI_TERMS.ANALYZE);
+ //this tests that slop is not augmented for stops before first non-stop
+ //and after last non-stop
+ countSpansDocs(p, "[the the the the brown (dog cat) the the the the]", 1, 1);
- String[] stopExs = new String[]{
- "the",
- "[the brown]",
- "the brown",
- "(the brown)",
- "\"the brown\"",
- "\"the\"",
- "[the brown]!~2,2",
- "[brown the]!~2,2",
- "the*ter",
- "the?ter"
- };
+ //ditto
+ countSpansDocs(p, "[the the the the jumped the cat the the the the]~1", 0, 0);
+ countSpansDocs(p, "[the the the the over the brown the the the the]~1", 1, 1);
- for (String ex : stopExs) {
- testException(p, ex);
- }
-
// add tests for surprise phrasal with stopword!!! chinese
-
SpanOnlyParser noStopsParser = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, noStopAnalyzer);
noStopsParser.setAutoGeneratePhraseQueries(true);
// won't match because stop word was dropped in index
@@ -421,6 +417,7 @@
testOffsetForSingleSpanMatch(noStopsParser,
"[\u666E \u6797 \u65AF \u987F \u5B66]~2", 6, 0, 6);
+
}
public void testNonWhiteSpaceLanguage() throws Exception {
@@ -483,19 +480,9 @@
SpanOnlyParser stopsParser = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, stopAnalyzer);
stopsParser.setAutoGeneratePhraseQueries(true);
countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0);
-
- // now test for throwing of exception
- stopsParser.setThrowExceptionForEmptyTerm(true);
- boolean exc = false;
- try {
- countSpansDocs(stopsParser, "\u666E\u6797\u65AF\u987F\u5927\u5B66", 0, 0);
- } catch (ParseException e) {
- exc = true;
- }
- assertEquals(true, exc);
}
- public void testQuotedSingleTerm() throws Exception{
+ public void testQuotedSingleTerm() throws Exception {
SpanOnlyParser p = new SpanOnlyParser(TEST_VERSION_CURRENT, FIELD, noStopAnalyzer);
String[] quoteds = new String[]{
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (revision 1576127)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/spans/TestOverallSpanQueryParser.java (working copy)
@@ -22,8 +22,9 @@
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@@ -57,19 +58,16 @@
@BeforeClass
public static void beforeClass() throws Exception {
- analyzer = new Analyzer() {
- @Override
- public TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE,
- false);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
- };
+
+ analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
+
directory = newDirectory();
+
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)
.setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
.setMergePolicy(newLogMergePolicy()));
+
String[] f1Docs = new String[] {
"quick brown AND fox",//0
"quick brown AND dog", //1
@@ -312,4 +310,61 @@
q = parser.parse("abc\\\\*d");
assertTrue(q.toString(), q instanceof SpanMultiTermQueryWrapper);
}
+
+ public void testStops() throws Exception { // indexes three small docs with a stop-word analyzer, then asserts hit counts for queries that contain stop words
+ Analyzer stopsAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
+ MockTokenFilter.ENGLISH_STOPSET); // the same analyzer is used for indexing and for query parsing below
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, stopsAnalyzer)
+ .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+ String[] docs = new String[] {
+ "ab the the cd the the the ef the gh",
+ "ab cd",
+ "ab the ef"
+ };
+
+ for (int i = 0; i < docs.length; i++) {
+ Document doc = new Document();
+ doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
+ w.addDocument(doc);
+ }
+ IndexReader r = w.getReader();
+ IndexSearcher s = newSearcher(r);
+ w.close();
+ SpanQueryParser p = new SpanQueryParser(TEST_VERSION_CURRENT, FIELD1, stopsAnalyzer);
+ assertHits( "-ab +the +cd", p, s, 0); // presumably "the" is dropped, leaving -ab +cd; every doc containing cd also contains ab
+ assertHits( "+ab +the +cd", p, s, 2);
+ assertHits( "+the", p, s, 0); // a query consisting only of a stop word is expected to match nothing
+ assertHits( "ab AND CD", p, s, 2); // NOTE(review): upper-case CD is expected to match indexed "cd" - presumably the analyzer lowercases; confirm
+ assertHits( "ab AND the", p, s, 3); // expected count equals the number of docs containing ab (all three)
+ assertHits( "ab OR the", p, s, 3);
+ assertHits( "(ab the cd)~2", p, s, 2);
+ assertHits( "(ab the cd)~3", p, s, 0);
+ assertHits( "ab AND (the OR cd)", p, s, 2);
+ assertHits( "ab AND (the AND cd)", p, s, 2);
+ assertHits( "cd OR (the OR ef)", p, s, 3);
+ assertHits( "cd AND (the AND ef)", p, s, 1);
+ // open question: a query made up solely of a negated stop word currently matches nothing - is that the desired behavior?
+ assertHits( "-the", p, s, 0);
+
+ assertHits ("\"ab cd\"", p, s, 1);
+ assertHits ("\"ab a a cd\"", p, s, 2); // NOTE(review): "a" is presumably in ENGLISH_STOPSET, so these phrase cases exercise stop words inside a phrase; confirm
+ assertHits ("\"ab a cd\"~1", p, s, 2);
+ assertHits ("\"ab a cd\"~>1", p, s, 2);
+ assertHits ("\"cd a a ab\"", p, s, 0);
+ assertHits ("\"cd a ab\"~1", p, s, 2);
+
+ r.close();
+ dir.close();
+ }
+
+ private void assertHits(String qString, SpanQueryParser p, IndexSearcher s, int expected) throws Exception { // parses qString with p, runs it against s, and asserts the number of returned hits
+ Query q = p.parse(qString);
+ TopScoreDocCollector results = TopScoreDocCollector.create(1000, true); // NOTE(review): presumably caps collected hits at 1000, far above the size of the test indexes; confirm
+ s.search(q, results);
+ ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
+ assertEquals(qString, expected, scoreDocs.length); // qString is passed as the assertion message so a failure names the query
+ }
}
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (revision 1576127)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParserBase.java (working copy)
@@ -96,9 +96,6 @@
private int spanNearMaxDistance = 100;
private int spanNotNearMaxDistance = 50;
- //if a full term is analyzed and the analyzer returns nothing,
- //should a ParseException be thrown or should I just ignore the full token.
- private boolean throwExceptionForEmptyTerm = false;
private boolean lowercaseRegex = false;
////////
@@ -654,9 +651,6 @@
BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
if (numTokens == 0) {
- if (throwExceptionForEmptyTerm) {
- throw new ParseException("Couldn't find any content term in: "+ termText);
- }
return null;
} else if (numTokens == 1) {
try {
@@ -688,9 +682,7 @@
for (SpanQuery piece : queries) {
if (piece != null) {
nonEmpties.add(piece);
- } else if (piece == null && throwExceptionForEmptyTerm) {
- throw new ParseException("Stop word found in " + termText);
- }
+ }
}
if (nonEmpties.size() == 0) {
@@ -842,7 +834,34 @@
if (clauses == null || clauses.size() == 0)
return getEmptySpanQuery();
- List
- * This is applied only at the full term level.
- *
- * Currently, a parseException is thrown no matter the setting on this
- * whenever an analyzer can't return a value for a multiterm query.
- *
- * @return throw exception if analyzer yields empty term
- */
- public boolean getThrowExceptionForEmptyTerm() {
- return throwExceptionForEmptyTerm;
- }
-
- /**
- * @see #getThrowExceptionForEmptyTerm()
- */
- public void setThrowExceptionForEmptyTerm(boolean throwExceptionForEmptyTerm) {
- this.throwExceptionForEmptyTerm = throwExceptionForEmptyTerm;
- }
-
protected static boolean isCharEscaped(String s, int i) {
int j = i;
int esc = 0;
Index: lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java
===================================================================
--- lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (revision 1576127)
+++ lucene/queryparser/src/java/org/apache/lucene/queryparser/spans/SpanQueryParser.java (working copy)
@@ -33,7 +33,7 @@
import org.apache.lucene.util.Version;
/**
- * This parser leverages the power of SpanQuery and can combine them with
+ * This parser leverages the power of SpanQuery and can combine them with
* traditional boolean logic and multiple field information.
* This parser includes functionality from:
*
@@ -45,14 +45,11 @@
*
*
- *
+ *
* Background * This parser is designed to expose as much of the sophistication as is available within the Query/SpanQuery components. * The basic approach of this parser is to build BooleanQueries comprised of SpanQueries. The parser recursively works * through boolean/fielded chunks and then recursively works through SpanQueries. - *
* ** Goals for this parser: @@ -63,8 +60,7 @@ *
Similarities and Differences
+ *Similarities and Differences * *
Same as classic syntax: *
Main additions in SpanQueryParser syntax vs. classic: *
* Trivial additions: *
Analysis * You can specify different analyzers * to handle whole term versus multiterm components. - *
* ** Using quotes for a single term @@ -125,37 +118,42 @@ * Remember to use quotes or use escapes for anything with backslashes or hyphens: * 12/02/04 (is broken into a term "12", a regex "/02/" and a term "04") * '12/02/04' is treated a a single token. - *
+ * + * *Stop word handling - *
- *The user can choose to throw a {@link org.apache.lucene.queryparser.classic.ParseException} if a stop word is encountered. - * If SpanQueryParserBase.throwExceptionForEmptyTerm is set to false (default), the following should happen. - *
+ *The parser tries to replicate the behavior of the Classic QueryParser. Stop words + * are generally ignored. + *
However, in a "near" query, extra slop is added for each stop word that + * occurs after the first non-stop word and before the last non-stop word (that is, leading and trailing stop words + * do not add to the slop). + * For example, "walked the dog" is converted to "walked dog"~>1 behind the scenes. Like the Classic + * QueryParser, this will lead to false positives with any word between "walked" and "dog". Unlike + * Classic QueryParser, this will also lead to false positives for "walked dog". *
+ * Examples + *
*
A parse exception is currently always thrown if the parser analyzes a multiterm, and a subcomponent of the * multiterm has a stopword: the*tre - *
*Expert: Other subtle differences between SpanQueryParser and classic QueryParser. *
Truly Expert: there are a few other very subtle differences that are documented in comments * in the sourcecode in the header of SpanQueryParser. - *
** NOTE You must add the sandbox jar to your class path to include * the currently deprecated {@link org.apache.lucene.sandbox.queries.SlowFuzzyQuery}. - *
*Limitations of SpanQueryParser compared with classic QueryParser: *