Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java =================================================================== --- src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 606689) +++ src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy) @@ -838,19 +838,42 @@ public void testStopwords() throws Exception { QueryParser qp = new QueryParser("a", new StopAnalyzer(new String[]{"the", "foo"})); Query result = qp.parse("a:the OR a:foo"); - assertTrue("result is null and it shouldn't be", result != null); + assertNotNull("result is null and it shouldn't be", result); assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery); assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 0, ((BooleanQuery) result).clauses().size() == 0); result = qp.parse("a:woo OR a:the"); - assertTrue("result is null and it shouldn't be", result != null); + assertNotNull("result is null and it shouldn't be", result); assertTrue("result is not a TermQuery", result instanceof TermQuery); result = qp.parse("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)"); - assertTrue("result is null and it shouldn't be", result != null); + assertNotNull("result is null and it shouldn't be", result); assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery); System.out.println("Result: " + result); assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 2, ((BooleanQuery) result).clauses().size() == 2); } + public void testPositionIncrement() throws Exception { + boolean dflt = StopFilter.getDefaultApplyPositionIncrements(); + StopFilter.setDefaultApplyPositionIncrements(true); + try { + QueryParser qp = new QueryParser("a", new StopAnalyzer(new String[]{"the", "in", "are", "this"})); + qp.setApplyPositionIncrements(true); + String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; + // 0 2 5 7 8 + int expectedPositions[] = {1,3,4,6,9}; 
+ PhraseQuery pq = (PhraseQuery) qp.parse(qtxt); + //System.out.println("Query text: "+qtxt); + //System.out.println("Result: "+pq); + Term t[] = pq.getTerms(); + int pos[] = pq.getPositions(); + for (int i = 0; i < t.length; i++) { + //System.out.println(i+". "+t[i]+" pos: "+pos[i]); + assertEquals("term "+i+" = "+t[i]+" has wrong term-position!",expectedPositions[i],pos[i]); + } + } finally { + StopFilter.setDefaultApplyPositionIncrements(dflt); + } + } + public void testMatchAllDocs() throws Exception { QueryParser qp = new QueryParser("field", new WhitespaceAnalyzer()); assertEquals(new MatchAllDocsQuery(), qp.parse("*:*")); Index: src/test/org/apache/lucene/analysis/TestStopAnalyzer.java =================================================================== --- src/test/org/apache/lucene/analysis/TestStopAnalyzer.java (revision 606689) +++ src/test/org/apache/lucene/analysis/TestStopAnalyzer.java (working copy) @@ -64,7 +64,33 @@ while ((token = stream.next()) != null) { String text = token.termText(); assertFalse(stopWordsSet.contains(text)); + assertEquals(1,token.getPositionIncrement()); // by default stop tokenizer does not apply increments. 
} } - + + public void testStopListPositions() throws IOException { + boolean defaultApply = StopFilter.getDefaultApplyPositionIncrements(); + StopFilter.setDefaultApplyPositionIncrements(true); + try { + Set stopWordsSet = new HashSet(); + stopWordsSet.add("good"); + stopWordsSet.add("test"); + stopWordsSet.add("analyzer"); + StopAnalyzer newStop = new StopAnalyzer((String[])stopWordsSet.toArray(new String[3])); + StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions"); + int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1}; + TokenStream stream = newStop.tokenStream("test", reader); + assertNotNull(stream); + Token token = null; + int i = 0; + while ((token = stream.next()) != null) { + String text = token.termText(); + assertFalse(stopWordsSet.contains(text)); + assertEquals(expectedIncr[i++],token.getPositionIncrement()); + } + } finally { + StopFilter.setDefaultApplyPositionIncrements(defaultApply); + } + } + } Index: src/test/org/apache/lucene/analysis/TestStopFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestStopFilter.java (revision 606689) +++ src/test/org/apache/lucene/analysis/TestStopFilter.java (working copy) @@ -16,10 +16,12 @@ * limitations under the License. */ +import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; import java.io.StringReader; +import java.util.ArrayList; import java.util.Set; /** @@ -27,6 +29,8 @@ */ public class TestStopFilter extends LuceneTestCase { + private final static boolean VERBOSE = true; + // other StopFilter functionality is already tested by TestStopAnalyzer public void testExactCase() throws IOException { @@ -56,4 +60,48 @@ assertEquals(null, stream.next()); } + /** + * Test Position increments applied by StopFilter with and without enabling this option. 
+ */ + public void testStopPositons() throws IOException { + StringBuffer sb = new StringBuffer(); + ArrayList a = new ArrayList(); + for (int i=0; i<20; i++) { + String w = English.intToEnglish(i).trim(); + sb.append(w).append(" "); + if (i%3 != 0) a.add(w); + } + log(sb.toString()); + String stopWords[] = (String[]) a.toArray(new String[0]); + for (int i=0; i test with apply-increments-"+(applyIcrements?"enabled":"disabled")); + stpf.setApplyPositionIncrements(applyIcrements); + for (int i=0; i<20; i+=3) { + Token t = stpf.next(); + log("Token "+i+": "+t); + String w = English.intToEnglish(i).trim(); + assertEquals("expecting token "+i+" to be "+w,w,t.termText()); + assertEquals("all but first token must have position increment of 3",t.getPositionIncrement(),applyIcrements?(i==0?1:3):1); + } + assertNull(stpf.next()); + } + + // print debug info depending on VERBOSE + private static void log(String s) { + if (VERBOSE) { + System.out.println(s); + } + } } Index: src/test/org/apache/lucene/search/TestPositionIncrement.java =================================================================== --- src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 606689) +++ src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy) @@ -19,11 +19,14 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopAnalyzer; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; @@ -80,7 +83,21 @@ hits = searcher.search(q); assertEquals(0, hits.length()); + // same as previous, just specify 
positions explicitely. + q = new PhraseQuery(); + q.add(new Term("field", "1"),0); + q.add(new Term("field", "2"),1); + hits = searcher.search(q); + assertEquals(0, hits.length()); + + // specifying correct positions should find the phrase. q = new PhraseQuery(); + q.add(new Term("field", "1"),0); + q.add(new Term("field", "2"),2); + hits = searcher.search(q); + assertEquals(1, hits.length()); + + q = new PhraseQuery(); q.add(new Term("field", "2")); q.add(new Term("field", "3")); hits = searcher.search(q); @@ -92,7 +109,29 @@ hits = searcher.search(q); assertEquals(0, hits.length()); + // phrase query would find it when correct positions are specified. q = new PhraseQuery(); + q.add(new Term("field", "3"),0); + q.add(new Term("field", "4"),0); + hits = searcher.search(q); + assertEquals(1, hits.length()); + + // phrase query should fail for non existing searched term + // even if there exist another searched terms in the same searched position. + q = new PhraseQuery(); + q.add(new Term("field", "3"),0); + q.add(new Term("field", "9"),0); + hits = searcher.search(q); + assertEquals(0, hits.length()); + + // multi-phrase query should succed for non existing searched term + // because there exist another searched terms in the same searched position. 
+ MultiPhraseQuery mq = new MultiPhraseQuery(); + mq.add(new Term[]{new Term("field", "3"),new Term("field", "9")},0); + hits = searcher.search(mq); + assertEquals(1, hits.length()); + + q = new PhraseQuery(); q.add(new Term("field", "2")); q.add(new Term("field", "4")); hits = searcher.search(q); @@ -115,6 +154,50 @@ q.add(new Term("field", "5")); hits = searcher.search(q); assertEquals(0, hits.length()); + + // analyzer to introduce stopwords and increment gaps + Analyzer stpa = new Analyzer() { + final WhitespaceAnalyzer a = new WhitespaceAnalyzer(); + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream ts = a.tokenStream(fieldName,reader); + return new StopFilter(ts,new String[]{"stop"}); + } + }; + + // should not find "1 2" because there is a gap of 1 in the index + QueryParser qp = new QueryParser("field",stpa); + q = (PhraseQuery) qp.parse("\"1 2\""); + hits = searcher.search(q); + assertEquals(0, hits.length()); + + // omitted stop word cannot help because stop filter swallows the increments. + q = (PhraseQuery) qp.parse("\"1 stop 2\""); + hits = searcher.search(q); + assertEquals(0, hits.length()); + + // query parser alone won't help, because stop filter swallows the increments. + qp.setApplyPositionIncrements(true); + q = (PhraseQuery) qp.parse("\"1 stop 2\""); + hits = searcher.search(q); + assertEquals(0, hits.length()); + + boolean dflt = StopFilter.getDefaultApplyPositionIncrements(); + try { + // stop filter alone won't help, because query parser swallows the increments. + qp.setApplyPositionIncrements(false); + StopFilter.setDefaultApplyPositionIncrements(true); + q = (PhraseQuery) qp.parse("\"1 stop 2\""); + hits = searcher.search(q); + assertEquals(0, hits.length()); + + // when both qp qnd stopFilter propagate increments, we should find the doc. 
+ qp.setApplyPositionIncrements(true); + q = (PhraseQuery) qp.parse("\"1 stop 2\""); + hits = searcher.search(q); + assertEquals(1, hits.length()); + } finally { + StopFilter.setDefaultApplyPositionIncrements(dflt); + } } /** Index: src/java/org/apache/lucene/queryParser/QueryParser.java =================================================================== --- src/java/org/apache/lucene/queryParser/QueryParser.java (revision 606689) +++ src/java/org/apache/lucene/queryParser/QueryParser.java (working copy) @@ -100,6 +100,7 @@ boolean lowercaseExpandedTerms = true; boolean useOldRangeQuery= false; boolean allowLeadingWildcard = false; + boolean applyPositionIncrements = false; Analyzer analyzer; String field; @@ -234,13 +235,34 @@ } /** - * @see #setAllowLeadingWildcard + * @see #setAllowLeadingWildcard(boolean) */ public boolean getAllowLeadingWildcard() { return allowLeadingWildcard; } /** + * Set to true to apply postion increments to result query. + *

<p> + * When set, result phrase and multi-phrase queries will + * be aware of position increments. + * Useful when e.g. a StopFilter increases the position increment of + * the token that follows an omitted token. + * <p>

+ * Default: false. + */ + public void setApplyPositionIncrements(boolean applyPositionIncrements) { + this.applyPositionIncrements = applyPositionIncrements; + } + + /** + * @see #setApplyPositionIncrements(boolean) + */ + public boolean getApplyPositionIncrements() { + return applyPositionIncrements; + } + + /** * Sets the boolean operator of the QueryParser. * In default mode (OR_OPERATOR) terms without any modifiers * are considered optional: for example capital of Hungary is equal to @@ -478,27 +500,42 @@ MultiPhraseQuery mpq = new MultiPhraseQuery(); mpq.setSlop(phraseSlop); List multiTerms = new ArrayList(); + int position = -1; for (int i = 0; i < v.size(); i++) { t = (org.apache.lucene.analysis.Token) v.elementAt(i); - if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) { - mpq.add((Term[])multiTerms.toArray(new Term[0])); + if (t.getPositionIncrement() > 0 && multiTerms.size() > 0) { + if (applyPositionIncrements) { + mpq.add((Term[])multiTerms.toArray(new Term[0]),position); + } else { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + } multiTerms.clear(); } + position += t.getPositionIncrement(); multiTerms.add(new Term(field, t.termText())); } - mpq.add((Term[])multiTerms.toArray(new Term[0])); + if (applyPositionIncrements) { + mpq.add((Term[])multiTerms.toArray(new Term[0]),position); + } else { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + } return mpq; } } else { - PhraseQuery q = new PhraseQuery(); - q.setSlop(phraseSlop); + PhraseQuery pq = new PhraseQuery(); + pq.setSlop(phraseSlop); + int position = -1; for (int i = 0; i < v.size(); i++) { - q.add(new Term(field, ((org.apache.lucene.analysis.Token) - v.elementAt(i)).termText())); - + t = (org.apache.lucene.analysis.Token) v.elementAt(i); + if (applyPositionIncrements) { + position += t.getPositionIncrement(); + pq.add(new Term(field, t.termText()),position); + } else { + pq.add(new Term(field, t.termText())); + } } - return q; + return pq; } } } @@ -1262,12 +1299,6 @@ 
finally { jj_save(0, xla); } } - final private boolean jj_3R_3() { - if (jj_scan_token(STAR)) return true; - if (jj_scan_token(COLON)) return true; - return false; - } - final private boolean jj_3R_2() { if (jj_scan_token(TERM)) return true; if (jj_scan_token(COLON)) return true; @@ -1284,6 +1315,12 @@ return false; } + final private boolean jj_3R_3() { + if (jj_scan_token(STAR)) return true; + if (jj_scan_token(COLON)) return true; + return false; + } + public QueryParserTokenManager token_source; public Token token, jj_nt; private int jj_ntk; Index: src/java/org/apache/lucene/queryParser/QueryParser.jj =================================================================== --- src/java/org/apache/lucene/queryParser/QueryParser.jj (revision 606689) +++ src/java/org/apache/lucene/queryParser/QueryParser.jj (working copy) @@ -124,6 +124,7 @@ boolean lowercaseExpandedTerms = true; boolean useOldRangeQuery= false; boolean allowLeadingWildcard = false; + boolean applyPositionIncrements = false; Analyzer analyzer; String field; @@ -258,13 +259,34 @@ } /** - * @see #setAllowLeadingWildcard + * @see #setAllowLeadingWildcard(boolean) */ public boolean getAllowLeadingWildcard() { return allowLeadingWildcard; } /** + * Set to true to apply postion increments to result query. + *

<p> + * When set, result phrase and multi-phrase queries will + * be aware of position increments. + * Useful when e.g. a StopFilter increases the position increment of + * the token that follows an omitted token. + * <p>

+ * Default: false. + */ + public void setApplyPositionIncrements(boolean applyPositionIncrements) { + this.applyPositionIncrements = applyPositionIncrements; + } + + /** + * @see #setApplyPositionIncrements(boolean) + */ + public boolean getApplyPositionIncrements() { + return applyPositionIncrements; + } + + /** * Sets the boolean operator of the QueryParser. * In default mode (OR_OPERATOR) terms without any modifiers * are considered optional: for example capital of Hungary is equal to @@ -502,27 +524,42 @@ MultiPhraseQuery mpq = new MultiPhraseQuery(); mpq.setSlop(phraseSlop); List multiTerms = new ArrayList(); + int position = -1; for (int i = 0; i < v.size(); i++) { t = (org.apache.lucene.analysis.Token) v.elementAt(i); - if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) { - mpq.add((Term[])multiTerms.toArray(new Term[0])); + if (t.getPositionIncrement() > 0 && multiTerms.size() > 0) { + if (applyPositionIncrements) { + mpq.add((Term[])multiTerms.toArray(new Term[0]),position); + } else { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + } multiTerms.clear(); } + position += t.getPositionIncrement(); multiTerms.add(new Term(field, t.termText())); } - mpq.add((Term[])multiTerms.toArray(new Term[0])); + if (applyPositionIncrements) { + mpq.add((Term[])multiTerms.toArray(new Term[0]),position); + } else { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + } return mpq; } } else { - PhraseQuery q = new PhraseQuery(); - q.setSlop(phraseSlop); + PhraseQuery pq = new PhraseQuery(); + pq.setSlop(phraseSlop); + int position = -1; for (int i = 0; i < v.size(); i++) { - q.add(new Term(field, ((org.apache.lucene.analysis.Token) - v.elementAt(i)).termText())); - + t = (org.apache.lucene.analysis.Token) v.elementAt(i); + if (applyPositionIncrements) { + position += t.getPositionIncrement(); + pq.add(new Term(field, t.termText()),position); + } else { + pq.add(new Term(field, t.termText())); + } } - return q; + return pq; } } } Index: 
src/java/org/apache/lucene/analysis/StopFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/StopFilter.java (revision 606689) +++ src/java/org/apache/lucene/analysis/StopFilter.java (working copy) @@ -27,7 +27,10 @@ public final class StopFilter extends TokenFilter { + private static boolean APPLY_POSITION_INCREMENTS_DEFAULT = false; + private final CharArraySet stopWords; + private boolean applyPositionIncrements = APPLY_POSITION_INCREMENTS_DEFAULT; /** * Construct a token stream filtering the given input. @@ -111,11 +114,58 @@ */ public final Token next(Token result) throws IOException { // return the first non-stop word found + int skippedPositions = 0; while((result = input.next(result)) != null) { - if (!stopWords.contains(result.termBuffer(), 0, result.termLength)) + if (!stopWords.contains(result.termBuffer(), 0, result.termLength)) { + if (applyPositionIncrements) { + result.setPositionIncrement(result.getPositionIncrement() + skippedPositions); + } return result; + } + skippedPositions += result.getPositionIncrement(); } // reached EOS -- return null return null; } + + /** + * @see #setDefaultApplyPositionIncrements(boolean). + */ + public static boolean getDefaultApplyPositionIncrements() { + return APPLY_POSITION_INCREMENTS_DEFAULT; + } + + /** + * Set the default position increments behavior of every StopFilter created from now on. + *

<p> + * Note: behavior of a single StopFilter instance can be modified + * with {@link #setApplyPositionIncrements(boolean)}. + * This static default also gives control over the behavior of classes using StopFilters internally, + * for example {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer}. + * <p>

+ * Default : false. + * @see #setApplyPositionIncrements(boolean). + */ + public static void setDefaultApplyPositionIncrements(boolean defaultApplyPositionIncrements) { + APPLY_POSITION_INCREMENTS_DEFAULT = defaultApplyPositionIncrements; + } + + /** + * @see #setApplyPositionIncrements(boolean). + */ + public boolean getApplyPositionIncrements() { + return applyPositionIncrements; + } + + /** + * Set to true to make this StopFilter apply position increments to result tokens. + *

<p> + * When set, each time a token is stopped (omitted), the position increment of + * the following token is increased by the position increment of the omitted token. + * <p>

+ * Default: see {@link #setDefaultApplyPositionIncrements(boolean)}. + */ + public void setApplyPositionIncrements(boolean applyPositionIncrements) { + this.applyPositionIncrements = applyPositionIncrements; + } }