Index: lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java (revision 1514517) +++ lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java (working copy) @@ -31,16 +31,19 @@ */ public final class BinaryTokenStream extends TokenStream { private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class); + private final BytesRef bytes; private boolean available = true; public BinaryTokenStream(BytesRef bytes) { - bytesAtt.setBytesRef(bytes); + this.bytes = bytes; } @Override public boolean incrementToken() { if (available) { + clearAttributes(); available = false; + bytesAtt.setBytesRef(bytes); return true; } return false; Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 1514517) +++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -50,6 +50,7 @@ import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; @@ -72,6 +73,9 @@ import org.apache.lucene.util.SetOnce; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.packed.PackedInts; import org.junit.Test; @@ -1900,6 +1904,65 @@ } } + // LUCENE-3849 + public void testStopwordsPosIncHole() throws Exception { + Directory dir = 
newDirectory(); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader); + TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET); + return new TokenStreamComponents(tokenizer, stream); + } + }; + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a); + Document doc = new Document(); + doc.add(new TextField("body", "just a", Field.Store.NO)); + doc.add(new TextField("body", "test of gaps", Field.Store.NO)); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("body", "just"), 0); + pq.add(new Term("body", "test"), 2); + // body:"just ? test" + assertEquals(1, is.search(pq, 5).totalHits); + ir.close(); + dir.close(); + } + + // LUCENE-3849 + public void testStopwordsPosIncHole2() throws Exception { + // use two stopfilters for testing here + Directory dir = newDirectory(); + final Automaton secondSet = BasicAutomata.makeString("foobar"); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader); + TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET); + stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet)); + return new TokenStreamComponents(tokenizer, stream); + } + }; + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a); + Document doc = new Document(); + doc.add(new TextField("body", "just a foobar", Field.Store.NO)); + doc.add(new TextField("body", "test of gaps", Field.Store.NO)); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("body", "just"), 0); + pq.add(new Term("body", "test"), 3); + // 
body:"just ? ? test" + assertEquals(1, is.search(pq, 5).totalHits); + ir.close(); + dir.close(); + } + // here we do better, there is no current segments file, so we don't delete anything. // however, if you actually go and make a commit, the next time you run indexwriter // this file will be gone. Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java (revision 1514517) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java (working copy) @@ -21,6 +21,7 @@ import java.io.Closeable; import java.lang.reflect.Modifier; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; @@ -159,11 +160,18 @@ * setting the final offset of a stream. The final offset of a stream might * differ from the offset of the last token eg in case one or more whitespaces * followed after the last token, but a WhitespaceTokenizer was used. + *
+ * Additionally, any skipped positions (such as those removed by a stopfilter)
+ * can be applied to the position increment here; other attributes whose
+ * end-of-stream value is important may be adjusted here as well.
*
* @throws IOException If an I/O error occurs
*/
public void end() throws IOException {
- // do nothing by default
+ clearAttributes(); // LUCENE-3849: don't let consumers see stale attribute values from the last token
+ if (hasAttribute(PositionIncrementAttribute.class)) {
+ getAttribute(PositionIncrementAttribute.class).setPositionIncrement(0);
+ }
}
/**
Index: lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java (revision 1514517)
+++ lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java (working copy)
@@ -175,7 +175,10 @@
}
// trigger streams to perform end-of-stream operations
stream.end();
-
+ // TODO: maybe add some safety? then again, it's already checked
+ // when we come back around to the field...
+ // LUCENE-3849: end() now reports any trailing position increment (e.g. trailing stopwords)
+ fieldState.position += posIncrAttribute.getPositionIncrement();
fieldState.offset += offsetAttribute.endOffset();
success2 = true;
} finally {
Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (revision 1514517)
+++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (working copy)
@@ -264,7 +264,8 @@
}
@Override
- public final void end(){
+ public final void end() throws IOException {
+ super.end();
offsetAtt.setOffset(getFinalOffset(),getFinalOffset());
}
Index: lucene/facet/src/java/org/apache/lucene/facet/index/DrillDownStream.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/index/DrillDownStream.java (revision 1514517)
+++ lucene/facet/src/java/org/apache/lucene/facet/index/DrillDownStream.java (working copy)
@@ -49,7 +49,7 @@
// a hook for AssociationsDrillDownStream to add the associations payload to
// the drill-down terms
}
-
+
@Override
public final boolean incrementToken() throws IOException {
if (current.length == 0) {
Index: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java (revision 1514517)
+++ lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java (working copy)
@@ -555,12 +555,16 @@
private CharTermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private boolean returned;
+ private int val;
+ private final String word;
+
public SinglePositionTokenStream(String word) {
termAtt = addAttribute(CharTermAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- termAtt.setEmpty().append(word);
+ this.word = word;
returned = true;
}
+
/**
* Set the value we want to keep, as the position increment.
* Note that when TermPositions.nextPosition() is later used to
@@ -574,15 +578,21 @@
* This change is described in Lucene's JIRA: LUCENE-1542.
*/
public void set(int val) {
- posIncrAtt.setPositionIncrement(val);
+ this.val = val;
returned = false;
}
+
@Override
public boolean incrementToken() throws IOException {
if (returned) {
return false;
}
- return returned = true;
+ clearAttributes();
+ posIncrAtt.setPositionIncrement(val);
+ termAtt.setEmpty();
+ termAtt.append(word);
+ returned = true;
+ return true;
}
}
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (revision 1514517)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (working copy)
@@ -255,16 +255,33 @@
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
}
}
+
if (ts.incrementToken()) {
fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString());
}
+
+ // repeat our extra safety checks for end()
+ ts.clearAttributes();
+ if (termAtt != null) termAtt.setEmpty().append("bogusTerm");
+ if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
+ if (typeAtt != null) typeAtt.setType("bogusType");
+ if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
+ if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
+
+ checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
+
ts.end();
+ assertTrue("super.end()/clearAttributes() was not called correctly in end()", checkClearAtt.getAndResetClearCalled());
+
if (finalOffset != null) {
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
}
if (offsetAtt != null) {
assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
}
+
+ // TODO: should we also sanity-check the final position increment here?
+
ts.close();
}
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java (revision 1514517)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java (working copy)
@@ -244,6 +244,7 @@
@Override
public void end() throws IOException {
+ super.end();
int finalOffset = correctOffset(off);
offsetAtt.setOffset(finalOffset, finalOffset);
// some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java (revision 1514517)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java (working copy)
@@ -58,7 +58,8 @@
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-
+ private int skippedPositions;
+
/**
* Create a new MockTokenFilter.
*
@@ -76,7 +77,7 @@
// initial token with posInc=0 ever
// return the first non-stop word found
- int skippedPositions = 0;
+ skippedPositions = 0;
while (input.incrementToken()) {
if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
@@ -87,4 +88,16 @@
// reached EOS -- return false
return false;
}
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ skippedPositions = 0;
+ }
}
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java (revision 1514517)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java (working copy)
@@ -50,7 +50,6 @@
private final CharArraySet stopWords;
private State endState;
- private boolean ended;
/** Sole constructor. */
public SuggestStopFilter(TokenStream input, CharArraySet stopWords) {
@@ -61,28 +60,24 @@
@Override
public void reset() throws IOException {
super.reset();
- ended = false;
endState = null;
}
@Override
public void end() throws IOException {
- if (!ended) {
+ if (endState == null) {
super.end();
} else {
// NOTE: we already called .end() from our .next() when
// the stream was complete, so we do not call
// super.end() here
-
- if (endState != null) {
- restoreState(endState);
- }
+ restoreState(endState);
}
}
@Override
public boolean incrementToken() throws IOException {
- if (ended) {
+ if (endState != null) {
return false;
}
@@ -101,8 +96,9 @@
// It was a stopword; skip it
skippedPositions += posInc;
} else {
+ clearAttributes();
input.end();
- ended = true;
+ endState = captureState();
int finalEndOffset = offsetAtt.endOffset();
assert finalEndOffset >= endOffset;
if (finalEndOffset > endOffset) {
@@ -112,7 +108,6 @@
} else {
// No token separator after final token that
// looked like a stop-word; don't filter it:
- endState = captureState();
restoreState(sav);
posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement());
keywordAtt.setKeyword(true);
Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
===================================================================
--- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (revision 1514517)
+++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (working copy)
@@ -120,7 +120,8 @@
}
@Override
- public void end() {
+ public void end() throws IOException {
+ super.end();
final int finalOffset = (length < 0) ? offset : offset + length;
offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (revision 1514517)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (working copy)
@@ -80,7 +80,8 @@
}
@Override
- public final void end() {
+ public final void end() throws IOException {
+ super.end();
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (revision 1514517)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (working copy)
@@ -309,7 +309,8 @@
}
@Override
- public void end() {
+ public void end() throws IOException {
+ super.end();
// set final offset
final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
this.offsetAtt.setOffset(finalOffset, finalOffset);
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java (revision 1514517)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java (working copy)
@@ -176,7 +176,8 @@
}
@Override
- public final void end() {
+ public final void end() throws IOException {
+ super.end();
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java (revision 1514517)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java (working copy)
@@ -191,7 +191,8 @@
}
@Override
- public final void end() {
+ public final void end() throws IOException {
+ super.end();
// set final offset
int finalOffset = correctOffset(charsRead);
offsetAtt.setOffset(finalOffset, finalOffset);
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (revision 1514517)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (working copy)
@@ -76,6 +76,8 @@
"