Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 779312) +++ CHANGES.txt (working copy) @@ -186,6 +186,15 @@ 10. LUCENE-1647: Fix case where IndexReader.undeleteAll would cause the segment's deletion count to be incorrect. (Mike McCandless) +11. LUCENE-1542: When the first token(s) of a field have zero position + increment, IndexWriter used to incorrectly record the position of + such tokens as -1, if no payload is present, or Integer.MAX_VALUE + if a payload is present. This causes problems for *SpanQuery (at + least). We've fixed this to consistently record position 0, but + if you rely on the old buggy behavior you should call the + deprecated IndexWriter.setAllowMinus1Position(). (Jonathan Mamou, + Mark Miller via Mike McCandless) + New features 1. LUCENE-1411: Added expert API to open an IndexWriter on a prior Index: src/test/org/apache/lucene/search/TestPositionIncrement.java =================================================================== --- src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 779312) +++ src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy) @@ -17,8 +17,11 @@ * limitations under the License. 
*/ +import java.io.Reader; import java.io.IOException; -import java.io.Reader; +import java.io.StringReader; +import java.util.Collection; +import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; @@ -26,14 +29,27 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermPositions; import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.index.Payload; +import org.apache.lucene.search.payloads.PayloadSpanUtil; +import org.apache.lucene.search.spans.PayloadSpans; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; /** * Term position unit test. 
@@ -48,7 +64,7 @@ public TokenStream tokenStream(String fieldName, Reader reader) { return new TokenStream() { private final String[] TOKENS = {"1", "2", "3", "4", "5"}; - private final int[] INCREMENTS = {1, 2, 1, 0, 1}; + private final int[] INCREMENTS = {0, 2, 1, 0, 1}; private int i = 0; PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); @@ -67,7 +83,7 @@ }; } }; - RAMDirectory store = new RAMDirectory(); + Directory store = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(store, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); Document d = new Document(); @@ -75,8 +91,20 @@ writer.addDocument(d); writer.optimize(); writer.close(); + IndexSearcher searcher = new IndexSearcher(store); + + TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1")); + pos.next(); + // first token should be at position 0 + assertEquals(0, pos.nextPosition()); + + pos = searcher.getIndexReader().termPositions(new Term("field", "2")); + pos.next(); + // second token should be at position 2 + assertEquals(2, pos.nextPosition()); + PhraseQuery q; ScoreDoc[] hits; @@ -202,4 +230,146 @@ StopFilter.setEnablePositionIncrementsDefault(dflt); } } + + public void testPayloadsPos0() throws Exception { + for(int x=0;x<2;x++) { + Directory dir = new MockRAMDirectory(); + IndexWriter writer = new IndexWriter(dir, + new TestPayloadAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + if (x == 1) { + writer.setAllowMinus1Position(); + } + Document doc = new Document(); + doc.add(new Field("content", + new StringReader("a a b c d e a f g h i j a b k k"))); + writer.addDocument(doc); + + IndexReader r = writer.getReader(); + + TermPositions tp = r.termPositions(new Term("content", "a")); + int count = 0; + assertTrue(tp.next()); + // "a" occurs 4 times + assertEquals(4, tp.freq()); + int expected; + if (x == 1) { + expected = Integer.MAX_VALUE; + } else { + expected = 0; + } + 
assertEquals(expected, tp.nextPosition()); + if (x == 1) { + continue; + } + assertEquals(1, tp.nextPosition()); + assertEquals(3, tp.nextPosition()); + assertEquals(6, tp.nextPosition()); + + // only one doc has "a" + assertFalse(tp.next()); + + IndexSearcher is = new IndexSearcher(r); + + SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a")); + SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k")); + SpanQuery[] sqs = { stq1, stq2 }; + SpanNearQuery snq = new SpanNearQuery(sqs, 30, false); + + count = 0; + boolean sawZero = false; + //System.out.println("\ngetPayloadSpans test"); + PayloadSpans pspans = snq.getPayloadSpans(is.getIndexReader()); + while (pspans.next()) { + //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end()); + Collection payloads = pspans.getPayload(); + sawZero |= pspans.start() == 0; + for (Iterator it = payloads.iterator(); it.hasNext();) { + count++; + it.next(); + //System.out.println(new String((byte[]) it.next())); + } + } + assertEquals(5, count); + assertTrue(sawZero); + + //System.out.println("\ngetSpans test"); + Spans spans = snq.getSpans(is.getIndexReader()); + count = 0; + sawZero = false; + while (spans.next()) { + count++; + sawZero |= spans.start() == 0; + //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end()); + } + assertEquals(4, count); + assertTrue(sawZero); + + //System.out.println("\nPayloadSpanUtil test"); + + sawZero = false; + PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader()); + Collection pls = psu.getPayloadsForQuery(snq); + count = pls.size(); + for (Iterator it = pls.iterator(); it.hasNext();) { + String s = new String((byte[]) it.next()); + //System.out.println(s); + sawZero |= s.equals("pos: 0"); + } + assertEquals(5, count); + assertTrue(sawZero); + writer.close(); + is.getIndexReader().close(); + dir.close(); + } + } } + +class TestPayloadAnalyzer extends Analyzer { + + public TokenStream tokenStream(String fieldName, Reader 
reader) { + TokenStream result = new LowerCaseTokenizer(reader); + return new PayloadFilter(result, fieldName); + } +} + +class PayloadFilter extends TokenFilter { + String fieldName; + + int pos; + + int i; + + final PositionIncrementAttribute posIncrAttr; + final PayloadAttribute payloadAttr; + final TermAttribute termAttr; + + public PayloadFilter(TokenStream input, String fieldName) { + super(input); + this.fieldName = fieldName; + pos = 0; + i = 0; + posIncrAttr = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class); + payloadAttr = (PayloadAttribute) input.addAttribute(PayloadAttribute.class); + termAttr = (TermAttribute) input.addAttribute(TermAttribute.class); + } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes())); + int posIncr; + if (i % 2 == 1) { + posIncr = 1; + } else { + posIncr = 0; + } + posIncrAttr.setPositionIncrement(posIncr); + pos += posIncr; + // System.out.println("term=" + termAttr.term() + " pos=" + pos); + i++; + return true; + } else { + return false; + } + } +} Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 779312) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -3594,7 +3594,7 @@ TermPositions tps = s.getIndexReader().termPositions(new Term("field", "a")); assertTrue(tps.next()); assertEquals(1, tps.freq()); - assertEquals(-1, tps.nextPosition()); + assertEquals(0, tps.nextPosition()); w.close(); assertTrue(_TestUtil.checkIndex(dir)); Index: src/java/org/apache/lucene/index/DocInverterPerField.java =================================================================== --- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 779312) +++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy) @@ -126,6 
+126,9 @@ // reset the TokenStream to the first token stream.reset(); + // deprecated + final boolean allowMinus1Position = docState.allowMinus1Position; + try { int offsetEnd = fieldState.offset-1; @@ -162,7 +165,11 @@ } final int posIncr = posIncrAttribute.getPositionIncrement(); - fieldState.position += posIncr - 1; + fieldState.position += posIncr; + if (allowMinus1Position || fieldState.position > 0) { + fieldState.position--; + } + if (posIncr == 0) fieldState.numOverlap++; Index: src/java/org/apache/lucene/index/DocumentsWriterThreadState.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (revision 779312) +++ src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (working copy) @@ -40,6 +40,7 @@ docState.infoStream = docWriter.infoStream; docState.similarity = docWriter.similarity; docState.docWriter = docWriter; + docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position(); consumer = docWriter.consumer.addThread(this); } Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 779312) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -150,6 +150,9 @@ Document doc; String maxTermPrefix; + // deprecated + boolean allowMinus1Position; + // Only called by asserts public boolean testPoint(String name) { return docWriter.writer.testPoint(name); @@ -298,6 +301,11 @@ threadStates[i].docState.similarity = similarity; } + synchronized void setAllowMinus1Position() { + for(int i=0;i