Index: modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java =================================================================== --- modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java (revision 1231394) +++ modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java (working copy) @@ -53,7 +53,7 @@ this.buffer = buffer; // TODO (Facet): avoid Multi*? Bits liveDocs = MultiFields.getLiveDocs(indexReader); - this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes()); + this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), false); } /** Index: modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ParentArray.java =================================================================== --- modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ParentArray.java (revision 1231394) +++ modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ParentArray.java (working copy) @@ -104,7 +104,8 @@ // TODO (Facet): avoid Multi*? Bits liveDocs = MultiFields.getLiveDocs(indexReader); DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(indexReader, liveDocs, - Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT)); + Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT), + false); if ((positions == null || positions.advance(first) == DocsAndPositionsEnum.NO_MORE_DOCS) && first < num) { throw new CorruptIndexException("Missing parent data for category " + first); } Index: modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestClassicAnalyzer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestClassicAnalyzer.java (revision 1231394) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestClassicAnalyzer.java (working copy) @@ -283,7 +283,8 @@ DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "content", - new BytesRef("another")); + new BytesRef("another"), + false); assertTrue(tps.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(1, tps.freq()); assertEquals(3, tps.nextPosition()); Index: modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java (revision 1231394) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java (working copy) @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -110,16 +109,15 @@ TermsEnum termsEnum = vector.iterator(null); termsEnum.next(); assertEquals(2, termsEnum.totalTermFreq()); - DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = positions.attributes().getAttribute(OffsetAttribute.class); + DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null, true); assertTrue(positions.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(2, 
positions.freq()); positions.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, positions.startOffset()); + assertEquals(4, positions.endOffset()); positions.nextPosition(); - assertEquals(8, offsetAtt.startOffset()); - assertEquals(12, offsetAtt.endOffset()); + assertEquals(8, positions.startOffset()); + assertEquals(12, positions.endOffset()); assertEquals(DocsEnum.NO_MORE_DOCS, positions.nextDoc()); r.close(); dir.close(); Index: lucene/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java =================================================================== --- lucene/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy) @@ -74,7 +74,8 @@ DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "preanalyzed", - new BytesRef("term1")); + new BytesRef("term1"), + false); assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS); assertEquals(1, termPositions.freq()); assertEquals(0, termPositions.nextPosition()); @@ -82,7 +83,8 @@ termPositions = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "preanalyzed", - new BytesRef("term2")); + new BytesRef("term2"), + false); assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS); assertEquals(2, termPositions.freq()); assertEquals(1, termPositions.nextPosition()); @@ -91,7 +93,8 @@ termPositions = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "preanalyzed", - new BytesRef("term3")); + new BytesRef("term3"), + false); assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS); assertEquals(1, termPositions.freq()); assertEquals(2, termPositions.nextPosition()); Index: lucene/src/test/org/apache/lucene/codecs/pulsing/TestPulsingReuse.java =================================================================== --- lucene/src/test/org/apache/lucene/codecs/pulsing/TestPulsingReuse.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/codecs/pulsing/TestPulsingReuse.java (working copy) @@ -23,7 +23,6 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat; -import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; @@ -70,7 +69,7 @@ DocsAndPositionsEnum posReuse = null; te = segment.terms("foo").iterator(null); while (te.next() != null) { - posReuse = te.docsAndPositions(null, posReuse); + posReuse = te.docsAndPositions(null, posReuse, false); allEnums.put(posReuse, true); } @@ -112,7 +111,7 @@ DocsAndPositionsEnum posReuse = null; te = segment.terms("foo").iterator(null); while (te.next() != null) { - posReuse = te.docsAndPositions(null, posReuse); + posReuse = te.docsAndPositions(null, posReuse, false); allEnums.put(posReuse, true); } Index: lucene/src/test/org/apache/lucene/search/TestTermVectors.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestTermVectors.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/search/TestTermVectors.java (working copy) @@ -23,7 +23,6 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; -import 
org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -135,19 +134,19 @@ assertNotNull(terms); TermsEnum termsEnum = terms.iterator(null); assertEquals("content", termsEnum.next().utf8ToString()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, false); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(1, dpEnum.freq()); assertEquals(expectedPositions[0], dpEnum.nextPosition()); assertEquals("here", termsEnum.next().utf8ToString()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, false); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(1, dpEnum.freq()); assertEquals(expectedPositions[1], dpEnum.nextPosition()); assertEquals("some", termsEnum.next().utf8ToString()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, false); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(1, dpEnum.freq()); assertEquals(expectedPositions[2], dpEnum.nextPosition()); @@ -171,31 +170,21 @@ TermsEnum termsEnum = vectors.terms("field").iterator(null); assertNotNull(termsEnum.next()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); - OffsetAttribute offsetAtt = dpEnum == null ? null : dpEnum.attributes().getAttribute(OffsetAttribute.class); boolean shouldBePosVector = hits[i].doc % 2 == 0; - assertTrue(!shouldBePosVector - || (shouldBePosVector && dpEnum != null)); - boolean shouldBeOffVector = hits[i].doc % 3 == 0; - assertTrue(!shouldBeOffVector - || (shouldBeOffVector && offsetAtt != null)); if (shouldBePosVector || shouldBeOffVector) { while(true) { - dpEnum = termsEnum.docsAndPositions(null, dpEnum); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, shouldBeOffVector); assertNotNull(dpEnum); - offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); + assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); - if (shouldBePosVector) { - assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); - } - + dpEnum.nextPosition(); + if (shouldBeOffVector) { - assertNotNull(offsetAtt); - } else { - assertNull(offsetAtt); + assertTrue(dpEnum.startOffset() != -1); + assertTrue(dpEnum.endOffset() != -1); } if (termsEnum.next() == null) { @@ -437,7 +426,7 @@ assertNotNull(termsEnum.next()); assertEquals("one", termsEnum.term().utf8ToString()); assertEquals(5, termsEnum.totalTermFreq()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false); assertNotNull(dpEnum); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(5, dpEnum.freq()); @@ -445,16 +434,14 @@ assertEquals(i, dpEnum.nextPosition()); } - dpEnum = termsEnum.docsAndPositions(null, dpEnum); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, true); assertNotNull(dpEnum); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(5, dpEnum.freq()); for(int i=0;i<5;i++) { dpEnum.nextPosition(); - assertEquals(4*i, offsetAtt.startOffset()); - assertEquals(4*i+3, offsetAtt.endOffset()); + assertEquals(4*i, dpEnum.startOffset()); + assertEquals(4*i+3, dpEnum.endOffset()); } reader.close(); } Index: lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java 
=================================================================== --- lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (working copy) @@ -17,37 +17,39 @@ * limitations under the License. */ -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.TermContext; +import java.io.IOException; +import java.io.Reader; +import java.util.Collection; +import java.util.LinkedList; + import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CannedAnalyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; -import java.io.IOException; -import java.util.Collection; -import java.util.LinkedList; -import java.io.Reader; - /** * This class tests the MultiPhraseQuery class. 
* @@ -329,68 +331,18 @@ indexStore.close(); } - private static class TokenAndPos { - public final String token; - public final int pos; - public TokenAndPos(String token, int pos) { - this.token = token; - this.pos = pos; - } - } - - private static class CannedAnalyzer extends Analyzer { - private final TokenAndPos[] tokens; - - public CannedAnalyzer(TokenAndPos[] tokens) { - this.tokens = tokens; - } - - @Override - public TokenStreamComponents createComponents(String fieldName, Reader reader) { - return new TokenStreamComponents(new CannedTokenizer(tokens)); - } - } - - private static class CannedTokenizer extends Tokenizer { - private final TokenAndPos[] tokens; - private int upto = 0; - private int lastPos = 0; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - - public CannedTokenizer(TokenAndPos[] tokens) { - this.tokens = tokens; - } - - @Override - public final boolean incrementToken() throws IOException { - clearAttributes(); - if (upto < tokens.length) { - final TokenAndPos token = tokens[upto++]; - termAtt.setEmpty(); - termAtt.append(token.token); - posIncrAtt.setPositionIncrement(token.pos - lastPos); - lastPos = token.pos; - return true; - } else { - return false; - } - } - - @Override - public void reset() throws IOException { - super.reset(); - this.upto = 0; - this.lastPos = 0; - } - } - public void testZeroPosIncr() throws IOException { Directory dir = new RAMDirectory(); - final TokenAndPos[] tokens = new TokenAndPos[3]; - tokens[0] = new TokenAndPos("a", 0); - tokens[1] = new TokenAndPos("b", 0); - tokens[2] = new TokenAndPos("c", 0); + final Token[] tokens = new Token[3]; + tokens[0] = new Token(); + tokens[0].append("a"); + tokens[0].setPositionIncrement(1); + tokens[1] = new Token(); + tokens[1].append("b"); + tokens[1].setPositionIncrement(0); + tokens[2] = new Token(); + tokens[2].append("c"); + tokens[2].setPositionIncrement(0); RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens)); Document doc = new Document(); @@ -429,40 +381,47 @@ dir.close(); } - private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] { - new TokenAndPos("x", 0), - new TokenAndPos("a", 1), - new TokenAndPos("1", 1), - new TokenAndPos("m", 2), // not existing, relying on slop=2 - new TokenAndPos("b", 3), - new TokenAndPos("1", 3), - new TokenAndPos("n", 4), // not existing, relying on slop=2 - new TokenAndPos("c", 5), - new TokenAndPos("y", 6) + private static Token makeToken(String text, int posIncr) { + final Token t = new Token(); + t.append(text); + t.setPositionIncrement(posIncr); + return t; + } + + private final static Token[] INCR_0_DOC_TOKENS = new Token[] { + makeToken("x", 1), + makeToken("a", 1), + makeToken("1", 0), + makeToken("m", 1), // not existing, relying on slop=2 + makeToken("b", 1), + makeToken("1", 0), + makeToken("n", 1), // not existing, relying on slop=2 + makeToken("c", 1), + makeToken("y", 1) }; - private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] { - new TokenAndPos("a", 0), - new TokenAndPos("1", 0), - new TokenAndPos("b", 1), - new TokenAndPos("1", 1), - new TokenAndPos("c", 2) + private final static Token[] INCR_0_QUERY_TOKENS_AND = new Token[] { + makeToken("a", 1), + makeToken("1", 0), + makeToken("b", 1), + makeToken("1", 0), + makeToken("c", 1) }; - private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] { - 
{ new TokenAndPos("a", 0) }, - { new TokenAndPos("x", 0), new TokenAndPos("1", 0) }, - { new TokenAndPos("b", 1) }, - { new TokenAndPos("x", 1), new TokenAndPos("1", 1) }, - { new TokenAndPos("c", 2) } + private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new Token[][] { + { makeToken("a", 1) }, + { makeToken("x", 1), makeToken("1", 0) }, + { makeToken("b", 2) }, + { makeToken("x", 2), makeToken("1", 0) }, + { makeToken("c", 3) } }; - private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] { - { new TokenAndPos("x", 0) }, - { new TokenAndPos("a", 0), new TokenAndPos("1", 0) }, - { new TokenAndPos("x", 1) }, - { new TokenAndPos("b", 1), new TokenAndPos("1", 1) }, - { new TokenAndPos("c", 2) } + private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new Token[][] { + { makeToken("x", 1) }, + { makeToken("a", 1), makeToken("1", 0) }, + { makeToken("x", 2) }, + { makeToken("b", 2), makeToken("1", 0) }, + { makeToken("c", 3) } }; /** @@ -515,8 +474,10 @@ */ public void testZeroPosIncrSloppyPqAnd() throws IOException { final PhraseQuery pq = new PhraseQuery(); - for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) { - pq.add(new Term("field",tap.token), tap.pos); + int pos = -1; + for (Token tap : INCR_0_QUERY_TOKENS_AND) { + pos += tap.getPositionIncrement(); + pq.add(new Term("field",tap.toString()), pos); } doTestZeroPosIncrSloppy(pq, 0); pq.setSlop(1); @@ -530,8 +491,10 @@ */ public void testZeroPosIncrSloppyMpqAnd() throws IOException { final MultiPhraseQuery mpq = new MultiPhraseQuery(); - for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) { - mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic + int pos = -1; + for (Token tap : INCR_0_QUERY_TOKENS_AND) { + pos += tap.getPositionIncrement(); + mpq.add(new Term[]{new Term("field",tap.toString())}, pos); //AND logic } doTestZeroPosIncrSloppy(mpq, 0); mpq.setSlop(1); @@ -545,9 +508,9 @@ */ public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException { final MultiPhraseQuery mpq = new MultiPhraseQuery(); - for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) { + for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) { Term[] terms = tapTerms(tap); - final int pos = tap[0].pos; + final int pos = tap[0].getPositionIncrement()-1; mpq.add(terms, pos); //AND logic in pos, OR across lines } doTestZeroPosIncrSloppy(mpq, 0); @@ -562,9 +525,9 @@ */ public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException { final MultiPhraseQuery mpq = new MultiPhraseQuery(); - for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) { + for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) { Term[] terms = tapTerms(tap); - final int pos = tap[0].pos; + final int pos = tap[0].getPositionIncrement()-1; mpq.add(terms, pos); //AND logic in pos, OR across lines } doTestZeroPosIncrSloppy(mpq, 0); @@ -572,10 +535,10 @@ doTestZeroPosIncrSloppy(mpq, 0); } - private Term[] tapTerms(TokenAndPos[] tap) { + private Term[] tapTerms(Token[] tap) { Term[] terms = new Term[tap.length]; for (int i=0; i docID -> tokens + final Map>> actualTokens = new HashMap>>(); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, dir); + + final int numDocs = atLeast(20); + //final int numDocs = atLeast(5); + + FieldType ft = new FieldType(TextField.TYPE_UNSTORED); + + // TODO: randomize what IndexOptions we use; also test + // changing this up in one IW buffered segment...: + 
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + + for(int docCount=0;docCount<numDocs;docCount++) { + Document doc = new Document(); + doc.add(new NumericField("id").setIntValue(docCount)); + final List<Token> tokens = new ArrayList<Token>(); + final int numTokens = atLeast(100); + //final int numTokens = atLeast(20); + int pos = -1; + int offset = 0; + //System.out.println("doc id=" + docCount); + for(int tokenCount=0;tokenCount<numTokens;tokenCount++) { + final String text; + if (random.nextBoolean()) { + text = "a"; + } else if (random.nextBoolean()) { + text = "b"; + } else if (random.nextBoolean()) { + text = "c"; + } else { + text = "d"; + } + int posIncr = random.nextBoolean() ? 1 : random.nextInt(5); + if (tokenCount == 0 && posIncr == 0) { + posIncr = 1; + } + final int offIncr = random.nextBoolean() ? 0 : random.nextInt(5); + final int tokenOffset = random.nextInt(5); + final Token token = makeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset); + if (!actualTokens.containsKey(text)) { + actualTokens.put(text, new HashMap<Integer,List<Token>>()); + } + final Map<Integer,List<Token>> postingsByDoc = actualTokens.get(text); + if (!postingsByDoc.containsKey(docCount)) { + postingsByDoc.put(docCount, new ArrayList<Token>()); + } + postingsByDoc.get(docCount).add(token); + tokens.add(token); + pos += posIncr; + // stuff abs position into type: + token.setType(""+pos); + offset += offIncr + tokenOffset; + //System.out.println(" " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")"); + } + doc.add(new Field("content", new CannedAnalyzer.CannedTokenizer(tokens.toArray(new Token[tokens.size()])), ft)); + w.addDocument(doc); + } + final IndexReader r = w.getReader(); + w.close(); + + final String[] terms = new String[] {"a", "b", "c", "d"}; + for(IndexReader sub : r.getSequentialSubReaders()) { + //System.out.println("\nsub=" + sub); + final TermsEnum termsEnum = sub.fields().terms("content").iterator(null); + DocsEnum docs = null; + DocsAndPositionsEnum docsAndPositions = null; + DocsAndPositionsEnum docsAndPositionsAndOffsets = null; + final int docIDToID[] = FieldCache.DEFAULT.getInts(sub, "id", false); + for(String term : terms) { + //System.out.println(" term=" + term); + if (termsEnum.seekExact(new BytesRef(term), random.nextBoolean())) { + docs = termsEnum.docs(null, docs, true); + assertNotNull(docs); + int doc; + //System.out.println(" doc/freq"); + while((doc = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) { + final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]); + //System.out.println(" doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq"); + assertNotNull(expected); + assertEquals(expected.size(), docs.freq()); + } + + docsAndPositions = termsEnum.docsAndPositions(null, docsAndPositions, false); + assertNotNull(docsAndPositions); + //System.out.println(" doc/freq/pos"); + while((doc = docsAndPositions.nextDoc()) != DocsEnum.NO_MORE_DOCS) { + final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]); + //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq"); + assertNotNull(expected); + assertEquals(expected.size(), docsAndPositions.freq()); + for(Token token : expected) { + int pos = Integer.parseInt(token.type()); + //System.out.println(" pos=" + pos); + assertEquals(pos, docsAndPositions.nextPosition()); + } + } + + docsAndPositionsAndOffsets = termsEnum.docsAndPositions(null, docsAndPositions, true); + assertNotNull(docsAndPositionsAndOffsets); + //System.out.println(" doc/freq/pos/offs"); + while((doc = docsAndPositionsAndOffsets.nextDoc()) != DocsEnum.NO_MORE_DOCS) { + final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]); + //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq"); + assertNotNull(expected); + assertEquals(expected.size(), docsAndPositionsAndOffsets.freq()); + for(Token token : expected) { + int pos = Integer.parseInt(token.type()); + //System.out.println(" pos=" + pos); + assertEquals(pos, docsAndPositionsAndOffsets.nextPosition()); + assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset()); + assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset()); + } + } + } + } + // TODO: test advance: + }
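The trailing TODO above leaves advance() unexercised. A minimal sketch of what such a check could look like against the DocsAndPositionsEnum API this patch introduces; the class and method names here are hypothetical, and target is an arbitrary doc id chosen by the caller:

    import java.io.IOException;

    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.TermsEnum;

    class AdvanceCheck {
      // Hypothetical helper: advance() into the current term's postings and
      // verify offsets remain readable at each position.
      static void checkAdvanceWithOffsets(TermsEnum termsEnum, int target) throws IOException {
        DocsAndPositionsEnum dp = termsEnum.docsAndPositions(null, null, true);
        if (dp == null) {
          return; // offsets (or positions) were not indexed for this field
        }
        if (dp.advance(target) == DocsEnum.NO_MORE_DOCS) {
          return; // no doc at or after target
        }
        for (int i = 0; i < dp.freq(); i++) {
          dp.nextPosition();
          if (dp.startOffset() > dp.endOffset()) {
            throw new AssertionError("startOffset must not exceed endOffset");
          }
        }
      }
    }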
+ r.close(); + dir.close(); + } + + private Token makeToken(String text, int posIncr, int startOffset, int endOffset) { + final Token t = new Token(); + t.append(text); + t.setPositionIncrement(posIncr); + t.setOffset(startOffset, endOffset); + return t; + } +} Property changes on: lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/src/test/org/apache/lucene/index/TestFilterIndexReader.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestFilterIndexReader.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/index/TestFilterIndexReader.java (working copy) @@ -90,8 +90,8 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - return new TestPositions(super.docsAndPositions(liveDocs, reuse == null ? null : ((FilterDocsAndPositionsEnum) reuse).in)); + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + return new TestPositions(super.docsAndPositions(liveDocs, reuse == null ? null : ((FilterDocsAndPositionsEnum) reuse).in, needsOffsets)); } } @@ -166,7 +166,7 @@ assertEquals(TermsEnum.SeekStatus.FOUND, terms.seekCeil(new BytesRef("one"))); DocsAndPositionsEnum positions = terms.docsAndPositions(MultiFields.getLiveDocs(reader), - null); + null, false); while (positions.nextDoc() != DocsEnum.NO_MORE_DOCS) { assertTrue((positions.docID() % 2) == 1); } Index: lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java (working copy) @@ -156,7 +156,8 @@ DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), this.field, - new BytesRef("b")); + new BytesRef("b"), + false); for (int i = 0; i < 10; i++) { tp.nextDoc(); @@ -167,7 +168,8 @@ tp = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), this.field, - new BytesRef("a")); + new BytesRef("a"), + false); for (int i = 0; i < 10; i++) { tp.nextDoc(); Index: lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java (working copy) @@ -96,7 +96,7 @@ public DocsAndPositionsEnum getDocsAndPositions(IndexReader reader, BytesRef bytes, Bits liveDocs) throws IOException { - return reader.termPositionsEnum(null, fieldName, bytes); + return reader.termPositionsEnum(null, fieldName, bytes, false); } /** @@ -358,7 +358,7 @@ writer.addDocument(doc); IndexReader reader = writer.getReader(); IndexReader r = getOnlySegmentReader(reader); - DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar")); + DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar"), false); int docid = disi.docID(); assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); @@ -366,7 +366,7 @@ // now reuse and check again TermsEnum te = r.terms("foo").iterator(null); 
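The reuse assertions in these tests rely on a contract worth spelling out: callers hand the previous enum back as the reuse argument and must iterate whatever instance is returned, since the codec is free to recycle or replace it. A sketch of that idiom under the new three-argument signature, with hypothetical names (ReuseSketch, positionsFor):

    import java.io.IOException;

    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    class ReuseSketch {
      // Seek to a term and return a positions enum, recycling a prior enum
      // when the codec allows it; iterate the returned instance, not "reuse".
      static DocsAndPositionsEnum positionsFor(TermsEnum te, BytesRef term,
                                               DocsAndPositionsEnum reuse) throws IOException {
        if (!te.seekExact(term, true)) {
          return null; // term does not exist
        }
        // false: positions only; pass true when startOffset()/endOffset() are needed
        return te.docsAndPositions(null, reuse, false);
      }
    }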
assertTrue(te.seekExact(new BytesRef("bar"), true)); - disi = te.docsAndPositions(null, disi); + disi = te.docsAndPositions(null, disi, false); docid = disi.docID(); assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); Index: lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy) @@ -406,7 +406,7 @@ BytesRef term2; while((term2 = termsEnum3.next()) != null) { System.out.println(" " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq()); - dpEnum = termsEnum3.docsAndPositions(null, dpEnum); + dpEnum = termsEnum3.docsAndPositions(null, dpEnum, false); if (dpEnum != null) { assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); final int freq = dpEnum.freq(); @@ -440,7 +440,7 @@ BytesRef term2; while((term2 = termsEnum3.next()) != null) { System.out.println(" " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq()); - dpEnum = termsEnum3.docsAndPositions(null, dpEnum); + dpEnum = termsEnum3.docsAndPositions(null, dpEnum, false); if (dpEnum != null) { assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); final int freq = dpEnum.freq(); @@ -630,8 +630,8 @@ assertEquals(termsEnum1.totalTermFreq(), termsEnum2.totalTermFreq()); - dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1); - dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2); + dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1, false); + dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2, false); if (dpEnum1 != null) { assertNotNull(dpEnum2); int docID1 = dpEnum1.nextDoc(); Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -21,15 +21,10 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; -import java.util.Map; -import java.util.Random; import java.util.Set; import org.apache.lucene.analysis.*; @@ -44,6 +39,7 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; @@ -51,7 +47,6 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; @@ -905,7 +900,8 @@ DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(s.getIndexReader(), MultiFields.getLiveDocs(s.getIndexReader()), "field", - new BytesRef("a")); + new BytesRef("a"), + false); assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(1, tps.freq()); @@ -970,14 +966,14 @@ Terms tpv = 
r.getTermVectors(0).terms("field"); TermsEnum termsEnum = tpv.iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false); assertNotNull(dpEnum); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(1, dpEnum.freq()); assertEquals(100, dpEnum.nextPosition()); assertNotNull(termsEnum.next()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, false); assertNotNull(dpEnum); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(1, dpEnum.freq()); @@ -1640,7 +1636,7 @@ // Make sure position is still incremented when // massive term is skipped: - DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another")); + DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"), false); assertEquals(0, tps.nextDoc()); assertEquals(1, tps.freq()); assertEquals(3, tps.nextPosition()); @@ -1767,4 +1763,27 @@ w1.close(); d.close(); } + + public void testChangeIndexOptions() throws Exception { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, + new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))); + + FieldType docsAndFreqs = new FieldType(TextField.TYPE_UNSTORED); + docsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS); + + FieldType docsOnly = new FieldType(TextField.TYPE_UNSTORED); + docsOnly.setIndexOptions(IndexOptions.DOCS_ONLY); + + Document doc = new Document(); + doc.add(new Field("field", "a b c", docsAndFreqs)); + w.addDocument(doc); + w.addDocument(doc); + + doc = new Document(); + doc.add(new Field("field", "a b c", docsOnly)); + w.addDocument(doc); + w.close(); + dir.close(); + } } Index: lucene/src/test/org/apache/lucene/index/TestTermVectorsWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestTermVectorsWriter.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/index/TestTermVectorsWriter.java (working copy) @@ -26,7 +26,6 @@ import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -69,34 +68,30 @@ // Token "" occurred once assertEquals(1, termsEnum.totalTermFreq()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(8, offsetAtt.startOffset()); - assertEquals(8, offsetAtt.endOffset()); + assertEquals(8, dpEnum.startOffset()); + assertEquals(8, dpEnum.endOffset()); assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc()); // Token "abcd" occurred three times assertEquals(new BytesRef("abcd"), termsEnum.next()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); - offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, true); assertEquals(3, termsEnum.totalTermFreq()); 
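The conversions in these term-vector tests all follow one pattern: drop the OffsetAttribute lookup and read offsets directly from the enum returned by docsAndPositions(..., true). A sketch of that consumption pattern, assuming a vector that was indexed with offsets (class and method names hypothetical):

    import java.io.IOException;

    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.TermsEnum;

    class TermVectorOffsetsSketch {
      // Walk one document's term vector, reading offsets straight off the enum.
      static void dumpOffsets(IndexReader r, int docID, String field) throws IOException {
        TermsEnum termsEnum = r.getTermVectors(docID).terms(field).iterator(null);
        DocsAndPositionsEnum dpEnum = null;
        while (termsEnum.next() != null) {
          dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
          if (dpEnum == null || dpEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
            continue; // offsets were not stored for this vector
          }
          for (int i = 0; i < dpEnum.freq(); i++) {
            dpEnum.nextPosition();
            System.out.println(termsEnum.term().utf8ToString()
                + " [" + dpEnum.startOffset() + "-" + dpEnum.endOffset() + ")");
          }
        }
      }
    }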
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, dpEnum.startOffset()); + assertEquals(4, dpEnum.endOffset()); dpEnum.nextPosition(); - assertEquals(4, offsetAtt.startOffset()); - assertEquals(8, offsetAtt.endOffset()); + assertEquals(4, dpEnum.startOffset()); + assertEquals(8, dpEnum.endOffset()); dpEnum.nextPosition(); - assertEquals(8, offsetAtt.startOffset()); - assertEquals(12, offsetAtt.endOffset()); + assertEquals(8, dpEnum.startOffset()); + assertEquals(12, dpEnum.endOffset()); assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc()); assertNull(termsEnum.next()); @@ -122,19 +117,17 @@ IndexReader r = IndexReader.open(dir); TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertEquals(2, termsEnum.totalTermFreq()); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, dpEnum.startOffset()); + assertEquals(4, dpEnum.endOffset()); dpEnum.nextPosition(); - assertEquals(5, offsetAtt.startOffset()); - assertEquals(9, offsetAtt.endOffset()); + assertEquals(5, dpEnum.startOffset()); + assertEquals(9, dpEnum.endOffset()); assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc()); r.close(); @@ -159,19 +152,17 @@ IndexReader r = IndexReader.open(dir); TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertEquals(2, termsEnum.totalTermFreq()); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, dpEnum.startOffset()); + assertEquals(4, dpEnum.endOffset()); dpEnum.nextPosition(); - assertEquals(8, offsetAtt.startOffset()); - assertEquals(12, offsetAtt.endOffset()); + assertEquals(8, dpEnum.startOffset()); + assertEquals(12, dpEnum.endOffset()); assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc()); r.close(); @@ -200,19 +191,17 @@ IndexReader r = IndexReader.open(dir); TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertEquals(2, termsEnum.totalTermFreq()); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, dpEnum.startOffset()); + assertEquals(4, dpEnum.endOffset()); dpEnum.nextPosition(); - assertEquals(8, offsetAtt.startOffset()); - assertEquals(12, offsetAtt.endOffset()); + assertEquals(8, dpEnum.startOffset()); + assertEquals(12, 
dpEnum.endOffset()); assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc()); r.close(); @@ -238,19 +227,17 @@ IndexReader r = IndexReader.open(dir); TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertEquals(2, termsEnum.totalTermFreq()); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, dpEnum.startOffset()); + assertEquals(4, dpEnum.endOffset()); dpEnum.nextPosition(); - assertEquals(9, offsetAtt.startOffset()); - assertEquals(13, offsetAtt.endOffset()); + assertEquals(9, dpEnum.startOffset()); + assertEquals(13, dpEnum.endOffset()); assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc()); r.close(); @@ -277,32 +264,26 @@ IndexReader r = IndexReader.open(dir); TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, dpEnum.startOffset()); + assertEquals(4, dpEnum.endOffset()); assertNotNull(termsEnum.next()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); - offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, true); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(11, offsetAtt.startOffset()); - assertEquals(17, offsetAtt.endOffset()); + assertEquals(11, dpEnum.startOffset()); + assertEquals(17, dpEnum.endOffset()); assertNotNull(termsEnum.next()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); - offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, true); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(18, offsetAtt.startOffset()); - assertEquals(21, offsetAtt.endOffset()); + assertEquals(18, dpEnum.startOffset()); + assertEquals(21, dpEnum.endOffset()); r.close(); dir.close(); @@ -328,24 +309,20 @@ IndexReader r = IndexReader.open(dir); TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertEquals(1, (int) termsEnum.totalTermFreq()); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(1, offsetAtt.startOffset()); - assertEquals(7, offsetAtt.endOffset()); + assertEquals(1, dpEnum.startOffset()); + assertEquals(7, dpEnum.endOffset()); assertNotNull(termsEnum.next()); - dpEnum = termsEnum.docsAndPositions(null, 
dpEnum); - offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, true); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(8, offsetAtt.startOffset()); - assertEquals(11, offsetAtt.endOffset()); + assertEquals(8, dpEnum.startOffset()); + assertEquals(11, dpEnum.endOffset()); r.close(); dir.close(); @@ -375,24 +352,20 @@ IndexReader r = IndexReader.open(dir); TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null); assertNotNull(termsEnum.next()); - DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null); - OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true); assertEquals(1, (int) termsEnum.totalTermFreq()); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(4, offsetAtt.endOffset()); + assertEquals(0, dpEnum.startOffset()); + assertEquals(4, dpEnum.endOffset()); assertNotNull(termsEnum.next()); - dpEnum = termsEnum.docsAndPositions(null, dpEnum); - offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class); - assertNotNull(offsetAtt); + dpEnum = termsEnum.docsAndPositions(null, dpEnum, true); assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS); dpEnum.nextPosition(); - assertEquals(6, offsetAtt.startOffset()); - assertEquals(12, offsetAtt.endOffset()); + assertEquals(6, dpEnum.startOffset()); + assertEquals(12, dpEnum.endOffset()); r.close(); Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 1231394) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -166,7 +166,7 @@ totTF += positions[i].length; for(int j=0;j<positions[i].length;j++) { final PositionData pos = positions[i][j]; - postingsConsumer.addPosition(pos.pos, pos.payload); + postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1); Index: lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (working copy) + if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) { + throw new IllegalArgumentException("this codec cannot index offsets"); + } + storePayloads = fieldInfo.storePayloads; //System.out.println(" set init blockFreqStart=" + freqStart); //System.out.println(" set init blockProxStart=" + proxStart); @@ -197,11 +201,19 @@ /** Add a new position & payload */ @Override - public void addPosition(int position, BytesRef payload) throws IOException { + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { //if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer()); assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions; assert proxOut != null; + // TODO: we can be smarter about endOffset... often + // endOffset-startOffset will be constant or near + // constant for all docs (eg if the term wasn't stemmed + // then this will usually be the utf16 length of the + // term); would be nice to write that length once up + // front and then not encode endOffset for each + // position..
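A sketch of the encoding the TODO above hints at, not what this patch actually writes: delta-encode startOffset against the previous position and store endOffset as a length, so near-constant token lengths shrink to small vInts (class and method names hypothetical):

    import java.io.IOException;

    import org.apache.lucene.store.IndexOutput;

    class OffsetEncodingSketch {
      private int lastStartOffset;

      // Write startOffset as a delta from the previous startOffset and
      // endOffset as a length relative to startOffset.
      void writeOffset(IndexOutput out, int startOffset, int endOffset) throws IOException {
        out.writeVInt(startOffset - lastStartOffset); // offsets never decrease within a doc
        out.writeVInt(endOffset - startOffset);       // usually the token's length
        lastStartOffset = startOffset;
      }

      void startDoc() {
        lastStartOffset = 0; // offsets restart at each document
      }
    }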
+ final int delta = position - lastPosition; assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) Index: lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java (working copy) @@ -80,6 +80,8 @@ } else { throw new CorruptIndexException("Corrupt fieldinfos, OMIT_POSITIONS set but format=" + format + " (resource: " + input + ")"); } + } else if (format <= Lucene40FieldInfosWriter.FORMAT_OFFSETS_IN_POSTINGS && (bits & Lucene40FieldInfosWriter.STORE_OFFSETS_IN_POSTINGS) != 0) { + indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; } else { indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; } Index: lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java (working copy) @@ -41,12 +41,15 @@ static final int FORMAT_OMIT_POSITIONS = -3; // per-field codec support, records index values for fields static final int FORMAT_FLEX = -4; + // add offsets to postings + static final int FORMAT_OFFSETS_IN_POSTINGS = -5; // whenever you add a new format, make it 1 smaller (negative version logic)! - static final int FORMAT_CURRENT = FORMAT_FLEX; + static final int FORMAT_CURRENT = FORMAT_OFFSETS_IN_POSTINGS; static final byte IS_INDEXED = 0x1; static final byte STORE_TERMVECTOR = 0x2; + static final byte STORE_OFFSETS_IN_POSTINGS = 0x4; static final byte OMIT_NORMS = 0x10; static final byte STORE_PAYLOADS = 0x20; static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40; @@ -68,6 +71,8 @@ if (fi.storePayloads) bits |= STORE_PAYLOADS; if (fi.indexOptions == IndexOptions.DOCS_ONLY) { bits |= OMIT_TERM_FREQ_AND_POSITIONS; + } else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) { + bits |= STORE_OFFSETS_IN_POSTINGS; } else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS) { bits |= OMIT_POSITIONS; } Index: lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java (working copy) @@ -241,11 +241,15 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, + DocsAndPositionsEnum reuse, boolean needsOffsets) + throws IOException { + + if (needsOffsets) { + // TODO: once we index offsets into postings fix this! 
return null; } - + // TODO: refactor if (fieldInfo.storePayloads) { SegmentDocsAndPositionsAndPayloadsEnum docsEnum; @@ -366,7 +370,7 @@ start = count; // buffer is consumed - return doc = skipTo(target, liveDocs); + return doc = skipTo(target); } private final int binarySearch(int hi, int low, int target, int[] docs) { @@ -448,7 +452,7 @@ } - private final int skipTo(int target, Bits liveDocs) throws IOException { + private final int skipTo(int target) throws IOException { if ((target - skipInterval) >= accum && limit >= skipMinimum) { // There are enough docs in the posting to have @@ -841,6 +845,16 @@ return position; } + @Override + public int startOffset() throws IOException { + return -1; + } + + @Override + public int endOffset() throws IOException { + return -1; + } + /** Returns the payload at this position, or null if no * payload was indexed. */ @Override @@ -1074,6 +1088,16 @@ return position; } + @Override + public int startOffset() throws IOException { + return -1; + } + + @Override + public int endOffset() throws IOException { + return -1; + } + /** Returns the payload at this position, or null if no * payload was indexed. */ @Override Index: lucene/src/java/org/apache/lucene/codecs/PostingsReaderBase.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/PostingsReaderBase.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/PostingsReaderBase.java (working copy) @@ -55,7 +55,8 @@ /** Must fully consume state, since after this call that * TermState may be reused. */ - public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; + public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse, + boolean needsOffsets) throws IOException; public abstract void close() throws IOException; Index: lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (working copy) @@ -131,7 +131,7 @@ } @Override - public void addPosition(int pos, BytesRef payload) throws IOException { + public void addPosition(int pos, BytesRef payload, int startOffset, int endOffset) throws IOException { assert payload == null || field.storePayloads; if (VERBOSE) System.out.println(" addPos pos=" + pos + " payload=" + payload); @@ -249,6 +249,9 @@ return new FieldsConsumer() { @Override public TermsConsumer addField(FieldInfo field) { + if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) { + throw new IllegalArgumentException("this codec cannot index offsets"); + } if (VERBOSE) System.out.println("\naddField field=" + field.name); return new TermsWriter(out, field); } @@ -501,6 +504,16 @@ } @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override public BytesRef getPayload() { payloadRetrieved = true; return payload; @@ -618,10 +631,16 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, 
DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + + if (needsOffsets) { + // Not until we can index offsets... return null; } + + if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + return null; + } decodeMetaData(); FSTDocsAndPositionsEnum docsAndPositionsEnum; if (reuse == null || !(reuse instanceof FSTDocsAndPositionsEnum)) { Index: lucene/src/java/org/apache/lucene/codecs/TermsConsumer.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/TermsConsumer.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/TermsConsumer.java (working copy) @@ -119,8 +119,41 @@ } } } + } else if (mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + if (postingsEnum == null) { + postingsEnum = new MappingMultiDocsAndPositionsEnum(); + } + postingsEnum.setMergeState(mergeState); + MultiDocsAndPositionsEnum postingsEnumIn = null; + while((term = termsEnum.next()) != null) { + // We can pass null for liveDocs, because the + // mapping enum will skip the non-live docs: + postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, false); + assert postingsEnumIn != null; + postingsEnum.reset(postingsEnumIn); + // set PayloadProcessor + if (mergeState.payloadProcessorProvider != null) { + for (int i = 0; i < mergeState.readers.size(); i++) { + if (mergeState.dirPayloadProcessor[i] != null) { + mergeState.currentPayloadProcessor[i] = mergeState.dirPayloadProcessor[i].getProcessor(mergeState.fieldInfo.name, term); + } + } + } + final PostingsConsumer postingsConsumer = startTerm(term); + final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum, visitedDocs); + if (stats.docFreq > 0) { + finishTerm(term, stats); + sumTotalTermFreq += stats.totalTermFreq; + sumDFsinceLastAbortCheck += stats.docFreq; + sumDocFreq += stats.docFreq; + if (sumDFsinceLastAbortCheck > 60000) { + mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); + sumDFsinceLastAbortCheck = 0; + } + } + } } else { - assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; + assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; if (postingsEnum == null) { postingsEnum = new MappingMultiDocsAndPositionsEnum(); } @@ -129,7 +162,7 @@ while((term = termsEnum.next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: - postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn); + postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, true); assert postingsEnumIn != null; postingsEnum.reset(postingsEnumIn); // set PayloadProcessor @@ -154,7 +187,6 @@ } } } - finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); } } Index: lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xFields.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xFields.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xFields.java (working copy) @@ -966,7 +966,12 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + if (needsOffsets) { + // 
Pre-4.0 indices never have offsets: + return null; + } + PreDocsAndPositionsEnum docsPosEnum; if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { return null; @@ -1082,6 +1087,16 @@ } @Override + public int startOffset() throws IOException { + return -1; + } + + @Override + public int endOffset() throws IOException { + return -1; + } + + @Override public boolean hasPayload() { assert docID != NO_MORE_DOCS; return pos.isPayloadAvailable(); Index: lucene/src/java/org/apache/lucene/codecs/sep/SepPostingsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/sep/SepPostingsReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/sep/SepPostingsReader.java (working copy) @@ -294,7 +294,18 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs, + DocsAndPositionsEnum reuse, boolean needsOffsets) + throws IOException { + + if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + return null; + } + + if (needsOffsets) { + return null; + } + assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; final SepTermState termState = (SepTermState) _termState; SepDocsAndPositionsEnum postingsEnum; @@ -713,6 +724,16 @@ return position; } + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + private BytesRef payload; @Override Index: lucene/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java (working copy) @@ -188,6 +188,9 @@ public void setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; this.indexOptions = fieldInfo.indexOptions; + if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) { + throw new IllegalArgumentException("this codec cannot index offsets"); + } skipListWriter.setIndexOptions(indexOptions); storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.storePayloads; } @@ -222,7 +225,7 @@ /** Add a new position & payload */ @Override - public void addPosition(int position, BytesRef payload) throws IOException { + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; final int delta = position - lastPosition; Index: lucene/src/java/org/apache/lucene/codecs/BlockTermsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/BlockTermsReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/BlockTermsReader.java (working copy) @@ -697,16 +697,20 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - //System.out.println("BTR.d&p this=" + this); - decodeMetaData(); - if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { 
+ if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + // Positions were not indexed: return null; - } else { - DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse); - //System.out.println(" return d&pe=" + dpe); - return dpe; } + + if (needsOffsets && + fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) { + // Offsets were not indexed: + return null; + } + + decodeMetaData(); + return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, needsOffsets); } @Override Index: lucene/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java (working copy) @@ -115,6 +115,9 @@ @Override public void setField(FieldInfo fieldInfo) { this.indexOptions = fieldInfo.indexOptions; + if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) { + throw new IllegalArgumentException("this codec cannot index offsets: " + indexOptions); + } if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions); storePayloads = fieldInfo.storePayloads; wrappedPostingsWriter.setField(fieldInfo); @@ -165,7 +168,7 @@ } @Override - public void addPosition(int position, BytesRef payload) throws IOException { + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { if (DEBUG) System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes")); if (pendingCount == pending.length) { @@ -175,7 +178,7 @@ if (pendingCount == -1) { // We've already seen too many docs for this term -- // just forward to our fallback writer - wrappedPostingsWriter.addPosition(position, payload); + wrappedPostingsWriter.addPosition(position, payload, -1, -1); } else { // buffer up final Position pos = pending[pendingCount++]; @@ -360,7 +363,7 @@ wrappedPostingsWriter.startTerm(); // Flush all buffered docs - if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { Position doc = null; for(Position pos : pending) { if (doc == null) { @@ -376,7 +379,7 @@ wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq); } if (DEBUG) System.out.println("PW: wrapped.addPos pos=" + pos.pos); - wrappedPostingsWriter.addPosition(pos.pos, pos.payload); + wrappedPostingsWriter.addPosition(pos.pos, pos.payload, -1, -1); } //wrappedPostingsWriter.finishDoc(); } else { Index: lucene/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java (working copy) @@ -215,10 +215,8 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - return null; - } + public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse, + boolean needsOffsets) throws 
IOException { //System.out.println("D&P: field=" + field.name); final PulsingTermState termState = (PulsingTermState) _termState; @@ -245,11 +243,12 @@ return postings.reset(liveDocs, termState); } else { if (reuse instanceof PulsingDocsAndPositionsEnum) { - DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse)); + DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse), + needsOffsets); setOther(wrapped, reuse); // wrapped.other = reuse return wrapped; } else { - return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse); + return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse, needsOffsets); } } } @@ -486,6 +485,16 @@ return position; } + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + private void skipPositions() throws IOException { while(posPending != 0) { nextPosition(); Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (working copy) @@ -50,13 +50,15 @@ private final IndexInput in; private final FieldInfos fieldInfos; - final static BytesRef END = SimpleTextFieldsWriter.END; - final static BytesRef FIELD = SimpleTextFieldsWriter.FIELD; - final static BytesRef TERM = SimpleTextFieldsWriter.TERM; - final static BytesRef DOC = SimpleTextFieldsWriter.DOC; - final static BytesRef FREQ = SimpleTextFieldsWriter.FREQ; - final static BytesRef POS = SimpleTextFieldsWriter.POS; - final static BytesRef PAYLOAD = SimpleTextFieldsWriter.PAYLOAD; + final static BytesRef END = SimpleTextFieldsWriter.END; + final static BytesRef FIELD = SimpleTextFieldsWriter.FIELD; + final static BytesRef TERM = SimpleTextFieldsWriter.TERM; + final static BytesRef DOC = SimpleTextFieldsWriter.DOC; + final static BytesRef FREQ = SimpleTextFieldsWriter.FREQ; + final static BytesRef POS = SimpleTextFieldsWriter.POS; + final static BytesRef START_OFFSET = SimpleTextFieldsWriter.START_OFFSET; + final static BytesRef END_OFFSET = SimpleTextFieldsWriter.END_OFFSET; + final static BytesRef PAYLOAD = SimpleTextFieldsWriter.PAYLOAD; public SimpleTextFieldsReader(SegmentReadState state) throws IOException { in = state.dir.openInput(SimpleTextPostingsFormat.getPostingsFileName(state.segmentInfo.name, state.segmentSuffix), state.context); @@ -204,18 +206,26 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + + if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + // Positions were not indexed return null; } + if (needsOffsets && + indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) { + // Offsets were not indexed + return null; + } + SimpleTextDocsAndPositionsEnum docsAndPositionsEnum; if (reuse != null && reuse instanceof SimpleTextDocsAndPositionsEnum && 
((SimpleTextDocsAndPositionsEnum) reuse).canReuse(SimpleTextFieldsReader.this.in)) { docsAndPositionsEnum = (SimpleTextDocsAndPositionsEnum) reuse; } else { docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum(); } - return docsAndPositionsEnum.reset(docsStart, liveDocs); + return docsAndPositionsEnum.reset(docsStart, liveDocs, indexOptions); } @Override @@ -289,6 +299,10 @@ termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, POS)) { // skip termFreq++; + } else if (StringHelper.startsWith(scratch, START_OFFSET)) { + // skip + } else if (StringHelper.startsWith(scratch, END_OFFSET)) { + // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { @@ -325,6 +339,10 @@ private final CharsRef scratchUTF16_2 = new CharsRef(10); private BytesRef payload; private long nextDocStart; + private boolean readOffsets; + private boolean readPositions; + private int startOffset; + private int endOffset; public SimpleTextDocsAndPositionsEnum() { this.inStart = SimpleTextFieldsReader.this.in; @@ -335,10 +353,13 @@ return in == inStart; } - public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs) { + public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs, IndexOptions indexOptions) { this.liveDocs = liveDocs; nextDocStart = fp; docID = -1; + readPositions = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || + indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; + readOffsets = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; return this; } @@ -360,6 +381,7 @@ while(true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); + //System.out.println("NEXT DOC: " + scratch.utf8ToString()); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { nextDocStart = lineStart; @@ -376,6 +398,10 @@ posStart = in.getFilePointer(); } else if (StringHelper.startsWith(scratch, POS)) { // skip + } else if (StringHelper.startsWith(scratch, START_OFFSET)) { + // skip + } else if (StringHelper.startsWith(scratch, END_OFFSET)) { + // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { @@ -399,10 +425,27 @@ @Override public int nextPosition() throws IOException { - SimpleTextUtil.readLine(in, scratch); - assert StringHelper.startsWith(scratch, POS): "got line=" + scratch.utf8ToString(); - UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2); - final int pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); + final int pos; + if (readPositions) { + SimpleTextUtil.readLine(in, scratch); + assert StringHelper.startsWith(scratch, POS): "got line=" + scratch.utf8ToString(); + UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2); + pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); + } else { + pos = -1; + } + + if (readOffsets) { + SimpleTextUtil.readLine(in, scratch); + assert StringHelper.startsWith(scratch, START_OFFSET): "got line=" + scratch.utf8ToString(); + UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+START_OFFSET.length, scratch.length-START_OFFSET.length, scratchUTF16_2); + startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); + SimpleTextUtil.readLine(in, scratch); + assert StringHelper.startsWith(scratch, END_OFFSET): "got line=" + scratch.utf8ToString(); + 
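For orientation, a hedged sketch of reading one document back through this enum (offsets become valid only after nextPosition() has parsed the corresponding lines):

    // dpe obtained via termsEnum.docsAndPositions(liveDocs, null, true)
    int doc = dpe.nextDoc();
    int freq = dpe.freq();
    for (int i = 0; i < freq; i++) {
      int pos = dpe.nextPosition();   // consumes the " pos " line
      int start = dpe.startOffset();  // filled from the " startOffset " line
      int end = dpe.endOffset();      // filled from the " endOffset " line
    }
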
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+END_OFFSET.length, scratch.length-END_OFFSET.length, scratchUTF16_2); + endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); + } + final long fp = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); if (StringHelper.startsWith(scratch, PAYLOAD)) { @@ -421,6 +464,16 @@ } @Override + public int startOffset() throws IOException { + return startOffset; + } + + @Override + public int endOffset() throws IOException { + return endOffset; + } + + @Override public BytesRef getPayload() { // Some tests rely on only being able to retrieve the // payload once Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java (working copy) @@ -35,13 +35,15 @@ private final IndexOutput out; private final BytesRef scratch = new BytesRef(10); - final static BytesRef END = new BytesRef("END"); - final static BytesRef FIELD = new BytesRef("field "); - final static BytesRef TERM = new BytesRef(" term "); - final static BytesRef DOC = new BytesRef(" doc "); - final static BytesRef FREQ = new BytesRef(" freq "); - final static BytesRef POS = new BytesRef(" pos "); - final static BytesRef PAYLOAD = new BytesRef(" payload "); + final static BytesRef END = new BytesRef("END"); + final static BytesRef FIELD = new BytesRef("field "); + final static BytesRef TERM = new BytesRef(" term "); + final static BytesRef DOC = new BytesRef(" doc "); + final static BytesRef FREQ = new BytesRef(" freq "); + final static BytesRef POS = new BytesRef(" pos "); + final static BytesRef START_OFFSET = new BytesRef(" startOffset "); + final static BytesRef END_OFFSET = new BytesRef(" endOffset "); + final static BytesRef PAYLOAD = new BytesRef(" payload "); public SimpleTextFieldsWriter(SegmentWriteState state) throws IOException { final String fileName = SimpleTextPostingsFormat.getPostingsFileName(state.segmentName, state.segmentSuffix); @@ -97,10 +99,19 @@ private class SimpleTextPostingsWriter extends PostingsConsumer { private BytesRef term; private boolean wroteTerm; - private IndexOptions indexOptions; + private final IndexOptions indexOptions; + private final boolean writePositions; + private final boolean writeOffsets; + // for assert: + private int lastEndOffset = -1; + public SimpleTextPostingsWriter(FieldInfo field) { this.indexOptions = field.indexOptions; + writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + //System.out.println("writeOffsets=" + writeOffsets); + //System.out.println("writePos=" + writePositions); } @Override @@ -121,10 +132,10 @@ write(Integer.toString(termDocFreq)); newline(); } + + lastEndOffset = -1; } - - public PostingsConsumer reset(BytesRef term) { this.term = term; wroteTerm = false; @@ -132,10 +143,25 @@ } @Override - public void addPosition(int position, BytesRef payload) throws IOException { - write(POS); - write(Integer.toString(position)); - newline(); + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + if (writePositions) { + write(POS); + write(Integer.toString(position)); + newline(); + } + + if (writeOffsets) { + assert endOffset 
>= startOffset; + assert startOffset >= lastEndOffset: "startOffset=" + startOffset + " lastEndOffset=" + lastEndOffset; + lastEndOffset = endOffset; + write(START_OFFSET); + write(Integer.toString(startOffset)); + newline(); + write(END_OFFSET); + write(Integer.toString(endOffset)); + newline(); + } + if (payload != null && payload.length > 0) { assert payload.length != 0; write(PAYLOAD); Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsFormat.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsFormat.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsFormat.java (working copy) @@ -38,7 +38,7 @@ @Override public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException { - return new SimpleTextTermVectorsReader(directory, segmentInfo, fieldInfos, context); + return new SimpleTextTermVectorsReader(directory, segmentInfo, context); } @Override Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java (working copy) @@ -26,11 +26,9 @@ import java.util.SortedMap; import java.util.TreeMap; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexFileNames; @@ -63,7 +61,7 @@ private BytesRef scratch = new BytesRef(); private CharsRef scratchUTF16 = new CharsRef(); - public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, FieldInfos fieldInfos, IOContext context) throws IOException { + public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, IOContext context) throws IOException { boolean success = false; try { in = directory.openInput(IndexFileNames.segmentFileName(si.name, "", VECTORS_EXTENSION), context); @@ -114,7 +112,8 @@ for (int i = 0; i < numFields; i++) { readLine(); assert StringHelper.startsWith(scratch, FIELD); - int fieldNumber = parseIntAt(FIELD.length); + // skip fieldNumber: + parseIntAt(FIELD.length); readLine(); assert StringHelper.startsWith(scratch, FIELDNAME); @@ -373,13 +372,16 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { SimpleTVPostings postings = current.getValue(); if (postings.positions == null && postings.startOffsets == null) { return null; } + if (needsOffsets && (postings.startOffsets == null || postings.endOffsets == null)) { + return null; + } // TODO: reuse - SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum(postings.startOffsets != null); + SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum(); e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets); return e; } @@ -436,7 +438,6 @@ } private static class 
SimpleTVDocsAndPositionsEnum extends DocsAndPositionsEnum { - private final OffsetAttribute offsetAtt; private boolean didNext; private int doc = -1; private int nextPos; @@ -445,18 +446,6 @@ private int[] startOffsets; private int[] endOffsets; - public SimpleTVDocsAndPositionsEnum(boolean storeOffsets) { - if (storeOffsets) { - offsetAtt = attributes().addAttribute(OffsetAttribute.class); - } else { - offsetAtt = null; - } - } - - public boolean canReuse(boolean storeOffsets) { - return storeOffsets == (offsetAtt != null); - } - @Override public int freq() { if (positions != null) { @@ -495,7 +484,6 @@ this.liveDocs = liveDocs; this.positions = positions; this.startOffsets = startOffsets; - assert (offsetAtt != null) == (startOffsets != null); this.endOffsets = endOffsets; this.doc = -1; didNext = false; @@ -516,11 +504,6 @@ public int nextPosition() { assert (positions != null && nextPos < positions.length) || startOffsets != null && nextPos < startOffsets.length; - - if (startOffsets != null) { - offsetAtt.setOffset(startOffsets[nextPos], - endOffsets[nextPos]); - } if (positions != null) { return positions[nextPos++]; } else { @@ -528,5 +511,15 @@ return -1; } } + + @Override + public int startOffset() { + return startOffsets[nextPos-1]; + } + + @Override + public int endOffset() { + return endOffsets[nextPos-1]; + } } } Index: lucene/src/java/org/apache/lucene/codecs/PostingsConsumer.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/PostingsConsumer.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/PostingsConsumer.java (working copy) @@ -44,12 +44,12 @@ int docBase; } - /** Add a new position & payload. A null payload means no - * payload; a non-null payload with zero length also - * means no payload. Caller may reuse the {@link - * BytesRef} for the payload between calls (method must - * fully consume the payload). */ - public abstract void addPosition(int position, BytesRef payload) throws IOException; + /** Add a new position & payload, and start/end offset. A + * null payload means no payload; a non-null payload with + * zero length also means no payload. Caller may reuse + * the {@link BytesRef} for the payload between calls + * (method must fully consume the payload). */ + public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException; /** Called when we are done adding positions & payloads * for each doc. 
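A hedged sketch of the calling convention the widened addPosition signature implies, mirroring the call sites elsewhere in this patch:

    // Field indexed with offsets:
    postingsConsumer.addPosition(position, payload, startOffset, endOffset);
    // Field indexed without offsets: -1/-1 sentinels are passed instead:
    postingsConsumer.addPosition(position, payload, -1, -1);
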
Not called when the field omits term @@ -88,7 +88,32 @@ df++; totTF += freq; } + } else if (mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings; + while(true) { + final int doc = postingsEnum.nextDoc(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + visitedDocs.set(doc); + final int freq = postingsEnum.freq(); + this.startDoc(doc, freq); + totTF += freq; + for(int i=0;i<freq;i++) { + final int position = postingsEnum.nextPosition(); + final BytesRef payload; + if (postingsEnum.hasPayload()) { + payload = postingsEnum.getPayload(); + } else { + payload = null; + } + this.addPosition(position, payload, -1, -1); + } + this.finishDoc(); + df++; + } + } Index: lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java (working copy) @@ ... @@ - assert numTerms > 0; - docsAndPositionsEnum = termsEnum.docsAndPositions(null, null); - if (docsAndPositionsEnum != null) { - // has positions - positions = true; - if (docsAndPositionsEnum.attributes().hasAttribute(OffsetAttribute.class)) { - offsetAtt = docsAndPositionsEnum.attributes().getAttribute(OffsetAttribute.class); - } else { - offsetAtt = null; - } + final int freq = (int) termsEnum.totalTermFreq(); + + if (startedField) { + startTerm(termsEnum.term(), freq); + } + + // TODO: we need a "query" API where we can ask (via + // flex API) what this term was indexed with... + // Both positions & offsets: + docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true); + final boolean hasOffsets; + boolean hasPositions = false; + if (docsAndPositionsEnum == null) { + // Fallback: no offsets + docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false); + hasOffsets = false; + } else { - positions = false; - offsetAtt = null; + hasOffsets = true; } - } else { - // no terms in this field (hmm why is field present - // then...?) - assert numTerms == 0; - positions = false; - offsetAtt = null; - } - - startField(fieldInfo, numTerms, positions, offsetAtt != null); - int termCount = 1; - - // NOTE: we already .next()'d the TermsEnum above, to - // peek @ first term to see if positions/offsets are - // present - while(true) { - final int freq = (int) termsEnum.totalTermFreq(); - startTerm(termsEnum.term(), freq); - - if (positions || offsetAtt != null) { - DocsAndPositionsEnum dp = termsEnum.docsAndPositions(null, docsAndPositionsEnum); - // TODO: add startOffset()/endOffset() to d&pEnum... this is insanity - if (dp != docsAndPositionsEnum) { - // producer didnt reuse, must re-pull attributes - if (offsetAtt != null) { - assert dp.attributes().hasAttribute(OffsetAttribute.class); - offsetAtt = dp.attributes().getAttribute(OffsetAttribute.class); - } - } - docsAndPositionsEnum = dp; + if (docsAndPositionsEnum != null) { final int docID = docsAndPositionsEnum.nextDoc(); assert docID != DocsEnum.NO_MORE_DOCS; assert docsAndPositionsEnum.freq() == freq; for(int posUpto=0; posUpto<freq; posUpto++) { + final int pos = docsAndPositionsEnum.nextPosition(); + if (!startedField) { + assert numTerms > 0; + hasPositions = pos != -1; + startField(fieldInfo, numTerms, hasPositions, hasOffsets); + startTerm(termsEnum.term(), freq); + startedField = true; + } + final int startOffset; + final int endOffset; + if (hasOffsets) { + startOffset = docsAndPositionsEnum.startOffset(); + endOffset = docsAndPositionsEnum.endOffset(); + assert startOffset != -1; + assert endOffset != -1; + } else { + startOffset = -1; + endOffset = -1; + } + assert !hasPositions || pos >= 0; addPosition(pos, startOffset, endOffset); } + } else { + if (!startedField) { + assert numTerms > 0; + startField(fieldInfo, numTerms, hasPositions, hasOffsets); + startTerm(termsEnum.term(), freq); + startedField = true; + } } - - if (termsEnum.next() == null) { - assert termCount == numTerms; - break; - } - termCount++; } + assert termCount == numTerms; } } } Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy) @@ -272,8 +272,8 @@ @Override public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, - DocsAndPositionsEnum reuse) throws IOException { - return actualEnum.docsAndPositions(liveDocs, reuse); + DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + return actualEnum.docsAndPositions(liveDocs, reuse, needsOffsets); } @Override Index: lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java (working copy) @@ -120,7 +120,7 @@ final TermsEnum termsEnum = context.reader.terms(term.field()).iterator(null); termsEnum.seekExact(term.bytes(), state); - final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null); + final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null, false); if (postings != null) { return new TermSpans(postings, term); Index: lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (working copy) @@ -225,7 +225,7 @@ return null; } termsEnum.seekExact(term.bytes(), termState); - postingsEnum = termsEnum.docsAndPositions(liveDocs, null); + postingsEnum = termsEnum.docsAndPositions(liveDocs, null, false); if (postingsEnum == null) { // term does exist, but has no positions @@ -475,7 +475,7 @@ continue; } termsEnum.seekExact(term.bytes(), termState); - DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null); + DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, false); if (postings == null) { // term does exist, but has no
positions throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")"); @@ -528,6 +528,16 @@ } @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override public BytesRef getPayload() { throw new UnsupportedOperationException(); } Index: lucene/src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PhraseQuery.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -239,7 +239,7 @@ return null; } te.seekExact(t.bytes(), state); - DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null); + DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null, false); // PhraseQuery on a field that did not index // positions. Index: lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy) @@ -1043,7 +1043,7 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { throw new UnsupportedOperationException(); } Index: lucene/src/java/org/apache/lucene/index/DocInverterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocInverterPerField.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/DocInverterPerField.java (working copy) @@ -73,8 +73,9 @@ // tokenized. if (field.fieldType().indexed() && doInvert) { - if (i > 0) + if (i > 0) { fieldState.position += docState.analyzer == null ? 
0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name); + } final TokenStream stream = field.tokenStream(docState.analyzer); // reset the TokenStream to the first token Index: lucene/src/java/org/apache/lucene/index/DocTermOrds.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocTermOrds.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/DocTermOrds.java (working copy) @@ -655,8 +655,8 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - return termsEnum.docsAndPositions(liveDocs, reuse); + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + return termsEnum.docsAndPositions(liveDocs, reuse, needsOffsets); } @Override Index: lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java (working copy) @@ -38,7 +38,7 @@ boolean doVectorOffsets; int maxNumPostings; - OffsetAttribute offsetAttribute = null; + OffsetAttribute offsetAttribute; public TermVectorsConsumerPerField(TermsHashPerField termsHashPerField, TermVectorsConsumer termsWriter, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- lucene/src/java/org/apache/lucene/index/CheckIndex.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -760,7 +760,7 @@ docs = termsEnum.docs(liveDocs, docs, false); docsAndFreqs = termsEnum.docs(liveDocs, docsAndFreqs, true); - postings = termsEnum.docsAndPositions(liveDocs, postings); + postings = termsEnum.docsAndPositions(liveDocs, postings, false); if (hasOrd) { long ord = -1; @@ -890,7 +890,7 @@ if (hasPositions) { for(int idx=0;idx<7;idx++) { final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8); - postings = termsEnum.docsAndPositions(liveDocs, postings); + postings = termsEnum.docsAndPositions(liveDocs, postings, false); final int docID = postings.advance(skipDocID); if (docID == DocsEnum.NO_MORE_DOCS) { break; @@ -1256,6 +1256,10 @@ private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) { final Status.TermVectorStatus status = new Status.TermVectorStatus(); + // TODO: in theory we could test that term vectors have + // same terms/pos/offsets as the postings, but it'd be + // very slow... + TermsEnum termsEnum = null; try { if (infoStream != null) { @@ -1264,7 +1268,6 @@ // TODO: maybe we can factor out testTermIndex and reuse here? DocsEnum docs = null; - DocsEnum docsAndFreqs = null; DocsAndPositionsEnum postings = null; final Bits liveDocs = reader.getLiveDocs(); for (int j = 0; j < info.docCount; ++j) { @@ -1308,26 +1311,58 @@ throw new RuntimeException("totalTermFreq: " + totalTermFreq + " is out of bounds"); } - postings = termsEnum.docsAndPositions(null, postings); + final boolean hasPositions; + final boolean hasOffsets; + final boolean hasFreqs; + + // TODO: really we need a reflection/query + // API so we can just ask what was indexed + // instead of "probing"... 
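The probing that follows can be read as a strict narrowing, sketched here as a hypothetical helper (the real code keeps the flags inline):

    static DocsAndPositionsEnum probe(TermsEnum te) throws IOException {
      // Positions + offsets first:
      DocsAndPositionsEnum dpe = te.docsAndPositions(null, null, true);
      if (dpe == null) {
        // Positions only:
        dpe = te.docsAndPositions(null, null, false);
      }
      return dpe;  // null => fall back to te.docs(null, null, true/false)
    }
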
+ + // Try offsets: + postings = termsEnum.docsAndPositions(null, postings, true); if (postings == null) { - docsAndFreqs = termsEnum.docs(null, docsAndFreqs, true); - if (docsAndFreqs == null) { - docs = termsEnum.docs(null, docs, false); + hasOffsets = false; + // Try only positions: + postings = termsEnum.docsAndPositions(null, postings, false); + if (postings == null) { + hasPositions = false; + // Try docIDs & freqs: + docs = termsEnum.docs(null, docs, true); + if (docs == null) { + // OK, only docIDs: + hasFreqs = false; + docs = termsEnum.docs(null, docs, false); + } else { + hasFreqs = true; + } } else { - docs = docsAndFreqs; + hasPositions = true; + hasFreqs = true; } } else { - docs = docsAndFreqs = postings; + hasOffsets = true; + hasPositions = true; + hasFreqs = true; } - final int doc = docs.nextDoc(); + final DocsEnum docs2; + if (hasPositions || hasOffsets) { + assert postings != null; + docs2 = postings; + } else { + assert docs != null; + docs2 = docs; + } + + final int doc = docs2.nextDoc(); if (doc != 0) { throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc); } - if (docsAndFreqs != null) { - final int tf = docsAndFreqs.freq(); + if (hasFreqs) { + final int tf = docs2.freq(); if (tf <= 0) { throw new RuntimeException("vector freq " + tf + " is out of bounds"); } @@ -1336,24 +1371,40 @@ } tfvComputedSumTotalTermFreq += tf; - if (postings != null) { + if (hasPositions || hasOffsets) { int lastPosition = -1; + //int lastStartOffset = -1; for (int i = 0; i < tf; i++) { int pos = postings.nextPosition(); - if (pos != -1 && pos < 0) { - throw new RuntimeException("vector position " + pos + " is out of bounds"); - } + if (hasPositions) { + if (pos != -1 && pos < 0) { + throw new RuntimeException("vector position " + pos + " is out of bounds"); + } + if (pos < lastPosition) { + throw new RuntimeException("vector position " + pos + " < lastPos " + lastPosition); + } - if (pos < lastPosition) { - throw new RuntimeException("vector position " + pos + " < lastPos " + lastPosition); + lastPosition = pos; } - - lastPosition = pos; + if (hasOffsets) { + int startOffset = postings.startOffset(); + int endOffset = postings.endOffset(); + // TODO: these may be too anal...? + /* + if (endOffset < startOffset) { + throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset); + } + if (startOffset < lastStartOffset) { + throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset); + } + lastStartOffset = startOffset; + */ + } } } } - if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + if (docs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { throw new RuntimeException("vector for doc " + j + " references multiple documents!"); } } Index: lucene/src/java/org/apache/lucene/index/DocsAndPositionsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocsAndPositionsEnum.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/DocsAndPositionsEnum.java (working copy) @@ -26,9 +26,18 @@ /** Returns the next position. You should only call this * up to {@link DocsEnum#freq()} times else - * the behavior is not defined. */ + * the behavior is not defined. If positions were not + * indexed this will return -1; this only happens if + * offsets were indexed and you passed needsOffsets=true + * when pulling the enum.
*/ public abstract int nextPosition() throws IOException; + /** Returns start offset for the current position. */ + public abstract int startOffset() throws IOException; + + /** Returns end offset for the current position. */ + public abstract int endOffset() throws IOException; + /** Returns the payload at this position, or null if no * payload was indexed. Only call this once per * position. */ Index: lucene/src/java/org/apache/lucene/index/FilteredTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FilteredTermsEnum.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/FilteredTermsEnum.java (working copy) @@ -171,8 +171,8 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse) throws IOException { - return tenum.docsAndPositions(bits, reuse); + public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + return tenum.docsAndPositions(bits, reuse, needsOffsets); } /** This enum does not support seeking! Index: lucene/src/java/org/apache/lucene/index/FilterIndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (working copy) @@ -176,8 +176,8 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - return in.docsAndPositions(liveDocs, reuse); + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { + return in.docsAndPositions(liveDocs, reuse, needsOffsets); } @Override @@ -259,6 +259,16 @@ } @Override + public int startOffset() throws IOException { + return in.startOffset(); + } + + @Override + public int endOffset() throws IOException { + return in.endOffset(); + } + + @Override public BytesRef getPayload() throws IOException { return in.getPayload(); } Index: lucene/src/java/org/apache/lucene/index/TermsHashPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermsHashPerField.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/TermsHashPerField.java (working copy) @@ -293,7 +293,7 @@ @Override public int[] init() { - if(perField.postingsArray == null) { + if (perField.postingsArray == null) { perField.postingsArray = perField.consumer.createPostingsArray(2); bytesUsed.addAndGet(perField.postingsArray.size * perField.postingsArray.bytesPerPosting()); } @@ -305,8 +305,7 @@ ParallelPostingsArray postingsArray = perField.postingsArray; final int oldSize = perField.postingsArray.size; postingsArray = perField.postingsArray = postingsArray.grow(); - bytesUsed - .addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize))); + bytesUsed.addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize))); return postingsArray.textStarts; } Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy) @@ -160,12 +160,13 @@ /** Get {@link DocsAndPositionsEnum} for the current term. * Do not call this when the enum is unpositioned. 
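A hedged end-to-end sketch of consuming the new enum (reader and liveDocs assumed in scope; the field and term are illustrative and assumed to exist):

    TermsEnum te = MultiFields.getTerms(reader, "body").iterator(null);
    if (te.seekExact(new BytesRef("lucene"), true)) {
      DocsAndPositionsEnum dpe = te.docsAndPositions(liveDocs, null, true);
      if (dpe != null) {  // null here means offsets were not indexed
        int doc;
        while ((doc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          for (int i = 0; i < dpe.freq(); i++) {
            int pos = dpe.nextPosition();
            int start = dpe.startOffset();
            int end = dpe.endOffset();
          }
        }
      }
    }
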
- * This method will only return null if positions were - * not indexed into the postings by this codec. + * This method will only return null if needsOffsets is + * true but offsets were not indexed. * @param liveDocs unset bits are documents that should not * be returned - * @param reuse pass a prior DocsAndPositionsEnum for possible reuse */ - public abstract DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException; + * @param reuse pass a prior DocsAndPositionsEnum for possible reuse + * @param needsOffsets true if offsets are required */ + public abstract DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException; /** * Expert: Returns the TermsEnums internal state to position the TermsEnum * @@ -238,7 +239,7 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) { throw new IllegalStateException("this method should never be called"); } Index: lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy) @@ -21,6 +21,7 @@ import java.util.Comparator; import java.util.Map; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsConsumer; @@ -43,7 +44,11 @@ final DocumentsWriterPerThread.DocState docState; final FieldInvertState fieldState; IndexOptions indexOptions; + private boolean writeFreq; + private boolean writeProx; + private boolean writeOffsets; PayloadAttribute payloadAttribute; + OffsetAttribute offsetAttribute; public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; @@ -51,15 +56,16 @@ this.fieldInfo = fieldInfo; docState = termsHashPerField.docState; fieldState = termsHashPerField.fieldState; - indexOptions = fieldInfo.indexOptions; + setIndexOptions(fieldInfo.indexOptions); } @Override int getStreamCount() { - if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) + if (!writeProx) { return 1; - else + } else { return 2; + } } @Override @@ -74,13 +80,21 @@ return fieldInfo.name.compareTo(other.fieldInfo.name); } + // Called after flush void reset() { // Record, up front, whether our in-RAM format will be // with or without term freqs: - indexOptions = fieldInfo.indexOptions; + setIndexOptions(fieldInfo.indexOptions); payloadAttribute = null; } + private void setIndexOptions(IndexOptions indexOptions) { + this.indexOptions = indexOptions; + writeFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + writeProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + @Override boolean start(IndexableField[] fields, int count) { for(int i=0;i<count;i++) { if (fields[i].fieldType().indexed()) { return true; } } return false; } [...] assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID; + // Term not yet seen in the current doc but previously + // seen
in other doc(s) since the last flush - // Now that we know doc freq for previous doc, - // write it & lastDocCode - if (1 == postings.docFreqs[termID]) - termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1); - else { - termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]); - termsHashPerField.writeVInt(0, postings.docFreqs[termID]); + // Now that we know doc freq for previous doc, + // write it & lastDocCode + if (1 == postings.docFreqs[termID]) { + termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1); + } else { + termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]); + termsHashPerField.writeVInt(0, postings.docFreqs[termID]); + } + postings.docFreqs[termID] = 1; + fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency); + postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; + postings.lastDocIDs[termID] = docState.docID; + if (writeProx) { + writeProx(termID, fieldState.position); + if (writeOffsets) { + writeOffsets(termID, fieldState.offset); } - postings.docFreqs[termID] = 1; - fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency); - postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; - postings.lastDocIDs[termID] = docState.docID; - if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - writeProx(termID, fieldState.position); - } - fieldState.uniqueTermCount++; } else { - fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]); - if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - writeProx(termID, fieldState.position-postings.lastPositions[termID]); - } + assert !writeOffsets; } + fieldState.uniqueTermCount++; + } else { + fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]); + if (writeProx) { + writeProx(termID, fieldState.position-postings.lastPositions[termID]); + } + if (writeOffsets) { + writeOffsets(termID, postings.lastOffsets[termID]); + } } } @Override ParallelPostingsArray createPostingsArray(int size) { - return new FreqProxPostingsArray(size); + return new FreqProxPostingsArray(size, writeFreq, writeProx, writeOffsets); } static final class FreqProxPostingsArray extends ParallelPostingsArray { - public FreqProxPostingsArray(int size) { + public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) { super(size); - docFreqs = new int[size]; + if (writeFreqs) { + docFreqs = new int[size]; + } lastDocIDs = new int[size]; lastDocCodes = new int[size]; - lastPositions = new int[size]; + if (writeProx) { + lastPositions = new int[size]; + if (writeOffsets) { + lastOffsets = new int[size]; + } + } else { + assert !writeOffsets; + } + //System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets); } int docFreqs[]; // # times this term occurs in the current doc int lastDocIDs[]; // Last docID where this term occurred int lastDocCodes[]; // Code for prior doc int lastPositions[]; // Last position where this term occurred + int lastOffsets[]; // Last endOffset where this term occurred @Override ParallelPostingsArray newInstance(int size) { - return new FreqProxPostingsArray(size); + return new FreqProxPostingsArray(size, docFreqs != null, lastPositions != null, lastOffsets != null); } @Override @@ -221,15 +277,36 @@ super.copyTo(toArray, numToCopy); - System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy); System.arraycopy(lastDocIDs, 0, to.lastDocIDs, 0, 
numToCopy); System.arraycopy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy); - System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy); + if (lastPositions != null) { + assert to.lastPositions != null; + System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy); + } + if (lastOffsets != null) { + assert to.lastOffsets != null; + System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy); + } + if (docFreqs != null) { + assert to.docFreqs != null; + System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy); + } } @Override int bytesPerPosting() { - return ParallelPostingsArray.BYTES_PER_POSTING + 4 * RamUsageEstimator.NUM_BYTES_INT; + int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 2 * RamUsageEstimator.NUM_BYTES_INT; + if (lastPositions != null) { + bytes += RamUsageEstimator.NUM_BYTES_INT; + } + if (lastOffsets != null) { + bytes += RamUsageEstimator.NUM_BYTES_INT; + } + if (docFreqs != null) { + bytes += RamUsageEstimator.NUM_BYTES_INT; + } + + return bytes; } } @@ -246,8 +323,33 @@ final TermsConsumer termsConsumer = consumer.addField(fieldInfo); final Comparator termComp = termsConsumer.getComparator(); + // CONFUSING: this.indexOptions holds the index options + // that were current when we first saw this field. But + // it's possible this has changed, eg when other + // documents are indexed that cause a "downgrade" of the + // IndexOptions. So we must decode the in-RAM buffer + // according to this.indexOptions, but then write the + // new segment to the directory according to + // currentFieldIndexOptions: final IndexOptions currentFieldIndexOptions = fieldInfo.indexOptions; + final boolean writeTermFreq = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + final boolean writePositions = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean writeOffsets = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + + final boolean readTermFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + final boolean readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean readOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + + //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets); + + // Make sure FieldInfo.update is working correctly!: + assert !writeTermFreq || readTermFreq; + assert !writePositions || readPositions; + assert !writeOffsets || readOffsets; + + assert !writeOffsets || writePositions; + final Map<Term,Integer> segDeletes; if (state.segDeletes != null && state.segDeletes.terms.size() > 0) { segDeletes = state.segDeletes.terms; } else { segDeletes = null; } @@ -268,12 +370,13 @@ for (int i = 0; i < numTerms; i++) { final int termID = termIDs[i]; + //System.out.println("term=" + termID); // Get BytesRef final int textStart = postings.textStarts[termID]; termsHashPerField.bytePool.setBytesRef(text, textStart); termsHashPerField.initReader(freq, termID, 0); - if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + if (readPositions || readOffsets) { termsHashPerField.initReader(prox, termID, 1); } @@ -303,15 +406,18 @@ int numDocs = 0; long totTF = 0; int docID = 0; - int termFreq = 0; while(true) { + //System.out.println(" cycle"); + final int termDocFreq; if (freq.eof()) { if (postings.lastDocCodes[termID] != -1) { // Return last doc docID = postings.lastDocIDs[termID]; - if (indexOptions != IndexOptions.DOCS_ONLY) { - termFreq = postings.docFreqs[termID]; + if (readTermFreq) { + termDocFreq = postings.docFreqs[termID]; + } else { + termDocFreq = 0; + } postings.lastDocCodes[termID] = -1; } else { @@ -320,14 +426,15 @@ } } else { final int code = freq.readVInt(); - if (indexOptions == IndexOptions.DOCS_ONLY) { + if (!readTermFreq) { docID += code; + termDocFreq = 0; } else { docID += code >>> 1; if ((code & 1) != 0) { - termFreq = 1; + termDocFreq = 1; } else { - termFreq = freq.readVInt(); + termDocFreq = freq.readVInt(); } } @@ -336,7 +443,6 @@ numDocs++; assert docID < state.numDocs: "doc=" + docID + " maxDoc=" + state.numDocs; - final int termDocFreq = termFreq; // NOTE: we could check here if the docID was // deleted, and skip it. However, this is somewhat @@ -362,45 +468,54 @@ state.liveDocs.clear(docID); } - if (currentFieldIndexOptions != IndexOptions.DOCS_ONLY) { - totTF += termDocFreq; - } + totTF += termDocFreq; // Carefully copy over the prox + payload info, // changing the format to match Lucene's segment // format. - if (currentFieldIndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - // we do write positions & payload + if (readPositions || readOffsets) { + // we did record positions (& maybe payload) and/or offsets int position = 0; + int offset = 0; for(int j=0;j<termDocFreq;j++) { - final int code = prox.readVInt(); - position += code >> 1; - - final int payloadLength; final BytesRef thisPayload; - if ((code & 1) != 0) { - // This position has a payload - payloadLength = prox.readVInt(); + if (readPositions) { + final int code = prox.readVInt(); + position += code >> 1; - if (payload == null) { - payload = new BytesRef(); - payload.bytes = new byte[payloadLength]; - } else if (payload.bytes.length < payloadLength) { - payload.grow(payloadLength); + if ((code & 1) != 0) { + + // This position has a payload + final int payloadLength = prox.readVInt(); + + if (payload == null) { + payload = new BytesRef(); + payload.bytes = new byte[payloadLength]; + } else if (payload.bytes.length < payloadLength) { + payload.grow(payloadLength); + } + + prox.readBytes(payload.bytes, 0, payloadLength); + payload.length = payloadLength; + thisPayload = payload; + + } else { + thisPayload = null; } - prox.readBytes(payload.bytes, 0, payloadLength); - payload.length = payloadLength; - thisPayload = payload; - - } else { - payloadLength = 0; - thisPayload = null; + if (readOffsets) { + final int startOffset = offset + prox.readVInt(); + final int endOffset = startOffset + prox.readVInt(); + offset = startOffset; + if (writePositions) { + postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset); + } + } else if (writePositions) { + postingsConsumer.addPosition(position, thisPayload, -1, -1); + } } - - postingsConsumer.addPosition(position, thisPayload); } postingsConsumer.finishDoc(); @@ -413,6 +528,4 @@ termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); } - } - Index: lucene/src/java/org/apache/lucene/index/FieldInfo.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInfo.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/FieldInfo.java (working copy) @@ -38,13 +38,18 @@ * @lucene.experimental */ public static enum IndexOptions { + // NOTE: order is important here; FieldInfo uses this + // order to merge two conflicting IndexOptions (always + // "downgrades" by picking the lowest). /** only documents are indexed: term frequencies and positions are omitted */ // TODO: maybe rename to just DOCS? DOCS_ONLY, /** only documents and term frequencies are indexed: positions are omitted */ DOCS_AND_FREQS, - /** full postings: documents, frequencies, and positions */ - DOCS_AND_FREQS_AND_POSITIONS + /** documents, frequencies and positions */ + DOCS_AND_FREQS_AND_POSITIONS, + /** documents, frequencies, positions and offsets */ + DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, }; /** @@ -95,7 +100,10 @@ if (this.indexOptions != indexOptions) { // downgrade this.indexOptions = this.indexOptions.compareTo(indexOptions) < 0 ? this.indexOptions : indexOptions; - this.storePayloads = false; + if (this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + // cannot store payloads if we don't store positions: + this.storePayloads = false; + } } } assert this.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !this.storePayloads;
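Because IndexOptions is now ordered, every "at least X" test in this patch reduces to compareTo; a small sketch of the idiom (given a FieldInfo and two options a and b):

    IndexOptions opts = fieldInfo.indexOptions;
    boolean hasFreqs     = opts.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    boolean hasPositions = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    boolean hasOffsets   = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    // Merging two conflicting options "downgrades" to the lower one:
    IndexOptions merged = a.compareTo(b) < 0 ? a : b;
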
Index: lucene/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java (working copy) @@ -126,6 +126,16 @@ } @Override + public int startOffset() throws IOException { + return current.startOffset(); + } + + @Override + public int endOffset() throws IOException { + return current.endOffset(); + } + + @Override public boolean hasPayload() { return current.hasPayload(); } Index: lucene/src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexReader.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -788,9 +788,9 @@ /** Returns {@link DocsAndPositionsEnum} for the specified * field & term. This may return null, if either the - * field or term does not exist, or, positions were not - * indexed for this field. */ - public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term) throws IOException { + * field or term does not exist, or needsOffsets is + * true but offsets were not indexed for this field. */ + public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, boolean needsOffsets) throws IOException { assert field != null; assert term != null; final Fields fields = fields(); @@ -799,7 +799,7 @@ if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); if (termsEnum.seekExact(term, true)) { - return termsEnum.docsAndPositions(liveDocs, null); + return termsEnum.docsAndPositions(liveDocs, null, needsOffsets); } } } @@ -830,8 +830,9 @@ * Returns {@link DocsAndPositionsEnum} for the specified field and * {@link TermState}. This may return null, if either the field or the term * does not exist, the {@link TermState} is invalid for the underlying - * implementation, or positions were not indexed for this field. */ - public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, TermState state) throws IOException { + * implementation, or needsOffsets is true but offsets + * were not indexed for this field. */ + public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, TermState state, boolean needsOffsets) throws IOException { assert state != null; assert field != null; final Fields fields = fields(); @@ -840,7 +841,7 @@ if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); termsEnum.seekExact(term, state); - return termsEnum.docsAndPositions(liveDocs, null); + return termsEnum.docsAndPositions(liveDocs, null, needsOffsets); } } return null;
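A hedged usage sketch of the widened IndexReader helper (reader assumed in scope; field/term values illustrative):

    // Positions only; pass true instead of false to also require offsets:
    DocsAndPositionsEnum dpe = reader.termPositionsEnum(
        MultiFields.getLiveDocs(reader), "body", new BytesRef("lucene"), false);
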
Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (working copy) @@ -418,7 +418,7 @@ } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException { MultiDocsAndPositionsEnum docsAndPositionsEnum; // Can only reuse if incoming enum is also a MultiDocsAndPositionsEnum if (reuse != null && reuse instanceof MultiDocsAndPositionsEnum) { @@ -469,7 +469,7 @@ } assert entry.index < docsAndPositionsEnum.subDocsAndPositionsEnum.length: entry.index + " vs " + docsAndPositionsEnum.subDocsAndPositionsEnum.length + "; " + subs.length; - final DocsAndPositionsEnum subPostings = entry.terms.docsAndPositions(b, docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index]); + final DocsAndPositionsEnum subPostings = entry.terms.docsAndPositions(b, docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index], needsOffsets); if (subPostings != null) { docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index] = subPostings; @@ -479,8 +479,8 @@ } else { if (entry.terms.docs(b, null, false) != null) { // At least one of our subs does not store - // positions -- we can't correctly produce a - // MultiDocsAndPositions enum + // offsets or positions -- we can't correctly + // produce a MultiDocsAndPositions enum return null; } } Index: lucene/src/java/org/apache/lucene/index/MultiFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiFields.java (revision 1231394) +++ lucene/src/java/org/apache/lucene/index/MultiFields.java (working copy) @@ -167,14 +167,14 @@ /** Returns {@link DocsAndPositionsEnum} for the specified * field & term. This may return null if the term does * not exist, positions were not indexed, or needsOffsets is * true but offsets were not indexed.
Index: lucene/src/java/org/apache/lucene/index/MultiFields.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/MultiFields.java (revision 1231394)
+++ lucene/src/java/org/apache/lucene/index/MultiFields.java (working copy)
@@ -167,14 +167,14 @@

   /** Returns {@link DocsAndPositionsEnum} for the specified
    *  field & term.  This may return null if the term does
-   *  not exist or positions were not indexed. */
-  public static DocsAndPositionsEnum getTermPositionsEnum(IndexReader r, Bits liveDocs, String field, BytesRef term) throws IOException {
+   *  not exist, positions were not indexed, or needsOffsets
+   *  is true but offsets were not indexed. */
+  public static DocsAndPositionsEnum getTermPositionsEnum(IndexReader r, Bits liveDocs, String field, BytesRef term, boolean needsOffsets) throws IOException {
     assert field != null;
     assert term != null;
     final Terms terms = getTerms(r, field);
     if (terms != null) {
       final TermsEnum termsEnum = terms.iterator(null);
       if (termsEnum.seekExact(term, true)) {
-        return termsEnum.docsAndPositions(liveDocs, null);
+        return termsEnum.docsAndPositions(liveDocs, null, needsOffsets);
       }
     }
     return null;
Index: lucene/src/java/org/apache/lucene/document/Field.java
===================================================================
--- lucene/src/java/org/apache/lucene/document/Field.java (revision 1231394)
+++ lucene/src/java/org/apache/lucene/document/Field.java (working copy)
@@ -83,7 +83,7 @@
     if (tokenStream == null) {
       throw new NullPointerException("tokenStream cannot be null");
     }
-    if (type.indexed() && !type.tokenized()) {
-      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    if (!type.indexed() || !type.tokenized()) {
+      throw new IllegalArgumentException("TokenStream fields must be indexed and tokenized");
     }
Index: lucene/src/test-framework/java/org/apache/lucene/codecs/ramonly/RAMOnlyPostingsFormat.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/codecs/ramonly/RAMOnlyPostingsFormat.java (revision 1231394)
+++ lucene/src/test-framework/java/org/apache/lucene/codecs/ramonly/RAMOnlyPostingsFormat.java (working copy)
@@ -37,6 +37,7 @@
 import org.apache.lucene.codecs.TermsConsumer;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldsEnum;
 import org.apache.lucene.index.IndexFileNames;
@@ -197,6 +198,9 @@

     @Override
     public TermsConsumer addField(FieldInfo field) {
+      if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
+        throw new IllegalArgumentException("this codec cannot index offsets");
+      }
       RAMField ramField = new RAMField(field.name);
       postings.fieldToTerms.put(field.name, ramField);
       termsConsumer.reset(ramField);
@@ -265,7 +269,9 @@
     }

     @Override
-    public void addPosition(int position, BytesRef payload) {
+    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) {
+      assert startOffset == -1;
+      assert endOffset == -1;
       current.positions[posUpto] = position;
       if (payload != null && payload.length > 0) {
         if (current.payloads == null) {
@@ -388,7 +394,10 @@
     }

     @Override
-    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) {
+    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
+      if (needsOffsets) {
+        return null;
+      }
       return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), liveDocs);
     }
   }
@@ -494,6 +503,16 @@
   }

   @Override
+  public int startOffset() {
+    return -1;
+  }
+
+  @Override
+  public int endOffset() {
+    return -1;
+  }
+
+  @Override
   public boolean hasPayload() {
     return current.payloads != null && current.payloads[posUpto-1] != null;
   }
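By way of example (not part of the patch): the corrected guard in Field's TokenStream constructor means a TokenStream value now requires a FieldType that is both indexed and tokenized. A sketch assuming the 4.0-dev document API (TextField.TYPE_UNSTORED); the demo class is hypothetical:

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;

    public class TokenStreamFieldDemo {
      // OK: TextField's type is indexed and tokenized.
      public static Field tokenStreamField(String name, TokenStream stream) {
        return new Field(name, stream, TextField.TYPE_UNSTORED);
      }
      // By contrast, passing a non-tokenized type such as StringField.TYPE_UNSTORED
      // (or any non-indexed type) now throws IllegalArgumentException.
    }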
Index: lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexFieldsWriter.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexFieldsWriter.java (revision 1231394)
+++ lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexFieldsWriter.java (working copy)
@@ -157,8 +157,10 @@
     }

     @Override
-    public void addPosition(int position, BytesRef payload) throws IOException {
+    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
       assert proxOut != null;
+      assert startOffset == -1;
+      assert endOffset == -1;
       //System.out.println("      w pos=" + position + " payl=" + payload);
       final int delta = position - lastPosition;
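A hedged sketch of the widened PostingsConsumer.addPosition contract that PreFlexFieldsWriter implements above: writers that cannot store offsets are handed -1 for both offsets, and the asserts verify it. This assumes startDoc/finishDoc are PostingsConsumer's only other abstract methods at this revision; the class itself is hypothetical and elides the actual encoding:

    import java.io.IOException;
    import org.apache.lucene.codecs.PostingsConsumer;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical positions-only writer: offsets must arrive as -1/-1.
    class PositionsOnlyConsumer extends PostingsConsumer {
      private int lastPosition;

      @Override
      public void startDoc(int docID, int termDocFreq) throws IOException {
        lastPosition = 0;
      }

      @Override
      public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
        assert startOffset == -1 && endOffset == -1 : "this writer cannot index offsets";
        final int delta = position - lastPosition; // positions are delta-encoded
        lastPosition = position;
        // ... writing of delta and optional payload elided ...
      }

      @Override
      public void finishDoc() throws IOException {
      }
    }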
Index: lucene/src/test-framework/java/org/apache/lucene/analysis/CannedAnalyzer.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/analysis/CannedAnalyzer.java (revision 0)
+++ lucene/src/test-framework/java/org/apache/lucene/analysis/CannedAnalyzer.java (working copy)
@@ -0,0 +1,73 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+public class CannedAnalyzer extends Analyzer {
+  private final Token[] tokens;
+
+  public CannedAnalyzer(Token[] tokens) {
+    this.tokens = tokens;
+  }
+
+  @Override
+  public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    return new TokenStreamComponents(new CannedTokenizer(tokens));
+  }
+
+  public static class CannedTokenizer extends Tokenizer {
+    private final Token[] tokens;
+    private int upto = 0;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+    public CannedTokenizer(Token[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      clearAttributes();
+      if (upto < tokens.length) {
+        final Token token = tokens[upto++];
+        // TODO: can we just capture/restoreState so
+        // we get all attrs...?
+        termAtt.setEmpty();
+        termAtt.append(token.toString());
+        posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+        offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      this.upto = 0;
+    }
+  }
+}

Property changes on: lucene/src/test-framework/java/org/apache/lucene/analysis/CannedAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/src/test-framework/java/org/apache/lucene/index/RandomIndexWriter.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/index/RandomIndexWriter.java (revision 1231394)
+++ lucene/src/test-framework/java/org/apache/lucene/index/RandomIndexWriter.java (working copy)
@@ -28,7 +28,6 @@
 import org.apache.lucene.document.DocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexWriter; // javadoc
-import org.apache.lucene.index.DocValues;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
@@ -122,6 +121,10 @@
    * @see IndexWriter#addDocument(Iterable)
    */
   public <T extends IndexableField> void addDocument(final Iterable<T> doc) throws IOException {
+    addDocument(doc, w.getAnalyzer());
+  }
+
+  public <T extends IndexableField> void addDocument(final Iterable<T> doc, Analyzer a) throws IOException {
     if (doDocValues && doc instanceof Document) {
       randomPerDocFieldValues(r, (Document) doc);
     }
@@ -157,9 +160,9 @@
           }
         };
       }
-      });
+      }, a);
     } else {
-      w.addDocument(doc);
+      w.addDocument(doc, a);
     }
     maybeCommit();
Index: lucene/src/test-framework/java/org/apache/lucene/util/LuceneTestCase.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/util/LuceneTestCase.java (revision 1231394)
+++ lucene/src/test-framework/java/org/apache/lucene/util/LuceneTestCase.java (working copy)
@@ -1106,6 +1106,10 @@
       return new Field(name, value, type);
     }

+    // TODO: once all core & test codecs can index
+    // offsets, sometimes randomly turn on offsets if we are
+    // already indexing positions...
+
     FieldType newType = new FieldType(type);
     if (!newType.stored() && random.nextBoolean()) {
       newType.setStored(true); // randomly store it
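Possible usage (not part of the patch): CannedAnalyzer plus the new RandomIndexWriter.addDocument(doc, analyzer) overload let a test index predetermined tokens with explicit position increments and offsets. A sketch assuming LuceneTestCase's random/newDirectory helpers and the Token(String, startOffset, endOffset) constructor of this era; the test class is hypothetical:

    import org.apache.lucene.analysis.CannedAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.RandomIndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.LuceneTestCase;

    public class TestCannedOffsets extends LuceneTestCase {
      public void testCannedOffsets() throws Exception {
        Directory dir = newDirectory();
        RandomIndexWriter w = new RandomIndexWriter(random, dir);

        // "wireless" is a synonym stacked on "wifi": same position, same offsets
        Token t1 = new Token("wifi", 0, 4);
        Token t2 = new Token("wireless", 0, 4);
        t2.setPositionIncrement(0);

        Document doc = new Document();
        doc.add(new Field("body", "wifi", TextField.TYPE_UNSTORED));
        w.addDocument(doc, new CannedAnalyzer(new Token[] { t1, t2 }));

        w.close();
        dir.close();
      }
    }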
Index: lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (revision 1231394)
+++ lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (working copy)
@@ -567,7 +567,10 @@
     if (random.nextBoolean()) {
       if (random.nextBoolean()) {
         // TODO: cast re-use to D&PE if we can...?
-        final DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null);
+        DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, true);
+        if (docsAndPositions == null) {
+          docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, false);
+        }
         if (docsAndPositions != null) {
           return docsAndPositions;
         }
@@ -586,7 +589,10 @@
     if (random.nextBoolean()) {
      if (random.nextBoolean()) {
         // TODO: cast re-use to D&PE if we can...?
-        final DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null);
+        DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, true);
+        if (docsAndPositions == null) {
+          docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, false);
+        }
         if (docsAndPositions != null) {
           return docsAndPositions;
         }
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (revision 1231394)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (working copy)
@@ -60,22 +60,23 @@
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
     while((text = termsEnum.next()) != null) {
-      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
+      final boolean hasOffsets;
+      if (dpEnum == null) {
+        hasOffsets = false;
+        dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
+      } else {
+        hasOffsets = true;
+      }
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
-      final OffsetAttribute offsetAtt;
-      if (dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
-        offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
-      } else {
-        offsetAtt = null;
-      }
       for (int j = 0; j < freq; j++) {
         int pos = dpEnum.nextPosition();
         Token token;
-        if (offsetAtt != null) {
+        if (hasOffsets) {
           token = new Token(text.utf8ToString(),
-                            offsetAtt.startOffset(),
-                            offsetAtt.endOffset());
+                            dpEnum.startOffset(),
+                            dpEnum.endOffset());
         } else {
           token = new Token();
           token.setEmpty().append(text.utf8ToString());
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 1231394)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy)
@@ -126,7 +126,7 @@
   private static boolean hasPositions(Terms vector) throws IOException {
     final TermsEnum termsEnum = vector.iterator(null);
     if (termsEnum.next() != null) {
-      DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
+      DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
       if (dpEnum != null) {
         int pos = dpEnum.nextPosition();
         if (pos >= 0) {
@@ -219,22 +218,21 @@
     DocsAndPositionsEnum dpEnum = null;
     while ((text = termsEnum.next()) != null) {
-      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
-      if (dpEnum == null || (!dpEnum.attributes().hasAttribute(OffsetAttribute.class))) {
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
+      if (dpEnum == null) {
        throw new IllegalArgumentException(
            "Required TermVector Offset information was not found");
       }
       final String term = text.utf8ToString();
-      final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
       for (int posUpto = 0; posUpto < freq; posUpto++) {
[...]
        theOffsets = null;
@@ -317,8 +315,8 @@
       }
       if (theOffsets != null) {
-        theOffsets.add("start", offsetAtt.startOffset());
-        theOffsets.add("end", offsetAtt.endOffset());
+        theOffsets.add("start", dpEnum.startOffset());
+        theOffsets.add("end", dpEnum.endOffset());
       }
     }
   }
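Finally, a reader-side sketch (not part of the patch) of the pattern the highlighter changes above follow: pull a document's term vector and, when the vector stored offsets, read them via startOffset()/endOffset() on the enum itself. Assumes IndexReader.getTermVectors(int) of this revision; the class name is hypothetical:

    import java.io.IOException;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    public class VectorOffsetsDump {
      // Prints every vectored term of one document with its character offsets.
      public static void dump(IndexReader reader, int docID, String field) throws IOException {
        Fields vectors = reader.getTermVectors(docID);
        Terms vector = vectors == null ? null : vectors.terms(field);
        if (vector == null) {
          return; // document has no term vector for this field
        }
        TermsEnum termsEnum = vector.iterator(null);
        DocsAndPositionsEnum dpEnum = null;
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
          dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
          if (dpEnum == null) {
            continue; // this vector stored no offsets for the term
          }
          dpEnum.nextDoc();
          final int freq = dpEnum.freq();
          for (int i = 0; i < freq; i++) {
            dpEnum.nextPosition();
            System.out.println(text.utf8ToString() + " @ "
                + dpEnum.startOffset() + "-" + dpEnum.endOffset());
          }
        }
      }
    }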