Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java =================================================================== --- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java (revision 0) +++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java (revision 0) @@ -0,0 +1,268 @@ +package org.apache.lucene.store.instantiated; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.BulkPostingsEnum; + +public class InstantiatedBulkPostingsEnum extends BulkPostingsEnum { + + private final DocDeltasReader docDeltasReader; + private final FreqsReader freqsReader; + private final PositionDeltasReader positionDeltasReader; + private final String field; + + private InstantiatedTerm term; + + public InstantiatedBulkPostingsEnum(String field, boolean doFreq, boolean doPositions) { + this.field = field; + docDeltasReader = new DocDeltasReader(); + if (doFreq) { + freqsReader = new FreqsReader(); + } else { + freqsReader = null; + } + + if (doPositions) { + positionDeltasReader = new PositionDeltasReader(); + } else { + positionDeltasReader = null; + } + } + + public boolean canReuse(String field, boolean doFreq, boolean doPositions) { + return field.equals(this.field) && (doFreq == (freqsReader != null)) && (doPositions == (positionDeltasReader != null)); + } + + private class DocDeltasReader extends BlockReader { + private final int[] buffer = new int[64]; + private InstantiatedTermDocumentInformation[] docs; + private int docUpto; + private int lastDocID; + private int limit; + + public void reset(InstantiatedTerm term) { + docUpto = 0; + lastDocID = 0; + docs = term.getAssociatedDocuments(); + fill(); + } + + public void jump(int docUpto, int lastDocID) { + this.lastDocID = lastDocID; + this.docUpto = docUpto; + this.limit = 0; + } + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int v) { + throw new UnsupportedOperationException(); + } + + @Override + public int end() { + return limit; + } + + @Override + public int fill() { + final int chunk = Math.min(buffer.length, docs.length-docUpto); + for(int i=0;i getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } Index: contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java =================================================================== --- contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 1044119) +++ contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) @@ -43,6 +43,7 @@ import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.Term; import 
org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; @@ -907,6 +908,17 @@ } @Override + public BulkPostingsEnum bulkPostings(BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) { + MemoryBulkPostingsEnum postingsEnum; + if (reuse == null || !(reuse instanceof MemoryBulkPostingsEnum) || !((MemoryBulkPostingsEnum) reuse).canReuse(info, doFreqs, doPositions)) { + postingsEnum = new MemoryBulkPostingsEnum(info, doFreqs, doPositions); + } else { + postingsEnum = (MemoryBulkPostingsEnum) reuse; + } + return postingsEnum.reset(info.sortedTerms[termUpto].getValue()); + } + + @Override public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @@ -1004,6 +1016,173 @@ return null; } } + + private class MemoryBulkPostingsEnum extends BulkPostingsEnum { + + private final DocDeltasReader docDeltasReader; + private final FreqsReader freqsReader; + private final PositionDeltasReader positionDeltasReader; + private final Info info; + + public MemoryBulkPostingsEnum(Info info, boolean doFreqs, boolean doPositions) { + this.info = info; + docDeltasReader = new DocDeltasReader(); + if (doFreqs) { + freqsReader = new FreqsReader(); + } else { + freqsReader = null; + } + + if (doPositions) { + positionDeltasReader = new PositionDeltasReader(); + } else { + positionDeltasReader = null; + } + } + + public boolean canReuse(Info info, boolean doFreq, boolean doPositions) { + return this.info == info && (doFreq == (freqsReader != null)) && (doPositions == (positionDeltasReader != null)); + } + + private class DocDeltasReader extends BlockReader { + private final int[] buffer = new int[1]; + + public void reset() { + } + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + throw new UnsupportedOperationException(); + } + + @Override + public int end() { + return 1; + } + + @Override + public int fill() { + return 1; + } + } + + private class FreqsReader extends BlockReader { + private final int[] buffer = new int[1]; + + public void reset(int freq) { + buffer[0] = freq; + } + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + throw new UnsupportedOperationException(); + } + + @Override + public int end() { + return 1; + } + + @Override + public int fill() { + return 1; + } + } + + private class PositionDeltasReader extends BlockReader { + private final int[] buffer = new int[64]; + private ArrayIntList positions; + private int posUpto; + private int limit; + + public void reset(ArrayIntList positions) { + posUpto = 0; + this.positions = positions; + fill(); + } + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + throw new UnsupportedOperationException(); + } + + @Override + public int end() { + return limit; + } + + @Override + public int fill() { + final int chunk = Math.min(buffer.length, positions.size() - posUpto); + for(int i=0;i= end) { + offset = 0; + end = fill(); + if (offset >= end) { + // nocommit cleanup + throw new IOException("no more ints"); + } + } + setOffset(1+offset); + return buffer[offset]; + } + + /** Reads long as 1 or 2 ints, and can only use 61 of + * the 64 long bits. 
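+ * <p>Encoding sketch, inferred from the decode below rather than a separate
+ * spec: a value that fits in 30 bits is written as a single int with its low
+ * bit clear; otherwise the first int carries the value's low 30 bits with its
+ * low bit set, and a second int carries the remaining high bits.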
*/ + public long readVLong() throws IOException { + int offset = offset(); + + final int v = next(); + if ((v & 1) == 0) { + return v >> 1; + } else { + final long v2 = next(); + return (v2 << 30) | (v >> 1); + } + } + } + + public abstract BlockReader getDocDeltasReader() throws IOException; + + /** Returns null if per-document term freq is not indexed */ + public abstract BlockReader getFreqsReader() throws IOException; + + /** Returns null if positions are not indexed */ + public abstract BlockReader getPositionDeltasReader() throws IOException; + + public static class JumpResult { + public int count; + public int docID; + } + + /** Only call this if the docID you seek is after the last + * document in the buffer. This call does not position + * exactly; instead, it jumps forward when possible, + * returning the docID and ord it had jumped to, seeking + * all of the BlockReaders accordingly. Note that if a + * seek did occur, you must call .offset() and .limit() + * on each BlockReader. If null is returned then + * skipping is not possible, ie you should just scan + * yourself). */ + abstract public JumpResult jump(int target, int curCount) throws IOException; +} Property changes on: src/java/org/apache/lucene/index/BulkPostingsEnum.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- src/java/org/apache/lucene/index/CheckIndex.java (revision 1044119) +++ src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -770,7 +770,7 @@ } if (totDocCount != totDocCount2) { - throw new RuntimeException("search to seek terms produced wrong number of hits: " + totDocCount + " vs " + totDocCount2); + throw new RuntimeException("search by seek term produced wrong number of hits: " + totDocCount + " vs " + totDocCount2 + " field=" + field); } } } Index: src/java/org/apache/lucene/index/DocsEnum.java =================================================================== --- src/java/org/apache/lucene/index/DocsEnum.java (revision 1044119) +++ src/java/org/apache/lucene/index/DocsEnum.java (working copy) @@ -44,6 +44,8 @@ return atts; } + // nocommit -- delete all after here: + // TODO: maybe add bulk read only docIDs (for eventual // match-only scoring) @@ -83,6 +85,7 @@ *

NOTE: the default impl simply delegates to {@link * #nextDoc}, but subclasses may do this more * efficiently. */ + // nocommit -- remove this public int read() throws IOException { int count = 0; final int[] docs = bulkResult.docs.ints; Index: src/java/org/apache/lucene/index/FilterIndexReader.java =================================================================== --- src/java/org/apache/lucene/index/FilterIndexReader.java (revision 1044119) +++ src/java/org/apache/lucene/index/FilterIndexReader.java (working copy) @@ -170,11 +170,18 @@ } @Override + public BulkPostingsEnum bulkPostings(BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + return in.bulkPostings(reuse, doFreqs, doPositions); + } + + @Override public Comparator getComparator() throws IOException { return in.getComparator(); } } + // nocommit need FilteredBulkDocsEnum + /** Base class for filtering {@link DocsEnum} implementations. */ public static class FilterDocsEnum extends DocsEnum { protected DocsEnum in; Index: src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- src/java/org/apache/lucene/index/IndexReader.java (revision 1044119) +++ src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -1048,6 +1048,22 @@ } } + // nocommit jdocs + public BulkPostingsEnum bulkTermPostingsEnum(String field, BytesRef term, boolean doFreqs, boolean doPositions) throws IOException { + assert field != null; + assert term != null; + final Fields fields = fields(); + if (fields == null) { + return null; + } + final Terms terms = fields.terms(field); + if (terms != null) { + return terms.bulkPostings(term, null, doFreqs, doPositions); + } else { + return null; + } + } + /** Returns {@link DocsAndPositionsEnum} for the specified * field & term. This may return null, if either the * field or term does not exist, or, positions were not Index: src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java =================================================================== --- src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java (revision 0) @@ -0,0 +1,274 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; + +public final class MultiBulkPostingsEnum extends BulkPostingsEnum { + private EnumWithSlice[] subs; + int numSubs; + + private final DocDeltasReader docDeltasReader = new DocDeltasReader(); + private final FreqsReader freqsReader = new FreqsReader(); + private final PositionsReader positionsReader = new PositionsReader(); + + MultiBulkPostingsEnum reset(final EnumWithSlice[] subs, final int numSubs, boolean doFreqs, boolean doPositions) throws IOException { + this.numSubs = numSubs; + this.subs = new EnumWithSlice[subs.length]; + for(int i=0;i offset) { + return doCopy(offset, limit); + } + } + } + int limit = current.fill(); + //int offset = current.offset(); + return doCopy(0, limit); + } + } + + protected abstract BlockReader getBlockReader(int upto) throws IOException; + protected void onFill() {}; + } + + private class DocDeltasReader extends MultiBlockReader { + int lastDocID; + int lastSeg; + + @Override + protected int getBufferSize() throws IOException { + int maxBufferSize = 0; + for(int sub=0;sub 0; - + seekDir(in, dirOffset); // Read directory @@ -367,7 +367,6 @@ int lo = 0; // binary search int hi = numIndexTerms - 1; assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval; - while (hi >= lo) { int mid = (lo + hi) >>> 1; Index: src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java =================================================================== --- src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java (working copy) @@ -21,6 +21,7 @@ import java.io.Closeable; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.store.IndexInput; @@ -50,6 +51,12 @@ * TermState may be reused. */ public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException; + // nocommit jdocs + // nocommit make abstract + public BulkPostingsEnum bulkPostings(FieldInfo fieldInfo, TermState state, BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + throw new UnsupportedOperationException(); + } + /** Must fully consume state, since after this call that * TermState may be reused. 
*/ public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; Index: src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java =================================================================== --- src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (working copy) @@ -25,6 +25,7 @@ import java.util.Comparator; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -319,9 +320,9 @@ @Override public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { // Check cache - fieldTerm.term = term; TermState cachedState; if (useCache) { + fieldTerm.term = term; cachedState = termsCache.get(fieldTerm); if (cachedState != null) { state.copy(cachedState); @@ -387,7 +388,6 @@ while(next() != null) { final int cmp = termComp.compare(bytesReader.term, term); if (cmp == 0) { - if (doSeek && useCache) { // Store in cache FieldAndTerm entryKey = new FieldAndTerm(fieldTerm); @@ -396,7 +396,6 @@ cachedState.filePointer = in.getFilePointer(); termsCache.put(entryKey, cachedState); } - return SeekStatus.FOUND; } else if (cmp > 0) { return SeekStatus.NOT_FOUND; @@ -500,6 +499,12 @@ } @Override + public BulkPostingsEnum bulkPostings(BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + BulkPostingsEnum postingsEnum = postingsReader.bulkPostings(fieldInfo, state, reuse, doFreqs, doPositions); + return postingsEnum; + } + + @Override public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { if (fieldInfo.omitTermFreqAndPositions) { return null; Index: src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (working copy) @@ -178,7 +178,7 @@ termWriter.write(text); out.writeVInt(numDocs); - + //System.out.println("term=" + text.utf8ToString() + " df=" + numDocs); postingsWriter.finishTerm(numDocs, isIndexTerm); numTerms++; } Index: src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java =================================================================== --- src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java (working copy) @@ -24,8 +24,8 @@ import java.io.IOException; import org.apache.lucene.index.codecs.sep.IntIndexInput; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.IntsRef; /** Abstract base class that reads fixed-size blocks of ints * from an IndexInput. 
While this is a simple approach, a @@ -42,7 +42,9 @@ public FixedIntBlockIndexInput(final IndexInput in) throws IOException { this.in = in; - blockSize = in.readVInt(); + //blockSize = in.readVInt(); + blockSize = in.readInt(); + //System.out.println("BLOCK size " + blockSize); } @Override @@ -67,80 +69,72 @@ public interface BlockReader { public void readBlock() throws IOException; + // nocommit -- need seek here so mmapdir "knows" } - private static class Reader extends IntIndexInput.Reader { + private static class Reader extends BulkPostingsEnum.BlockReader { private final IndexInput in; protected final int[] pending; - int upto; + private int offset; - private boolean seekPending; - private long pendingFP; - private int pendingUpto; private long lastBlockFP; private final BlockReader blockReader; private final int blockSize; - private final IntsRef bulkResult = new IntsRef(); public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) - throws IOException { + throws IOException { this.in = in; this.pending = pending; this.blockSize = pending.length; - bulkResult.ints = pending; this.blockReader = blockReader; - upto = blockSize; } - void seek(final long fp, final int upto) { - pendingFP = fp; - pendingUpto = upto; - seekPending = true; + void seek(final long fp, final int upto) throws IOException { + offset = upto; + if (fp != lastBlockFP) { + // Seek to new block; this may in fact be the next + // block ie when caller is doing sequential scan (eg + // PrefixQuery) + //System.out.println(" seek block fp=" + fp + " vs last=" + lastBlockFP + " upto=" + upto); + in.seek(fp); + fill(); + } else { + // Seek within current block + //System.out.println(" seek in-block fp=" + fp + " upto=" + offset); + } } - private void maybeSeek() throws IOException { - if (seekPending) { - if (pendingFP != lastBlockFP) { - // need new block - in.seek(pendingFP); - lastBlockFP = pendingFP; - blockReader.readBlock(); - } - upto = pendingUpto; - seekPending = false; - } + @Override + public int[] getBuffer() { + return pending; } @Override - public int next() throws IOException { - this.maybeSeek(); - if (upto == blockSize) { - lastBlockFP = in.getFilePointer(); - blockReader.readBlock(); - upto = 0; - } + public int end() { + return blockSize; + } - return pending[upto++]; + @Override + public int offset() { + return offset; } @Override - public IntsRef read(final int count) throws IOException { - this.maybeSeek(); - if (upto == blockSize) { - blockReader.readBlock(); - upto = 0; - } - bulkResult.offset = upto; - if (upto + count < blockSize) { - bulkResult.length = count; - upto += count; - } else { - bulkResult.length = blockSize - upto; - upto = blockSize; - } + public void setOffset(int offset) { + this.offset = offset; + } - return bulkResult; + @Override + public int fill() throws IOException { + //System.out.println("fii.fill seekPending=" + seekPending + " set lastFP=" + pendingFP + " this=" + this); + // nocommit -- not great that we do this on each + // fill -- but we need it to detect seek w/in block + // case: + // nocommit: can't we += blockNumBytes instead? 
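+ // For reference, a hypothetical consumer drains a BlockReader like this
+ // (sketch only; names are illustrative and not part of this patch):
+ //
+ //   int off = reader.offset();
+ //   int end = reader.end();
+ //   final int[] buf = reader.getBuffer();
+ //   while (moreIntsNeeded) {
+ //     if (off == end) {
+ //       end = reader.fill();   // a freshly filled block is consumed from 0
+ //       off = 0;
+ //     }
+ //     final int value = buf[off++];
+ //     // ... consume value ...
+ //   }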
+ lastBlockFP = in.getFilePointer(); + blockReader.readBlock(); + return blockSize; } } @@ -150,10 +144,14 @@ @Override public void read(final IndexInput indexIn, final boolean absolute) throws IOException { + // nocommit -- somehow we should share the "upto" for + // doc & freq since they will always be "in sync" if (absolute) { fp = indexIn.readVLong(); upto = indexIn.readVInt(); } else { + // nocommit -- can't this be more efficient? read a + // single byte and check a bit? block size is 128... final long delta = indexIn.readVLong(); if (delta == 0) { // same block @@ -168,7 +166,7 @@ } @Override - public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException { + public void read(final BulkPostingsEnum.BlockReader indexIn, final boolean absolute) throws IOException { if (absolute) { fp = indexIn.readVLong(); upto = indexIn.next(); @@ -187,7 +185,7 @@ } @Override - public void seek(final IntIndexInput.Reader other) throws IOException { + public void seek(final BulkPostingsEnum.BlockReader other) throws IOException { ((Reader) other).seek(fp, upto); } @@ -205,5 +203,10 @@ other.upto = upto; return other; } + + @Override + public String toString() { + return "FixedBlockIndex(fp=" + fp + " offset=" + upto + ")"; + } } } Index: src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java =================================================================== --- src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (working copy) @@ -44,7 +44,7 @@ protected FixedIntBlockIndexOutput(IndexOutput out, int fixedBlockSize) throws IOException { blockSize = fixedBlockSize; this.out = out; - out.writeVInt(blockSize); + out.writeInt(blockSize); buffer = new int[blockSize]; } @@ -111,6 +111,11 @@ lastUpto = upto; lastFP = fp; } + + @Override + public String toString() { + return "fp=" + fp + " idx=" + upto; + } } @Override Index: src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java =================================================================== --- src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java (working copy) @@ -23,9 +23,9 @@ import java.io.IOException; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.codecs.sep.IntIndexInput; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.IntsRef; // TODO: much of this can be shared code w/ the fixed case @@ -72,94 +72,97 @@ public void seek(long pos) throws IOException; } - public static class Reader extends IntIndexInput.Reader { + public static class Reader extends BulkPostingsEnum.BlockReader { private final IndexInput in; public final int[] pending; - int upto; private boolean seekPending; private long pendingFP; - private int pendingUpto; + private int offset; private long lastBlockFP; private int blockSize; private final BlockReader blockReader; - private final IntsRef bulkResult = new IntsRef(); + private int limit; public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) throws IOException { this.in = in; this.pending = pending; - bulkResult.ints = pending; this.blockReader = blockReader; } void seek(final long fp, final int upto) throws IOException { + //System.out.println("vintb seek fp=" + fp + " upto=" + upto); // TODO: should 
we do this in real-time, not lazy? pendingFP = fp; - pendingUpto = upto; - assert pendingUpto >= 0: "pendingUpto=" + pendingUpto; - seekPending = true; - } + offset = upto; + assert offset >= 0: "pendingUpto=" + offset; + if (pendingFP != lastBlockFP) { + // Clear current block + seekPending = true; + // System.out.println(" seekPending=true now fill"); + fill(); + } else { + //System.out.println(" no seekPending"); + } + //System.out.println(" now offset=" + offset + " limit=" + limit); - private final void maybeSeek() throws IOException { - if (seekPending) { - if (pendingFP != lastBlockFP) { - // need new block - in.seek(pendingFP); - blockReader.seek(pendingFP); - lastBlockFP = pendingFP; - blockSize = blockReader.readBlock(); - } - upto = pendingUpto; + // This is necessary for int encoders that are + // non-causal, ie must see future int values to + // encode the current ones. + while(offset >= limit) { + offset -= limit; + //System.out.println(" non-causal fill"); + fill(); + } + //System.out.println(" after skip bock offset=" + offset); + } - // TODO: if we were more clever when writing the - // index, such that a seek point wouldn't be written - // until the int encoder "committed", we could avoid - // this (likely minor) inefficiency: + @Override + public int[] getBuffer() { + return pending; + } - // This is necessary for int encoders that are - // non-causal, ie must see future int values to - // encode the current ones. - while(upto >= blockSize) { - upto -= blockSize; - lastBlockFP = in.getFilePointer(); - blockSize = blockReader.readBlock(); - } - seekPending = false; - } + @Override + public int end() { + return limit; } @Override - public int next() throws IOException { - this.maybeSeek(); - if (upto == blockSize) { - lastBlockFP = in.getFilePointer(); - blockSize = blockReader.readBlock(); - upto = 0; - } + public int offset() { + return offset; + } - return pending[upto++]; + @Override + public void setOffset(int offset) { + this.offset = offset; } @Override - public IntsRef read(final int count) throws IOException { - this.maybeSeek(); - if (upto == blockSize) { - lastBlockFP = in.getFilePointer(); + public int fill() throws IOException { + if (seekPending) { + seekPending = false; + in.seek(pendingFP); + blockReader.seek(pendingFP); + lastBlockFP = pendingFP; blockSize = blockReader.readBlock(); - upto = 0; - } - bulkResult.offset = upto; - if (upto + count < blockSize) { - bulkResult.length = count; - upto += count; + + // TODO: if we were more clever when writing the + // index, such that a seek point wouldn't be written + // until the int encoder "committed", we could avoid + // this (likely minor) inefficiency: + + //System.out.println("varintblock.fill offset=" + offset + " vs blockSize=" + blockSize); + } else { - bulkResult.length = blockSize - upto; - upto = blockSize; + // nocommit -- not great that we do this on each + // fill -- but we need it to detect seek w/in block + // case: + lastBlockFP = in.getFilePointer(); + blockSize = blockReader.readBlock(); } - - return bulkResult; + return limit = blockSize; } } @@ -189,7 +192,7 @@ } @Override - public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException { + public void read(final BulkPostingsEnum.BlockReader indexIn, final boolean absolute) throws IOException { if (absolute) { fp = indexIn.readVLong(); upto = indexIn.next()&0xFF; @@ -212,7 +215,7 @@ } @Override - public void seek(final IntIndexInput.Reader other) throws IOException { + public void seek(final 
BulkPostingsEnum.BlockReader other) throws IOException { ((Reader) other).seek(fp, upto); } Index: src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java =================================================================== --- src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FieldsEnum; @@ -42,6 +43,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.index.codecs.standard.DefaultSkipListReader; /** Exposes flex API on a pre-flex index, as a codec. * @lucene.experimental @@ -971,6 +973,17 @@ } return docsPosEnum.reset(termEnum, skipDocs); } + + @Override + public BulkPostingsEnum bulkPostings(BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + PreBulkPostingsEnum postingsEnum; + if (reuse == null || !(reuse instanceof PreBulkPostingsEnum) || !((PreBulkPostingsEnum) reuse).canReuse(fieldInfo, freqStream, doFreqs, doPositions)) { + postingsEnum = new PreBulkPostingsEnum(fieldInfo.omitTermFreqAndPositions, doFreqs, doPositions); + } else { + postingsEnum = (PreBulkPostingsEnum) reuse; + } + return postingsEnum.reset(fieldInfo, termEnum); + } } private final class PreDocsEnum extends DocsEnum { @@ -1103,4 +1116,324 @@ return payload; } } + + static final int BULK_BUFFER_SIZE = 64; + + // Bulk postings API + private final class PreBulkPostingsEnum extends BulkPostingsEnum { + private final IndexInput freqIn; + private final IndexInput proxIn; + + final IndexInput startFreqIn; + private final boolean omitTF; + + boolean storePayloads; // does current field store payloads? 
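+ // nocommit -- canReuse below compares this.freqIn (a clone) against
+ // startFreqIn, which can never be equal, so reuse never happens; it likely
+ // means to compare the freqin argument.  The (!doFreqs || freqsReader == null)
+ // checks also look inverted relative to InstantiatedBulkPostingsEnum.canReuse
+ // (doFreq == (freqsReader != null)).  The same pattern appears in
+ // StandardPostingsReader.SegmentBulkPostingsEnum.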
+ + int ord; // how many docs we've read + int docFreq; + + long freqOffset; + long proxOffset; + int skipOffset; + + boolean skipped; + DefaultSkipListReader skipper; + private int payloadLength; + + private final DocDeltasReader docDeltasReader; + private final FreqsReader freqsReader; + private final PositionsReader positionDeltasReader; + + private boolean docsPending, freqsPending; + + public PreBulkPostingsEnum(boolean omitTF, boolean doFreqs, boolean doPositions) throws IOException { + startFreqIn = PreFlexFields.this.freqStream; + this.freqIn = (IndexInput) PreFlexFields.this.freqStream.clone(); + this.omitTF = omitTF; + + docDeltasReader = new DocDeltasReader(); + if (doFreqs && !omitTF) { + freqsReader = new FreqsReader(); + } else { + freqsReader = null; + } + + if (doPositions && !omitTF) { + this.proxIn = (IndexInput) PreFlexFields.this.proxStream.clone(); + positionDeltasReader = new PositionsReader(); + } else { + this.proxIn = null; + positionDeltasReader = null; + } + } + + public boolean canReuse(FieldInfo fieldInfo, IndexInput freqin, boolean doFreqs, boolean doPositions) { + return freqIn == startFreqIn && + (!doFreqs || freqsReader == null) && + (!doPositions || positionDeltasReader == null) && + (omitTF == fieldInfo.omitTermFreqAndPositions); + } + + final void read() throws IOException { + try { + if (freqsReader == null) { + // Consumer only wants doc deltas + assert !docsPending; + if (omitTF) { + // Index only stores doc deltas + for(int i=0;i>> 1; + if ((code & 1) == 0) { + freqIn.readVInt(); + } + } + } + docsPending = true; + } else { + // Consumer wants both + assert !docsPending; + assert !freqsPending; + for(int i=0;i>> 1; + if ((code & 1) == 0) { + freqsReader.buffer[i] = freqIn.readVInt(); + } else { + freqsReader.buffer[i] = 1; + } + } + docsPending = true; + freqsPending = true; + } + ord += BULK_BUFFER_SIZE; + } catch (IOException ioe) { + if (freqIn.getFilePointer() != freqIn.length()) { + throw ioe; + } + } + } + + class DocDeltasReader extends BulkPostingsEnum.BlockReader { + private final int[] buffer = new int[BULK_BUFFER_SIZE]; + private int limit; + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int end() { + return limit; + } + + @Override + public int fill() throws IOException { + if (!docsPending) { + read(); + } + docsPending = false; + limit = BULK_BUFFER_SIZE; + return BULK_BUFFER_SIZE; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + assert offset == 0; + } + } + + class FreqsReader extends BulkPostingsEnum.BlockReader { + private final int[] buffer = new int[BULK_BUFFER_SIZE]; + private int limit; + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int end() { + return limit; + } + + @Override + public int fill() throws IOException { + if (!freqsPending) { + read(); + } + freqsPending = false; + limit = BULK_BUFFER_SIZE; + return BULK_BUFFER_SIZE; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + assert offset == 0; + } + } + + class PositionsReader extends BulkPostingsEnum.BlockReader { + final int[] buffer = new int[BULK_BUFFER_SIZE]; + int limit; + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int end() { + return limit; + } + + @Override + public int fill() throws IOException { + // nocommit -- must "handle" EOF here -- cannot + // change old index format! 
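+ // One possible guard, mirroring read() above (sketch only): wrap the decode
+ // loop in try/catch and rethrow unless the prox stream is truly exhausted:
+ //   } catch (IOException ioe) {
+ //     if (proxIn.getFilePointer() != proxIn.length()) {
+ //       throw ioe;
+ //     }
+ //   }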
+ if (storePayloads) { + for(int i=0;i>> 1; + if ((code & 1) != 0) { + payloadLength = proxIn.readVInt(); + } + if (payloadLength != 0) { + // skip payload + proxIn.seek(proxIn.getFilePointer()+payloadLength); + } + } + } else { + for(int i=0;i 0) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), tis.getMaxSkipLevels(), tis.getSkipInterval()); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped since reset() was called, so now we + // load the skip data for this posting + skipper.init(freqOffset + skipOffset, + freqOffset, proxOffset, + docFreq, storePayloads); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + // nocommit rename ord -> count + assert curCount == ord: "ord=" + ord + " curCount=" + curCount; + + if (newOrd > ord) { + + // Skipper moved + freqIn.seek(skipper.getFreqPointer()); + if (freqsReader != null) { + freqsReader.limit = 0; + } + docDeltasReader.limit = 0; + + if (positionDeltasReader != null) { + positionDeltasReader.limit = 0; + proxIn.seek(skipper.getProxPointer()); + } + + jumpResult.count = ord = newOrd; + jumpResult.docID = skipper.getDoc(); + + return jumpResult; + } + } + + // no jump occurred + return null; + } + } } Index: src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java =================================================================== --- src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.codecs.TermState; @@ -30,6 +31,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.ArrayUtil; /** Concrete class that reads the current doc/freq/skip * postings format @@ -200,6 +202,27 @@ } } + // TODO: we could actually reuse, by having TL that + // holds the last wrapped reuse, and vice-versa + @Override + public BulkPostingsEnum bulkPostings(FieldInfo field, TermState _termState, BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + PulsingTermState termState = (PulsingTermState) _termState; + if (termState.docFreq <= maxPulsingDocFreq) { + if (reuse instanceof PulsingBulkPostingsEnum && ((PulsingBulkPostingsEnum) reuse).docDeltas.length == maxPulsingDocFreq) { + return ((PulsingBulkPostingsEnum) reuse).reset(termState, doFreqs, doPositions); + } else { + PulsingBulkPostingsEnum postingsEnum = new PulsingBulkPostingsEnum(maxPulsingDocFreq); + return postingsEnum.reset(termState, doFreqs, doPositions); + } + } else { + if (reuse instanceof PulsingBulkPostingsEnum) { + return wrappedPostingsReader.bulkPostings(field, termState.wrappedTermState, null, doFreqs, doPositions); + } else { + return wrappedPostingsReader.bulkPostings(field, termState.wrappedTermState, reuse, doFreqs, doPositions); + } + } + } + // TODO: -- not great that we can't always reuse @Override public DocsAndPositionsEnum docsAndPositions(FieldInfo field, TermState _termState, Bits skipDocs, 
DocsAndPositionsEnum reuse) throws IOException { @@ -226,8 +249,6 @@ private Document doc; private PulsingTermState state; - public void close() {} - PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) { // TODO: -- not great we have to clone here -- // merging is wasteful; TermRangeQuery too @@ -290,6 +311,159 @@ } } + static class PulsingBulkPostingsEnum extends BulkPostingsEnum { + private Document doc; + private PulsingTermState state; + private int numDocs; + private final int[] docDeltas; + private final int[] freqs; + private int[] positionDeltas; + private int numPositions; + private boolean doFreqs; + private boolean doPositions; + + public PulsingBulkPostingsEnum(int maxFreq) { + docDeltas = new int[maxFreq]; + freqs = new int[maxFreq]; + positionDeltas = new int[maxFreq]; + } + + PulsingBulkPostingsEnum reset(PulsingTermState termState, boolean doFreqs, boolean doPositions) { + numDocs = termState.docFreq; + this.doFreqs = doFreqs; + this.doPositions = doPositions; + assert numDocs <= docDeltas.length; + int lastDocID = 0; + numPositions = 0; + for(int i=0;i 0; + if (doPositions) { + final Position[] positions = termState.docs[i].positions; + int lastPos = 0; + for(int posIndex=0;posIndex> 1; - } else { - final long v2 = next(); - return (v2 << 30) | (v >> 1); - } - } - - /** Reads next chunk of ints */ - private IntsRef bulkResult; - - /** Read up to count ints. */ - public IntsRef read(int count) throws IOException { - if (bulkResult == null) { - bulkResult = new IntsRef(); - bulkResult.ints = new int[count]; - } else { - bulkResult.grow(count); - } - for(int i=0;i= docDeltaLimit) { + docDeltaLimit = docReader.fill(); + } if (!omitTF) { freqIndex.read(docReader, true); freqIndex.seek(freqReader); + freqUpto = freqReader.offset(); + freqLimit = freqReader.end(); + if (freqUpto >= freqLimit) { + freqLimit = freqReader.fill(); + } + //System.out.println(" freqIndex=" + freqIndex + " posIndex=" + posIndex); posIndex.read(docReader, true); + // nocommit -- only store this if storePayloads is true // skip payload offset docReader.readVLong(); } else { freq = 1; } + skipOffset = docReader.readVLong(); + docDeltaUpto = docReader.offset(); + docDeltaLimit = docReader.end(); + docFreq = termState.docFreq; + assert docFreq > 0; count = 0; doc = 0; skipped = false; + //System.out.println(" docFreq=" + docFreq); return this; } + public boolean canReuse(IntIndexInput docsIn) { + return startDocIn == docsIn; + } + @Override public int nextDoc() throws IOException { + //System.out.println(" sep.nextDoc"); while(true) { if (count == docFreq) { return doc = NO_MORE_DOCS; } + assert docDeltaUpto <= docDeltaLimit: "docDeltaUpto=" + docDeltaUpto + " docDeltaLimit=" + docDeltaLimit; + + if (docDeltaUpto == docDeltaLimit) { + // refill + //System.out.println(" fill docs"); + docDeltaLimit = docReader.fill(); + docDeltaUpto = 0; + } + count++; // Decode next doc - doc += docReader.next(); + doc += docDeltaBuffer[docDeltaUpto++]; + //System.out.println(" doc="+ doc + " docDeltaUpto=" + (docDeltaUpto-1) + " skipDocs=" + skipDocs + " deleted?=" + (skipDocs != null && skipDocs.get(doc))); if (!omitTF) { - freq = freqReader.next(); + if (freqUpto == freqLimit) { + // refill + //System.out.println(" fill freqs"); + freqLimit = freqReader.fill(); + freqUpto = 0; + } + + freq = freqBuffer[freqUpto++]; } if (skipDocs == null || !skipDocs.get(doc)) { @@ -303,30 +355,6 @@ } @Override - public int read() throws IOException { - // TODO: -- switch to bulk read api in IntIndexInput - final int[] docs = 
bulkResult.docs.ints; - final int[] freqs = bulkResult.freqs.ints; - int i = 0; - final int length = docs.length; - while (i < length && count < docFreq) { - count++; - // manually inlined call to next() for speed - doc += docReader.next(); - if (!omitTF) { - freq = freqReader.next(); - } - - if (skipDocs == null || !skipDocs.get(doc)) { - docs[i] = doc; - freqs[i] = freq; - i++; - } - } - return i; - } - - @Override public int freq() { return freq; } @@ -338,9 +366,11 @@ @Override public int advance(int target) throws IOException { + //System.out.println("SepDocsEnum.advance target=" + target); // TODO: jump right to next() if target is < X away // from where we are now? + //System.out.println("SepDocsEnum.advance target=" + target); if (docFreq >= skipInterval) { @@ -349,6 +379,7 @@ if (skipper == null) { // This DocsEnum has never done any skipping + //System.out.println(" init skipper"); skipper = new SepSkipListReader((IndexInput) skipIn.clone(), freqIn, docIn, @@ -358,6 +389,7 @@ } if (!skipped) { + //System.out.println(" init skipper2"); // We haven't yet skipped for this posting skipper.init(skipOffset, docIndex, @@ -374,14 +406,25 @@ final int newCount = skipper.skipTo(target); if (newCount > count) { - // Skipper did move if (!omitTF) { skipper.getFreqIndex().seek(freqReader); + freqUpto = freqReader.offset(); + freqLimit = freqReader.end(); + if (freqUpto >= freqLimit) { + freqLimit = freqReader.fill(); + } } skipper.getDocIndex().seek(docReader); + docDeltaUpto = docReader.offset(); + docDeltaLimit = docReader.end(); + if (docDeltaUpto >= docDeltaLimit) { + docDeltaLimit = docReader.fill(); + } + count = newCount; doc = skipper.getDoc(); + //System.out.println(" did move count=" + newCount + " doc=" + doc); } } @@ -401,91 +444,148 @@ int doc; int count; int freq; - long freqStart; private boolean storePayloads; private Bits skipDocs; - private final IntIndexInput.Reader docReader; - private final IntIndexInput.Reader freqReader; - private final IntIndexInput.Reader posReader; - private final IndexInput payloadIn; + private final BulkPostingsEnum.BlockReader docReader; + private final int[] docDeltaBuffer; + private int docDeltaUpto; + private int docDeltaLimit; + private final BulkPostingsEnum.BlockReader freqReader; + private final int[] freqBuffer; + private int freqUpto; + private int freqLimit; + private final BulkPostingsEnum.BlockReader posReader; + private final int[] posBuffer; + private int posUpto; + private int posLimit; private long skipOffset; + private long payloadOffset; + + private final IndexInput payloadIn; private final IntIndexInput.Index docIndex; private final IntIndexInput.Index freqIndex; private final IntIndexInput.Index posIndex; private final IntIndexInput startDocIn; - private long payloadOffset; - private int pendingPosCount; private int position; private int payloadLength; private long pendingPayloadBytes; - - private boolean skipped; - private SepSkipListReader skipper; private boolean payloadPending; private boolean posSeekPending; - SepDocsAndPositionsEnum() throws IOException { + boolean skipped; + SepSkipListReader skipper; + + public SepDocsAndPositionsEnum() throws IOException { startDocIn = docIn; docReader = docIn.reader(); + docDeltaBuffer = docReader.getBuffer(); docIndex = docIn.index(); freqReader = freqIn.reader(); + freqBuffer = freqReader.getBuffer(); freqIndex = freqIn.index(); posReader = posIn.reader(); + posBuffer = posReader.getBuffer(); posIndex = posIn.index(); payloadIn = (IndexInput) 
SepPostingsReaderImpl.this.payloadIn.clone(); } + // nocommit -- somehow we have to prevent re-decode of + // the same block if we have just .next()'d to next term + // in the terms dict -- this is an O(N^2) cost to eg + // TermRangeQuery when it steps through low freq terms!! SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException { this.skipDocs = skipDocs; + //System.out.println("sep d&p init"); + assert !fieldInfo.omitTermFreqAndPositions; storePayloads = fieldInfo.storePayloads; // TODO: can't we only do this if consumer // skipped consuming the previous docs? docIndex.set(termState.docIndex); + // nocommit -- verify, during merge, this seek is + // sometimes w/in block: docIndex.seek(docReader); + docDeltaLimit = docReader.end(); + docDeltaUpto = docReader.offset(); + if (docDeltaUpto >= docDeltaLimit) { + docDeltaLimit = docReader.fill(); + } freqIndex.read(docReader, true); freqIndex.seek(freqReader); + freqLimit = freqReader.end(); + freqUpto = freqReader.offset(); + if (freqUpto >= freqLimit) { + //System.out.println(" re-fill freqs freqMax=" + freqLimit); + freqLimit = freqReader.fill(); + } + //System.out.println(" freqIndex=" + freqIndex); posIndex.read(docReader, true); posSeekPending = true; payloadPending = false; payloadOffset = docReader.readVLong(); + //System.out.println(" payloadOffset=" + payloadOffset); skipOffset = docReader.readVLong(); + //System.out.println(" skipOffset=" + skipOffset); + + docDeltaLimit = docReader.end(); + docDeltaUpto = docReader.offset(); + /* + if (docDeltaUpto >= docDeltaLimit) { + // nocommit -- needed anymore? + docDeltaLimit = docReader.fill(); + docDeltaUpto = 0; + } + */ docFreq = termState.docFreq; + assert docFreq > 0; count = 0; doc = 0; pendingPosCount = 0; pendingPayloadBytes = 0; skipped = false; + //System.out.println(" docUpto=" + docDeltaUpto + " docMax=" + docDeltaLimit + " freqUpto=" + freqUpto + " freqMax=" + freqLimit); + return this; } + public boolean canReuse(IntIndexInput docsIn) { + return startDocIn == docsIn; + } + @Override public int nextDoc() throws IOException { - while(true) { if (count == docFreq) { return doc = NO_MORE_DOCS; } - count++; + if (docDeltaUpto == docDeltaLimit) { + // refill + docDeltaLimit = docReader.fill(); + docDeltaUpto = 0; + } - // TODO: maybe we should do the 1-bit trick for encoding - // freq=1 case? 
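+ // (The removed TODO's "1-bit trick" is what the standard codec does: when
+ // freq == 1 it writes (delta << 1) | 1 as one vint, else delta << 1 followed
+ // by a vint freq -- see the (code & 1) decode in StandardPostingsReader and
+ // PreFlexFields.)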
+ count++; // Decode next doc - doc += docReader.next(); + doc += docDeltaBuffer[docDeltaUpto++]; - freq = freqReader.next(); + if (freqUpto == freqLimit) { + // refill + freqLimit = freqReader.fill(); + freqUpto = 0; + } + freq = freqBuffer[freqUpto++]; pendingPosCount += freq; if (skipDocs == null || !skipDocs.get(doc)) { @@ -525,6 +625,7 @@ docIn, posIn, maxSkipLevels, skipInterval); + } if (!skipped) { @@ -536,7 +637,6 @@ payloadOffset, docFreq, storePayloads); - skipped = true; } @@ -546,13 +646,24 @@ // Skipper did move skipper.getFreqIndex().seek(freqReader); + freqUpto = freqReader.offset(); + freqLimit = freqReader.end(); + if (freqUpto >= freqLimit) { + freqLimit = freqReader.fill(); + } + skipper.getDocIndex().seek(docReader); - //skipper.getPosIndex().seek(posReader); + docDeltaUpto = docReader.offset(); + docDeltaLimit = docReader.end(); + if (docDeltaUpto >= docDeltaLimit) { + docDeltaLimit = docReader.fill(); + } + posIndex.set(skipper.getPosIndex()); posSeekPending = true; count = newCount; doc = skipper.getDoc(); - //payloadIn.seek(skipper.getPayloadPointer()); + payloadOffset = skipper.getPayloadPointer(); pendingPosCount = 0; pendingPayloadBytes = 0; @@ -575,6 +686,11 @@ public int nextPosition() throws IOException { if (posSeekPending) { posIndex.seek(posReader); + posLimit = posReader.end(); + posUpto = posReader.offset(); + if (posUpto >= posLimit) { + posLimit = posReader.fill(); + } payloadIn.seek(payloadOffset); posSeekPending = false; } @@ -582,10 +698,12 @@ // scan over any docs that were iterated without their // positions while (pendingPosCount > freq) { - final int code = posReader.next(); + + final int code = nextPosInt(); + if (storePayloads && (code & 1) != 0) { // Payload length has changed - payloadLength = posReader.next(); + payloadLength = nextPosInt(); assert payloadLength >= 0; } pendingPosCount--; @@ -593,11 +711,12 @@ pendingPayloadBytes += payloadLength; } - final int code = posReader.next(); + final int code = nextPosInt(); + if (storePayloads) { if ((code & 1) != 0) { // Payload length has changed - payloadLength = posReader.next(); + payloadLength = nextPosInt(); assert payloadLength >= 0; } position += code >> 1; @@ -612,6 +731,14 @@ return position; } + private int nextPosInt() throws IOException { + if (posUpto == posLimit) { + posLimit = posReader.fill(); + posUpto = 0; + } + return posBuffer[posUpto++]; + } + private BytesRef payload; @Override @@ -645,4 +772,261 @@ return payloadPending && payloadLength > 0; } } + + class SepBulkPostingsEnum extends BulkPostingsEnum { + private int docFreq; + + private final BulkPostingsEnum.BlockReader docReader; + private final IntIndexInput.Index docIndex; + + private final BulkPostingsEnum.BlockReader freqReader; + private final IntIndexInput.Index freqIndex; + + private final BulkPostingsEnum.BlockReader posReader; + private final IntIndexInput.Index posIndex; + + private final boolean storePayloads; + private final boolean omitTF; + private long skipOffset; + + private final IntIndexInput startDocIn; + + private boolean skipped; + private SepSkipListReader skipper; + + public SepBulkPostingsEnum(FieldInfo fieldInfo, boolean doFreq, boolean doPos) throws IOException { + this.storePayloads = fieldInfo.storePayloads; + this.omitTF = fieldInfo.omitTermFreqAndPositions; + startDocIn = docIn; + docReader = docIn.reader(); + docIndex = docIn.index(); + + if (doFreq && !omitTF) { + freqReader = freqIn.reader(); + } else { + freqReader = null; + } + + if (doPos && !omitTF) { + if (storePayloads) { + // Must 
rewrite each posDelta: + posReader = new PosPayloadReader(posIn.reader()); + } else { + // Pass through + posReader = posIn.reader(); + } + } else { + posReader = null; + } + + if (!omitTF) { + // we have to pull these even if doFreq is false + // just so we can decode the index from the docs + // file + freqIndex = freqIn.index(); + posIndex = posIn.index(); + } else { + posIndex = null; + freqIndex = null; + } + } + + public boolean canReuse(FieldInfo fieldInfo, IntIndexInput docIn, boolean doFreq, boolean doPos) { + return fieldInfo.storePayloads == storePayloads && + startDocIn == docIn && + (freqReader != null || !doFreq) && + (posReader != null || !doPos); + } + + // nocommit -- make sure this is tested!! + + // Only used when payloads were stored -- we cannot do + // pass-through read for this since the payload lengths + // are also encoded into the position deltas + private final class PosPayloadReader extends BulkPostingsEnum.BlockReader { + final BulkPostingsEnum.BlockReader other; + private int pendingOffset; + private int limit; + private boolean skipNext; + + public PosPayloadReader(BulkPostingsEnum.BlockReader other) { + this.other = other; + } + + void doAfterSeek() {} + + @Override + public int[] getBuffer() { + return other.getBuffer(); + } + + // nocommit -- make sure this works correctly in the + // "reuse"/seek case + @Override + public int offset() { + pendingOffset = other.offset(); + return 0; + } + + @Override + public void setOffset(int offset) { + throw new UnsupportedOperationException(); + } + + @Override + public int fill() throws IOException { + // Translate code back to pos deltas, and filter out + // any changes in payload length. NOTE: this is a + // perf hit on indices that encode payloads, even if + // they use "normal" positional queries + final int otherLimit = other.fill(); + limit = 0; + final int[] buffer = other.getBuffer(); + for(int i=pendingOffset;i>> 1; + if ((code & 1) != 0) { + // skip the payload length + skipNext = true; + } + } + } + pendingOffset = 0; + + return limit; + } + + @Override + public int end() { + return limit; + } + } + + /** Position readers to the specified term */ + SepBulkPostingsEnum init(SepTermState termState) throws IOException { + + // nocommit -- make sure seek w/in buffer is efficient + // here: + + // TODO: can't we only do this if consumer + // skipped consuming the previous docs? 
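+ // A possible shape for that optimization (hypothetical, not implemented
+ // here): cache the position each reader was left at by the previous term and
+ // skip the seek when the new term's docIndex starts exactly there, so that
+ // stepping through adjacent low-freq terms (e.g. TermRangeQuery, merging)
+ // avoids re-decoding the same block.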
+ docIndex.set(termState.docIndex); + docIndex.seek(docReader); + //System.out.println("sep init offset=" + docReader.offset() + " limit=" + docReader.end() + " omitTF=" + omitTF); + //System.out.println(" v[0]=" + docReader.getBuffer()[0]); + + if (!omitTF) { + freqIndex.read(docReader, true); + if (freqReader != null) { + freqIndex.seek(freqReader); + } + posIndex.read(docReader, true); + // skip payload offset -- nocommit only store this + // if field has payloads + docReader.readVLong(); + } + + skipOffset = docReader.readVLong(); + //System.out.println("skipOffset=" + skipOffset); + + if (posReader != null) { + if (storePayloads) { + PosPayloadReader posPayloadReader = (PosPayloadReader) posReader; + posIndex.seek(posPayloadReader.other); + posPayloadReader.doAfterSeek(); + } else { + posIndex.seek(posReader); + } + } + + if (docReader.offset() >= docReader.end()) { + docReader.fill(); + docReader.setOffset(0); + } + + docFreq = termState.docFreq; + skipped = false; + + return this; + } + + @Override + public BulkPostingsEnum.BlockReader getDocDeltasReader() { + // Maximize perf -- just pass through the underlying + // intblock reader: + return docReader; + } + + @Override + public BulkPostingsEnum.BlockReader getFreqsReader() { + // Maximize perf -- just pass through the underlying + // intblock reader: + return freqReader; + } + + @Override + public BulkPostingsEnum.BlockReader getPositionDeltasReader() { + // Maximize perf -- just pass through the underlying + // intblock reader (if payloads were not indexed): + return posReader; + } + + private final JumpResult jumpResult = new JumpResult(); + + @Override + public JumpResult jump(int target, int curCount) throws IOException { + + if (docFreq >= skipInterval) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This enum has never done any skipping + skipper = new SepSkipListReader((IndexInput) skipIn.clone(), + freqIn, + docIn, + posIn, + maxSkipLevels, skipInterval); + } + + if (!skipped) { + // We haven't yet skipped for this particular posting + skipper.init(skipOffset, + docIndex, + freqIndex, + posIndex, + 0, + docFreq, + storePayloads); + skipper.setOmitTF(omitTF); + skipped = true; + } + + final int newCount = skipper.skipTo(target); + //System.out.println(" sep skip newCount=" + newCount + " vs count=" + curCount); + + if (newCount > curCount) { + + // Skipper did move -- seek all readers: + skipper.getDocIndex().seek(docReader); + + if (freqReader != null) { + skipper.getFreqIndex().seek(freqReader); + } + if (posReader != null) { + skipper.getPosIndex().seek(posReader); + } + + jumpResult.count = newCount; + jumpResult.docID = skipper.getDoc(); + return jumpResult; + } + } + return null; + } + } } Index: src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java =================================================================== --- src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (working copy) @@ -181,6 +181,7 @@ posIndex.write(docOut, true); docOut.writeVLong(payloadStart); } + // nocommit -- only write if docFreq > skipInterval? 
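+ // Note: any change here must be mirrored in the readers, which currently do
+ // skipOffset = docReader.readVLong() unconditionally (SepDocsEnum,
+ // SepDocsAndPositionsEnum, SepBulkPostingsEnum); otherwise the doc stream
+ // desynchronizes.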
docOut.writeVLong(skipOut.getFilePointer()); firstDoc = false; } @@ -199,6 +200,7 @@ } lastDocID = docID; + //System.out.println("sepw: write docID=" + docID); docOut.write(delta); if (!omitTF) { freqOut.write(termDocFreq); Index: src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java =================================================================== --- src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java (working copy) @@ -108,10 +108,10 @@ for(int i=0;i getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @@ -508,6 +520,189 @@ } } + private class SimpleTextBulkPostingsEnum extends BulkPostingsEnum { + private final IndexInput inStart; + private final IndexInput in; + private final LineCountReader docDeltasReader; + private final FreqsReader freqsReader; + private final LineCountReader positionDeltasReader; + + public SimpleTextBulkPostingsEnum(boolean doFreq, boolean doPositions) { + this.inStart = SimpleTextFieldsReader.this.in; + this.in = (IndexInput) this.inStart.clone(); + docDeltasReader = new LineCountReader(DOC); + if (doFreq) { + freqsReader = new FreqsReader(); + } else { + freqsReader = null; + } + + if (doPositions) { + positionDeltasReader = new LineCountReader(POS); + } else { + positionDeltasReader = null; + } + } + + public boolean canReuse(IndexInput in, boolean doFreq, boolean doPositions) { + return in == inStart && (doFreq == (freqsReader != null)) && (doPositions == (positionDeltasReader != null)); + } + + // reads docDeltas & positionDeltas + private class LineCountReader extends BlockReader { + private final BytesRef prefix; + private final int[] buffer = new int[64]; + private final IndexInput in; + private final BytesRef scratch = new BytesRef(10); + private int lastValue; + private int limit; + + public LineCountReader(BytesRef prefix) { + this.prefix = prefix; + this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); + } + + public void reset(long fp) throws IOException { + lastValue = 0; + in.seek(fp); + fill(); + } + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + assert offset == 0; + } + + @Override + public int end() { + return limit; + } + + @Override + public int fill() throws IOException { + int upto = 0; + while(upto < buffer.length) { + readLine(in, scratch); + if (scratch.startsWith(TERM) || scratch.startsWith(FIELD) || scratch.equals(END)) { + break; + } else if (scratch.startsWith(prefix)) { + final int value = Integer.parseInt(new String(scratch.bytes, scratch.offset+prefix.length, scratch.length-prefix.length)); + buffer[upto++] = value - lastValue; + lastValue = value; + } + } + return limit = upto; + } + } + + private class FreqsReader extends BlockReader { + private final int[] buffer = new int[64]; + private final IndexInput in; + private final BytesRef scratch = new BytesRef(10); + private int limit; + private boolean omitTF; + + public FreqsReader() { + this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); + } + + public void reset(long fp, boolean omitTF) throws IOException { + in.seek(fp); + this.omitTF = omitTF; + fill(); + } + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + assert offset == 0; + } + + @Override + public int end() { + return limit; 
+ } + + @Override + public int fill() throws IOException { + int upto = 0; + int freq = -1; + long lastFP = in.getFilePointer(); + while(upto < buffer.length) { + lastFP = in.getFilePointer(); + readLine(in, scratch); + if (scratch.startsWith(TERM) || scratch.startsWith(FIELD) || scratch.equals(END)) { + if (freq != -1) { + buffer[upto++] = omitTF ? 1 : freq; + } + break; + } else if (scratch.startsWith(DOC)) { + if (freq != -1) { + buffer[upto++] = omitTF ? 1: freq; + } + freq = 0; + } else if (scratch.startsWith(POS)) { + freq++; + } + } + in.seek(lastFP); + return limit = upto; + } + } + + public SimpleTextBulkPostingsEnum reset(long fp, boolean omitTF) throws IOException { + + docDeltasReader.reset(fp); + + if (freqsReader != null) { + freqsReader.reset(fp, omitTF); + } + if (positionDeltasReader != null) { + positionDeltasReader.reset(fp); + } + return this; + } + + @Override + public BlockReader getDocDeltasReader() { + return docDeltasReader; + } + + @Override + public BlockReader getPositionDeltasReader() { + return positionDeltasReader; + } + + @Override + public BlockReader getFreqsReader() { + return freqsReader; + } + + @Override + public JumpResult jump(int target, int curCount) { + return null; + } + } + private class SimpleTextTerms extends Terms { private final String field; private final long termsStart; Index: src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java =================================================================== --- src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.codecs.PostingsReaderBase; @@ -171,6 +172,17 @@ } @Override + public BulkPostingsEnum bulkPostings(FieldInfo fieldInfo, TermState termState, BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + SegmentBulkPostingsEnum postingsEnum; + if (reuse == null || !(reuse instanceof SegmentBulkPostingsEnum) || !((SegmentBulkPostingsEnum) reuse).canReuse(fieldInfo, freqIn, doFreqs, doPositions)) { + postingsEnum = new SegmentBulkPostingsEnum(fieldInfo.omitTermFreqAndPositions, doFreqs, doPositions); + } else { + postingsEnum = (SegmentBulkPostingsEnum) reuse; + } + return postingsEnum.reset(fieldInfo, (DocTermState) termState); + } + + @Override public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { if (fieldInfo.omitTermFreqAndPositions) { return null; @@ -249,6 +261,7 @@ // cases freqIn.seek(termState.freqOffset); limit = termState.docFreq; + assert limit > 0; ord = 0; doc = 0; @@ -283,7 +296,6 @@ break; } } - return doc; } @@ -359,13 +371,14 @@ } final int newOrd = skipper.skipTo(target); - + //System.out.println(" newOrd=" + newOrd + " vs ord=" + ord); + if (newOrd > ord) { // Skipper moved - ord = newOrd; doc = skipper.getDoc(); freqIn.seek(skipper.getFreqPointer()); + //System.out.println(" go lastDoc=" + doc); } } @@ -421,6 +434,7 @@ lazyProxPointer = termState.proxOffset; limit = termState.docFreq; + assert limit > 0; ord = 0; doc = 0; position = 0; @@ -797,4 +811,328 @@ return 
payloadPending && payloadLength > 0;
     }
   }
+
+  static final int BULK_BUFFER_SIZE = 64;
+
+  // Bulk postings API
+  private final class SegmentBulkPostingsEnum extends BulkPostingsEnum {
+    private final IndexInput freqIn;
+    private final IndexInput proxIn;
+
+    final IndexInput startFreqIn;
+    private final boolean omitTF;
+
+    boolean storePayloads;   // does current field store payloads?
+
+    int ord;                 // how many docs we've read
+    int docFreq;
+
+    long freqOffset;
+    long proxOffset;
+    int skipOffset;
+
+    boolean skipped;
+    DefaultSkipListReader skipper;
+    private int payloadLength;
+
+    private final DocDeltasReader docDeltasReader;
+    private final FreqsReader freqsReader;
+    private final PositionsReader positionDeltasReader;
+
+    private boolean docsPending, freqsPending;
+
+    public SegmentBulkPostingsEnum(boolean omitTF, boolean doFreqs, boolean doPositions) throws IOException {
+      //System.out.println("bulk init");
+      startFreqIn = StandardPostingsReader.this.freqIn;
+      this.freqIn = (IndexInput) StandardPostingsReader.this.freqIn.clone();
+      this.omitTF = omitTF;
+
+      docDeltasReader = new DocDeltasReader();
+      if (doFreqs && !omitTF) {
+        freqsReader = new FreqsReader();
+      } else {
+        freqsReader = null;
+      }
+
+      if (doPositions && !omitTF) {
+        this.proxIn = (IndexInput) StandardPostingsReader.this.proxIn.clone();
+        positionDeltasReader = new PositionsReader();
+      } else {
+        this.proxIn = null;
+        positionDeltasReader = null;
+      }
+    }
+
+    public boolean canReuse(FieldInfo fieldInfo, IndexInput freqIn, boolean doFreqs, boolean doPositions) {
+      return freqIn == startFreqIn &&
+        (!doFreqs || freqsReader != null) &&
+        (!doPositions || positionDeltasReader != null) &&
+        (omitTF == fieldInfo.omitTermFreqAndPositions);
+    }
+
+    final void read() throws IOException {
+      if (freqsReader == null) {
+        // Consumer only wants doc deltas
+        assert !docsPending;
+        if (omitTF) {
+          // Index only stores doc deltas
+          for(int i=0;i<BULK_BUFFER_SIZE;i++) {
+            docDeltasReader.buffer[i] = freqIn.readVInt();
+          }
+        } else {
+          // Index interleaves freqs: decode and discard them
+          for(int i=0;i<BULK_BUFFER_SIZE;i++) {
+            final int code = freqIn.readVInt();
+            docDeltasReader.buffer[i] = code >>> 1;
+            if ((code & 1) == 0) {
+              freqIn.readVInt();
+            }
+          }
+        }
+        ord += BULK_BUFFER_SIZE;
+        docsPending = true;
+      } else {
+        // Consumer wants both
+        assert !docsPending;
+        assert !freqsPending;
+        for(int i=0;i<BULK_BUFFER_SIZE;i++) {
+          final int code = freqIn.readVInt();
+          docDeltasReader.buffer[i] = code >>> 1;
+          if ((code & 1) == 0) {
+            freqsReader.buffer[i] = freqIn.readVInt();
+          } else {
+            freqsReader.buffer[i] = 1;
+          }
+        }
+        ord += BULK_BUFFER_SIZE;
+        docsPending = true;
+        freqsPending = true;
+      }
+    }
+
+    private class DocDeltasReader extends BulkPostingsEnum.BlockReader {
+      final int[] buffer = new int[BULK_BUFFER_SIZE];
+      int limit;
+      int offset;
+
+      @Override
+      public int[] getBuffer() {
+        return buffer;
+      }
+
+      @Override
+      public int end() {
+        return limit;
+      }
+
+      @Override
+      public int fill() throws IOException {
+        if (!docsPending) {
+          read();
+        }
+        docsPending = false;
+        limit = BULK_BUFFER_SIZE;
+        offset = 0;
+        //System.out.println("spr: doc deltas read limit=" + limit);
+        return BULK_BUFFER_SIZE;
+      }
+
+      @Override
+      public int offset() {
+        return offset;
+      }
+
+      @Override
+      public void setOffset(int offset) {
+        this.offset = offset;
+      }
+    }
+
+    private class FreqsReader extends BulkPostingsEnum.BlockReader {
+      final int[] buffer = new int[BULK_BUFFER_SIZE];
+      int limit;
+
+      @Override
+      public int[] getBuffer() {
+        return buffer;
+      }
+
+      @Override
+      public int end() {
+        return limit;
+      }
+
+      @Override
+      public int fill() throws IOException {
+        if (!freqsPending) {
+          read();
+        }
+        freqsPending = false;
+        limit = BULK_BUFFER_SIZE;
+        return BULK_BUFFER_SIZE;
+      }
+
+      @Override
+      public int offset() {
+        return 0;
+      }
+
+      @Override
+      public void setOffset(int offset) {
+        throw new
UnsupportedOperationException(); + } + } + + private class PositionsReader extends BulkPostingsEnum.BlockReader { + final int[] buffer = new int[BULK_BUFFER_SIZE]; + int limit; + + @Override + public int[] getBuffer() { + return buffer; + } + + @Override + public int end() { + return limit; + } + + @Override + public int fill() throws IOException { + // nocommit -- must flush prx file w/ extra 127 0 + // positions -- index change!! + if (storePayloads) { + for(int i=0;i>> 1; + if ((code & 1) != 0) { + payloadLength = proxIn.readVInt(); + } + if (payloadLength != 0) { + // skip payload + proxIn.seek(proxIn.getFilePointer()+payloadLength); + } + } + } else { + for(int i=0;i 0; + + ord = 0; + skipped = false; + + return this; + } + + private final JumpResult jumpResult = new JumpResult(); + + @Override + public JumpResult jump(int target, int curCount) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? + + if (skipOffset > 0) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped since reset() was called, so now we + // load the skip data for this posting + skipper.init(freqOffset + skipOffset, + freqOffset, proxOffset, + docFreq, storePayloads); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + // nocommit rename ord -> count + assert curCount == ord: "curCount=" + curCount + " ord=" + ord; + + if (newOrd > ord) { + // Skipper moved + //System.out.println("newOrd=" + newOrd + " vs ord=" + ord + " doc=" + skipper.getDoc()); + + freqIn.seek(skipper.getFreqPointer()); + docDeltasReader.limit = 0; + + if (freqsReader != null) { + freqsReader.limit = 0; + } + + if (positionDeltasReader != null) { + positionDeltasReader.limit = 0; + proxIn.seek(skipper.getProxPointer()); + } + + jumpResult.count = ord = newOrd; + jumpResult.docID = skipper.getDoc(); + + return jumpResult; + } + } + + // no jump occurred + return null; + } + } } Index: src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java =================================================================== --- src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java (revision 1044119) +++ src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java (working copy) @@ -224,6 +224,16 @@ @Override public void close() throws IOException { + + // Readers read whole blocks at once, so we have to + // flush final block out w/ unused values: + for(int i=0;i= minNrShouldMatch) { // TODO: re-enable this if BQ ever sends us required clauses - // (current.bits & requiredMask) == requiredMask && + // (current.bits & requiredMask) == requiredMask + // && + //System.out.println(" BS.nextDoc return doc=" + current.doc); return doc = current.doc; } } @@ -327,6 +342,7 @@ } } while (bucketTable.first != null || more); + //System.out.println(" bs done nextDoc"); return doc = NO_MORE_DOCS; } @@ -351,7 +367,7 @@ buffer.append(")"); return buffer.toString(); } - + @Override protected void visitSubScorers(Query parent, Occur relationship, ScorerVisitor visitor) { super.visitSubScorers(parent, relationship, visitor); Index: src/java/org/apache/lucene/search/ConstantScoreQuery.java 
=================================================================== --- src/java/org/apache/lucene/search/ConstantScoreQuery.java (revision 1044119) +++ src/java/org/apache/lucene/search/ConstantScoreQuery.java (working copy) @@ -142,6 +142,11 @@ public int nextDoc() throws IOException { return docIdSetIterator.nextDoc(); } + + @Override + public String toString() { + return "ConstantScorer(" + filter + ")"; + } @Override public int docID() { Index: src/java/org/apache/lucene/search/FieldCacheRangeFilter.java =================================================================== --- src/java/org/apache/lucene/search/FieldCacheRangeFilter.java (revision 1044119) +++ src/java/org/apache/lucene/search/FieldCacheRangeFilter.java (working copy) @@ -128,6 +128,7 @@ @Override final boolean matchDoc(int doc) { final int docOrd = fcsi.getOrd(doc); + //System.out.println(" doc=" + doc + " matches?=" + (docOrd >= inclusiveLowerPoint && docOrd <= inclusiveUpperPoint)); return docOrd >= inclusiveLowerPoint && docOrd <= inclusiveUpperPoint; } }; Index: src/java/org/apache/lucene/search/FilteredTermsEnum.java =================================================================== --- src/java/org/apache/lucene/search/FilteredTermsEnum.java (revision 1044119) +++ src/java/org/apache/lucene/search/FilteredTermsEnum.java (working copy) @@ -23,6 +23,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; @@ -150,6 +151,12 @@ public DocsEnum docs(Bits bits, DocsEnum reuse) throws IOException { return tenum.docs(bits, reuse); } + + @Override + public BulkPostingsEnum bulkPostings(BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + assert tenum != null; + return tenum.bulkPostings(reuse, doFreqs, doPositions); + } @Override public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse) throws IOException { Index: src/java/org/apache/lucene/search/IndexSearcher.java =================================================================== --- src/java/org/apache/lucene/search/IndexSearcher.java (revision 1044119) +++ src/java/org/apache/lucene/search/IndexSearcher.java (working copy) @@ -226,6 +226,7 @@ final Filter filter, final Collector collector) throws IOException { assert filter != null; + //System.out.println("is.searchWithFilter"); Scorer scorer = weight.scorer(reader, true, false); if (scorer == null) { @@ -252,6 +253,7 @@ collector.setScorer(scorer); while (true) { + //System.out.println(" cycle sDoc=" + scorerDoc + " fDoc=" + filterDoc); if (scorerDoc == filterDoc) { // Check if scorer has exhausted, only before collecting. 
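            // NOTE: when scorerDoc == filterDoc we collect (below); a rough
            // sketch of the leapfrog in the remainder of this loop, where
            // filterIter is the filter's DocIdSetIterator:
            //
            //   } else if (scorerDoc > filterDoc) {
            //     filterDoc = filterIter.advance(scorerDoc);
            //   } else {
            //     scorerDoc = scorer.advance(filterDoc);
            //   }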
if (scorerDoc == DocIdSetIterator.NO_MORE_DOCS) { Index: src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java =================================================================== --- src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (revision 1044119) +++ src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (working copy) @@ -23,7 +23,7 @@ import org.apache.lucene.index.Fields; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.Bits; @@ -120,31 +120,43 @@ final TermsEnum termsEnum = query.getTermsEnum(terms); assert termsEnum != null; + //System.out.println("\nmtqwf.getDocIdSet r=" + reader); if (termsEnum.next() != null) { // fill into a OpenBitSet final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); int termCount = 0; final Bits delDocs = MultiFields.getDeletedDocs(reader); - DocsEnum docsEnum = null; + BulkPostingsEnum postingsEnum = null; do { termCount++; - // System.out.println(" iter termCount=" + termCount + " term=" + - // enumerator.term().toBytesString()); - docsEnum = termsEnum.docs(delDocs, docsEnum); - final DocsEnum.BulkReadResult result = docsEnum.getBulkResult(); - while (true) { - final int count = docsEnum.read(); - if (count != 0) { - final int[] docs = result.docs.ints; - for (int i = 0; i < count; i++) { - bitSet.set(docs[i]); - } - } else { - break; + postingsEnum = termsEnum.bulkPostings(postingsEnum, false, false); + final int docFreq = termsEnum.docFreq(); + //System.out.println(" iter termCount=" + termCount + " term=" + termsEnum.term().utf8ToString() + " df=" + docFreq); + final BulkPostingsEnum.BlockReader docDeltasReader = postingsEnum.getDocDeltasReader(); + final int[] docDeltas = docDeltasReader.getBuffer(); + int offset = docDeltasReader.offset(); + int limit = docDeltasReader.end(); + if (offset >= limit) { + limit = docDeltasReader.fill(); + } + //System.out.println(" start offset=" + offset + " limit=" + limit); + int count = 0; + int doc = 0; + while (count < docFreq) { + if (offset >= limit) { + offset = 0; + limit = docDeltasReader.fill(); + //System.out.println(" fill limit=" + limit); + } + doc += docDeltas[offset++]; + count++; + if (delDocs == null || !delDocs.get(doc)) { + bitSet.set(doc); } } + //System.out.println(" end offset=" + offset); } while (termsEnum.next() != null); - // System.out.println(" done termCount=" + termCount); + //System.out.println(" done termCount=" + termCount); query.incTotalNumberOfTerms(termCount); return bitSet; Index: src/java/org/apache/lucene/search/TermQuery.java =================================================================== --- src/java/org/apache/lucene/search/TermQuery.java (revision 1044119) +++ src/java/org/apache/lucene/search/TermQuery.java (working copy) @@ -21,6 +21,7 @@ import java.util.Set; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.BulkPostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation.IDFExplanation; @@ -76,15 +77,26 @@ @Override public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { - DocsEnum docs = reader.termDocsEnum(reader.getDeletedDocs(), - term.field(), - term.bytes()); - + assert reader.getSequentialSubReaders() == null; + 
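// NOTE: unlike DocsEnum, the raw bulk enum neither filters deleted
+    // docs nor bounds iteration by docFreq; both are enforced by
+    // TermScorer below via the skipDocs and docFreq ctor args.
+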
//System.out.println("TQ: make bulk postings"); + BulkPostingsEnum docs = reader.bulkTermPostingsEnum(term.field(), + term.bytes(), + true, + false); + //System.out.println("bulk enum " + docs + " reader=" + reader); if (docs == null) { return null; } - return new TermScorer(this, docs, similarity, reader.norms(term.field())); + //System.out.println("R=" + reader + " df=" + reader.docFreq(term.field(), term.bytes())); + + TermScorer ts = new TermScorer(this, docs, + reader.docFreq(term.field(), term.bytes()), + reader.getDeletedDocs(), similarity, reader.norms(term.field())); + // nocommit + ts.term = term; + ts.maxDoc = reader.maxDoc(); + return ts; } @Override @@ -124,10 +136,10 @@ int tf = 0; DocsEnum docs = reader.termDocsEnum(reader.getDeletedDocs(), term.field(), term.bytes()); if (docs != null) { - int newDoc = docs.advance(doc); - if (newDoc == doc) { - tf = docs.freq(); - } + int newDoc = docs.advance(doc); + if (newDoc == doc) { + tf = docs.freq(); + } tfExplanation.setValue(similarity.tf(tf)); tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")"); } else { Index: src/java/org/apache/lucene/search/TermScorer.java =================================================================== --- src/java/org/apache/lucene/search/TermScorer.java (revision 1044119) +++ src/java/org/apache/lucene/search/TermScorer.java (working copy) @@ -19,26 +19,38 @@ import java.io.IOException; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.index.BulkPostingsEnum; +import org.apache.lucene.util.Bits; +import org.apache.lucene.index.Term; /** Expert: A Scorer for documents matching a Term. */ final class TermScorer extends Scorer { - private DocsEnum docsEnum; + private BulkPostingsEnum docsEnum; private byte[] norms; private float weightValue; - private int doc = -1; - private int freq; + private int doc; + + private final int[] docDeltas; + private int docPointer; + private int docPointerMax; + private boolean first = true; - private int pointer; - private int pointerMax; + private final int[] freqs; + private int freqPointer; + private int freqPointerMax; private static final int SCORE_CACHE_SIZE = 32; private float[] scoreCache = new float[SCORE_CACHE_SIZE]; - private int[] docs; - private int[] freqs; - private final DocsEnum.BulkReadResult bulkResult; + private final BulkPostingsEnum.BlockReader freqsReader; + private final BulkPostingsEnum.BlockReader docDeltasReader; + private final Bits skipDocs; + private final int docFreq; + private int count; + + // nocommit + public Term term; + public int maxDoc; /** * Construct a TermScorer. @@ -53,13 +65,38 @@ * @param norms * The field norms of the document fields for the Term. 
*/ - TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { + TermScorer(Weight weight, BulkPostingsEnum td, int docFreq, Bits skipDocs, Similarity similarity, byte[] norms) throws IOException { super(similarity, weight); this.docsEnum = td; + this.docFreq = docFreq; + docDeltasReader = td.getDocDeltasReader(); + docDeltas = docDeltasReader.getBuffer(); + docPointerMax = docDeltasReader.end(); + docPointer = docDeltasReader.offset(); + if (docPointer >= docPointerMax) { + docPointerMax = docDeltasReader.fill(); + } + docPointer--; + + freqsReader = td.getFreqsReader(); + if (freqsReader != null) { + freqs = freqsReader.getBuffer(); + freqPointerMax = freqsReader.end(); + freqPointer = freqsReader.offset(); + if (freqPointer >= freqPointerMax) { + freqPointerMax = freqsReader.fill(); + } + freqPointer--; + } else { + freqs = null; + } + + //System.out.println("make new TS dp=" + docPointer + " dpMax=" + docPointerMax + " td=" + td + " freqP=" + freqPointer + " freqPMax=" + freqPointerMax + " this=" + this); + + this.skipDocs = skipDocs; this.norms = norms; this.weightValue = weight.getValue(); - bulkResult = td.getBulkResult(); for (int i = 0; i < SCORE_CACHE_SIZE; i++) scoreCache[i] = getSimilarity().tf(i) * weightValue; @@ -70,41 +107,74 @@ score(c, Integer.MAX_VALUE, nextDoc()); } - private final void refillBuffer() throws IOException { - pointerMax = docsEnum.read(); // refill - docs = bulkResult.docs.ints; - freqs = bulkResult.freqs.ints; - } - // firstDocID is ignored since nextDoc() sets 'doc' @Override protected boolean score(Collector c, int end, int firstDocID) throws IOException { c.setScorer(this); + //System.out.println("ts.collect firstdocID=" + firstDocID + " term=" + term + " end=" + end + " doc=" + doc); + // nocommit -- this can leave scorer on a deleted doc... while (doc < end) { // for docs in window - c.collect(doc); // collect score - if (++pointer >= pointerMax) { - refillBuffer(); - if (pointerMax != 0) { - pointer = 0; - } else { - doc = NO_MORE_DOCS; // set to sentinel value - return false; + if (skipDocs == null || !skipDocs.get(doc)) { + //System.out.println("ts.collect doc=" + doc + " skipDocs=" + skipDocs + " count=" + count + " vs dF=" + docFreq); + c.collect(doc); // collect + } + if (count == docFreq) { + doc = NO_MORE_DOCS; + return false; + } + count++; + docPointer++; + + //System.out.println("dp=" + docPointer + " dpMax=" + docPointerMax + " count=" + count + " countMax=" + docFreq); + + if (docPointer >= docPointerMax) { + docPointerMax = docDeltasReader.fill(); + //System.out.println(" refill! dpMax=" + docPointerMax + " reader=" + docDeltasReader); + assert docPointerMax != 0; + docPointer = 0; + + if (freqsReader != null) { + freqPointer++; + // NOTE: this code is intentionally dup'd + // (specialized) w/ the else clause, for better CPU + // branch prediction (assuming compiler doesn't + // de-dup): for codecs that always bulk read same + // number of docDeltas & freqs (standard, for, + // pfor), this if will always be true. 
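+      // (Concretely: in StandardPostingsReader, DocDeltasReader.fill()
+      // and FreqsReader.fill() both always return BULK_BUFFER_SIZE, so
+      // the two readers stay in lock step.)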
Other codecs + // (simple9/16) will not be aligned: + if (freqPointer >= freqPointerMax) { + freqPointerMax = freqsReader.fill(); + assert freqPointerMax != 0; + freqPointer = 0; + } } - } - doc = docs[pointer]; - freq = freqs[pointer]; + } else if (freqsReader != null) { + freqPointer++; + if (freqPointer >= freqPointerMax) { + freqPointerMax = freqsReader.fill(); + assert freqPointerMax != 0; + freqPointer = 0; + } + } + + doc += docDeltas[docPointer]; + assert doc < maxDoc: "doc=" + doc + " maxDoc=" + maxDoc; } return true; } @Override public int docID() { - return doc; + return first ? -1 : doc; } @Override public float freq() { - return freq; + if (freqsReader != null) { + return freqs[freqPointer]; + } else { + return 1.0f; + } } /** @@ -116,30 +186,76 @@ */ @Override public int nextDoc() throws IOException { - pointer++; - if (pointer >= pointerMax) { - refillBuffer(); - if (pointerMax != 0) { - pointer = 0; + //System.out.println("ts.nextDoc " + this + " count=" + count + " vs docFreq=" + docFreq); + while(count < docFreq) { + docPointer++; + if (docPointer >= docPointerMax) { + //System.out.println("ts.nd refill docs"); + docPointerMax = docDeltasReader.fill(); + assert docPointerMax != 0; + docPointer = 0; + if (freqsReader != null) { + // NOTE: this code is intentionally dup'd + // (specialized) w/ the else clause, for better CPU + // branch prediction (assuming compiler doesn't + // de-dup): for codecs that always bulk read same + // number of docDeltas & freqs (standard, for, + // pfor), this if will always be true. Other codecs + // (simple9/16) will not be aligned: + freqPointer++; + if (freqPointer >= freqPointerMax) { + //System.out.println("ts.nd refill freqs"); + freqPointerMax = freqsReader.fill(); + assert freqPointerMax != 0; + freqPointer = 0; + } + } } else { - return doc = NO_MORE_DOCS; + if (freqsReader != null) { + freqPointer++; + if (freqPointer >= freqPointerMax) { + //System.out.println("ts.nd refill freqs"); + freqPointerMax = freqsReader.fill(); + assert freqPointerMax != 0; + freqPointer = 0; + } + } } - } - doc = docs[pointer]; - freq = freqs[pointer]; - assert doc != NO_MORE_DOCS; - return doc; + count++; + doc += docDeltas[docPointer]; + assert doc < maxDoc; + first = false; + assert doc >= 0 && (skipDocs == null || doc < skipDocs.length()) && doc != NO_MORE_DOCS: "doc=" + doc + " skipDocs=" + skipDocs + " skipDocs.length=" + (skipDocs==null? "n/a" : skipDocs.length()); + if (skipDocs == null || !skipDocs.get(doc)) { + //System.out.println(" ret doc=" + doc + " freq=" + freq()); + return doc; + } + } + + //System.out.println(" end"); + return doc = NO_MORE_DOCS; } @Override public float score() { + assert !first; + final int freq; + if (freqsReader == null) { + freq = 1; + } else { + freq = freqs[freqPointer]; + } + assert freq > 0; assert doc != NO_MORE_DOCS; float raw = // compute tf(f)*weight freq < SCORE_CACHE_SIZE // check cache ? scoreCache[freq] // cache hit : getSimilarity().tf(freq)*weightValue; // cache miss - return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[doc]); // normalize for field + // nocommit + float v = norms == null ? 
raw : raw * getSimilarity().decodeNormValue(norms[doc]); // normalize for field + //System.out.println("TS.score " + this + ": v=" + v + " freq=" + freq); + return v; } /** @@ -153,28 +269,120 @@ */ @Override public int advance(int target) throws IOException { - // first scan in cache - for (pointer++; pointer < pointerMax; pointer++) { - if (docs[pointer] >= target) { - freq = freqs[pointer]; - return doc = docs[pointer]; + + // nocommit: should we, here, optimize .advance(target that isn't + // too far away) into scan? seems like simple win? + + //System.out.println("ts.advance " + this + " target=" + target + " ct=" + count + " vs df=" + docFreq + " dp=" + docPointer + " dpMax=" + docPointerMax + " id=" + System.identityHashCode(this) + " first=" + first); + + // first scan current doc deltas block + for (docPointer++; docPointer < docPointerMax && count < docFreq; docPointer++) { + assert first || docDeltas[docPointer] > 0; + doc += docDeltas[docPointer]; + assert doc < maxDoc; + first = false; + //System.out.println(" scan doc=" + doc); + count++; + if (freqsReader != null && ++freqPointer >= freqPointerMax) { + //System.out.println(" refill freqs"); + freqPointerMax = freqsReader.fill(); + assert freqPointerMax != 0; + freqPointer = 0; + } + if (doc >= target && (skipDocs == null || !skipDocs.get(doc))) { + //System.out.println(" ret0 doc=" + doc + " count=" + count + " freq=" + freqs[freqPointer]); + return doc; } } - // not found in readahead cache, seek underlying stream - int newDoc = docsEnum.advance(target); - //System.out.println("ts.advance docsEnum=" + docsEnum); - if (newDoc != NO_MORE_DOCS) { - doc = newDoc; - freq = docsEnum.freq(); + if (count == docFreq) { + return doc = NO_MORE_DOCS; + } + + // not found in current block, seek underlying stream + BulkPostingsEnum.JumpResult jumpResult = docsEnum.jump(target, count); + //System.out.println(" jumpResult=" + jumpResult); + if (jumpResult != null) { + //System.out.println(" jump count=" + jumpResult.count + " jump docID=" + jumpResult.docID); + count = jumpResult.count; + doc = jumpResult.docID; + first = false; + docPointer = docDeltasReader.offset(); + docPointerMax = docDeltasReader.end(); + if (docPointer >= docPointerMax) { + docPointerMax = docDeltasReader.fill(); + //System.out.println(" re-fill docs dpMax=" + docPointerMax + " dd[0]=" + docDeltas[0] + " dp=" + docPointer); + } + docPointer--; + if (freqsReader != null) { + freqPointer = freqsReader.offset(); + freqPointerMax = freqsReader.end(); + if (freqPointer >= freqPointerMax) { + freqPointerMax = freqsReader.fill(); + } + freqPointer--; + } + //System.out.println(" count=" + count + " docMax=" + docPointerMax + " freqMax=" + freqPointerMax + " doc=" + doc); } else { - doc = NO_MORE_DOCS; + // seek did not jump -- just fill next buffer + docPointerMax = docDeltasReader.fill(); + //System.out.println(" fill docDeltas max=" + docPointerMax); + if (docPointerMax != 0) { + docPointer = 0; + assert first || docDeltas[0] > 0; + doc += docDeltas[0]; + assert doc < maxDoc: "doc=" + doc + " maxDoc=" + maxDoc; + count++; + first = false; + //System.out.println(" doc=" + doc + " dd[0]=" + docDeltas[0]); + } else { + return doc = NO_MORE_DOCS; + } + if (freqsReader != null && ++freqPointer >= freqPointerMax) { + freqPointerMax = freqsReader.fill(); + assert freqPointerMax != 0; + freqPointer = 0; + } + } + //System.out.println(" ts now scan start doc=" + doc); + + // now scan + while(true) { + assert doc >= 0 && doc != NO_MORE_DOCS; + if (doc >= target && (skipDocs == 
null || !skipDocs.get(doc))) { + //System.out.println(" ret doc=" + doc + " count=" + count); + return doc; + } + + if (count >= docFreq) { + break; + } + + if (++docPointer >= docPointerMax) { + docPointerMax = docDeltasReader.fill(); + if (docPointerMax != 0) { + docPointer = 0; + } else { + return doc = NO_MORE_DOCS; + } + } + + if (freqsReader != null && ++freqPointer >= freqPointerMax) { + freqPointerMax = freqsReader.fill(); + assert freqPointerMax != 0; + freqPointer = 0; + } + + assert first || docDeltas[docPointer] > 0; + doc += docDeltas[docPointer]; + assert doc < maxDoc; + count++; } - return doc; + //System.out.println(" fallout END"); + return doc = NO_MORE_DOCS; } /** Returns a string representation of this TermScorer. */ @Override public String toString() { return "scorer(" + weight + ")"; } - } Index: src/java/org/apache/lucene/store/MMapDirectory.java =================================================================== --- src/java/org/apache/lucene/store/MMapDirectory.java (revision 1044119) +++ src/java/org/apache/lucene/store/MMapDirectory.java (working copy) @@ -214,7 +214,8 @@ } } - private class MMapIndexInput extends IndexInput { + // nocommit was private + public class MMapIndexInput extends IndexInput { private ByteBuffer buffer; private final long length; @@ -225,6 +226,11 @@ this.buffer = raf.getChannel().map(MapMode.READ_ONLY, 0, length); } + // nocommit + public ByteBuffer getBuffer() { + return buffer; + } + @Override public byte readByte() throws IOException { try { @@ -283,7 +289,8 @@ // Because Java's ByteBuffer uses an int to address the // values, it's necessary to access a file > // Integer.MAX_VALUE in size using multiple byte buffers. - private class MultiMMapIndexInput extends IndexInput { + // nocommit was private + public class MultiMMapIndexInput extends IndexInput { private ByteBuffer[] buffers; private int[] bufSizes; // keep here, ByteBuffer.size() method is optional @@ -331,6 +338,11 @@ seek(0L); } + public ByteBuffer getBuffer() { + // nocommit fixup + return null; + } + @Override public byte readByte() throws IOException { // Performance might be improved by reading ahead into an array of Index: src/java/org/apache/lucene/util/BitUtil.java =================================================================== --- src/java/org/apache/lucene/util/BitUtil.java (revision 1044119) +++ src/java/org/apache/lucene/util/BitUtil.java (working copy) @@ -814,4 +814,25 @@ return v; } + /** Returns the smallest non negative p such that a given value < (2**(p+1)) + * This differs from (63 - java.lang.Long.numberOfLeadingZeros(v)) + * for non positive given values. + */ + public static int logNextHigherPowerOfTwo(long v) { + long vinput = v; // only for assertions below. 
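+    // Worked examples (traceable through the loops below):
+    //   logNextHigherPowerOfTwo(1) == 0   (1 < 2**1)
+    //   logNextHigherPowerOfTwo(7) == 2   (7 < 2**3)
+    //   logNextHigherPowerOfTwo(8) == 3   (8 < 2**4)
+    //   logNextHigherPowerOfTwo(v) == 0   for any v <= 0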
+ int p = 0; + while (v >= (1 << 8)) { + v >>= 8; + p += 8; + } + while (v >= (1 << 1)) { + v >>= 1; + p++; + } + assert (p <= 62) : p; + assert (p == 62) || (vinput < (1L << (p + 1))) : "p " + p + ", vinput " + vinput; + assert (p == 0) || (vinput >= (1L << p)) : "p " + p + ", vinput " + vinput; + assert (vinput <= 0) || (p == (63 - java.lang.Long.numberOfLeadingZeros(vinput))) : "p " + p + ", vinput " + vinput; + return p; + } } Index: src/test/org/apache/lucene/TestDemo.java =================================================================== --- src/test/org/apache/lucene/TestDemo.java (revision 1044119) +++ src/test/org/apache/lucene/TestDemo.java (working copy) @@ -61,9 +61,10 @@ // Now search the index: IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true + QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "fieldname", analyzer); + assertEquals(1, isearcher.search(new TermQuery(new Term("fieldname", longTerm)), 1).totalHits); // Parse a simple query that searches for "text": - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "fieldname", analyzer); Query query = parser.parse("text"); TopDocs hits = isearcher.search(query, null, 1); assertEquals(1, hits.totalHits); Index: src/test/org/apache/lucene/TestExternalCodecs.java =================================================================== --- src/test/org/apache/lucene/TestExternalCodecs.java (revision 1044119) +++ src/test/org/apache/lucene/TestExternalCodecs.java (working copy) @@ -342,6 +342,114 @@ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) { return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), skipDocs); } + + @Override + public BulkPostingsEnum bulkPostings(BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException { + return new RAMBulkPostingsEnum(ramField.termToDocs.get(current)); + } + } + + static final int BULK_BUFFER_SIZE = 64; + + // Bulk postings API + private static class RAMBulkPostingsEnum extends BulkPostingsEnum { + private final RAMTerm ramTerm; + private final BlockReader docDeltasReader; + private final BlockReader freqsReader; + private final BlockReader posDeltasReader; + + public RAMBulkPostingsEnum(RAMTerm ramTerm) throws IOException { + this.ramTerm = ramTerm; + + int[] docDeltas = new int[10]; + int[] freqs = new int[10]; + int[] posDeltas = new int[10]; + int docUpto = 0; + int posUpto = 0; + int lastDocID = 0; + for(RAMDoc doc : ramTerm.docs) { + if (docDeltas.length == docUpto) { + docDeltas = ArrayUtil.grow(docDeltas, 1+docUpto); + freqs = ArrayUtil.grow(freqs, 1+docUpto); + } + docDeltas[docUpto] = doc.docID - lastDocID; + freqs[docUpto] = doc.positions.length; + docUpto++; + lastDocID = doc.docID; + int lastPos = 0; + for(int pos : doc.positions) { + if (posDeltas.length == posUpto) { + posDeltas = ArrayUtil.grow(posDeltas, 1+posUpto); + } + posDeltas[posUpto++] = pos - lastPos; + lastPos = pos; + } + } + docDeltasReader = new SimpleBlockReader(docDeltas, docUpto); + freqsReader = new SimpleBlockReader(freqs, docUpto); + posDeltasReader = new SimpleBlockReader(posDeltas, posUpto); + } + + @Override + public BlockReader getDocDeltasReader() { + return docDeltasReader; + } + + @Override + public BlockReader getFreqsReader() { + return freqsReader; + } + + @Override + public BlockReader getPositionDeltasReader() { + return posDeltasReader; + } + + @Override + public JumpResult jump(int target, int curCount) { + return null; + } + + private static class SimpleBlockReader extends 
BlockReader { + private final int[] ints; + private final int count; + private boolean done; + + public SimpleBlockReader(int[] ints, int count) { + this.ints = ints; + this.count = count; + } + + @Override + public int[] getBuffer() { + return ints; + } + + @Override + public int fill() { + if (!done) { + done = true; + return count; + } else { + return 0; + } + } + + @Override + public int end() { + return done ? 0 : count; + } + + @Override + public int offset() { + return 0; + } + + @Override + public void setOffset(int offset) { + throw new UnsupportedOperationException(); + } + } } private static class RAMDocsEnum extends DocsEnum { Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 1044119) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -2892,4 +2892,138 @@ dir.close(); } + + public void testGrowingGaps() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + //w.w.setInfoStream(System.out); + Document doc = new Document(); + Field f = newField(random, "field", "two", Field.Store.NO, Field.Index.ANALYZED); + doc.add(f); + final int NUM_GAPS = 100; + for(int i=0;i= docDeltaMax) { + docDeltaMax = docDeltasReader.fill(); + } + docID = 0; + for(int i=0;i 0 || i==0); + docID += docDeltas[docDeltaUpto++]; + assertEquals(docID, docIDs[i]); + } + + // nocommit test reuse too + // test jump using BulkPostingsEnum: + boolean didJump = false; + for(int i=0;i= docDeltaMax) { + docDeltaMax = docDeltasReader.fill(); + //System.out.println(" do pre-fill"); + } + for(int j=count;j 0); for(int i=0;i<11777;i++) { - assertEquals(i, r.next()); + assertEquals(i, buffer[pointer++]); + if (pointer == pointerMax) { + pointerMax = r.fill(); + assertTrue(pointerMax > 0); + pointer = 0; + } } in.close(); Index: src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java =================================================================== --- src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java (revision 1044119) +++ src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java (working copy) @@ -81,11 +81,14 @@ public void seek(long pos) {} public int readBlock() throws IOException { buffer[0] = in.readVInt(); + //System.out.println("readBlock in=" + in + " fp=" + in.getFilePointer() + ":\n buffer[0]=" + buffer[0]); final int count = buffer[0] <= 3 ? baseBlockSize-1 : 2*baseBlockSize-1; assert buffer.length >= count: "buffer.length=" + buffer.length + " count=" + count; for(int i=0;i