Index: src/test/org/apache/lucene/util/packed/TestPackedInts.java
===================================================================
--- src/test/org/apache/lucene/util/packed/TestPackedInts.java (revision 922905)
+++ src/test/org/apache/lucene/util/packed/TestPackedInts.java (working copy)
@@ -71,10 +71,12 @@
           w.add(values[i]);
         }
         w.finish();
+        final long fp = out.getFilePointer();
         out.close();

         IndexInput in = d.openInput("out.bin");
         PackedInts.Reader r = PackedInts.getReader(in);
+        assertEquals(fp, in.getFilePointer());
         for(int i=0;i
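The added assertEquals pins down an invariant the terms-index reader below now depends on: PackedInts.getReader() must consume exactly the bytes PackedInts.getWriter() produced, so the caller can keep reading whatever follows the packed block in the same file. A minimal sketch of that round-trip, using the same flex-branch store API the test itself uses (the file name and values are illustrative):

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.packed.PackedInts;

public class PackedRoundTrip {
  public static void main(String[] args) throws Exception {
    final long[] values = {17, 5, 42, 0, 31};
    Directory d = new RAMDirectory();

    IndexOutput out = d.createOutput("test.bin");
    PackedInts.Writer w = PackedInts.getWriter(out, values.length,
                                               PackedInts.bitsRequired(42));
    for(long v : values) {
      w.add(v);
    }
    w.finish();
    final long fp = out.getFilePointer();   // end of the packed block
    out.close();

    IndexInput in = d.openInput("test.bin");
    PackedInts.Reader r = PackedInts.getReader(in);
    for(int i=0;i<values.length;i++) {
      if (r.get(i) != values[i]) throw new AssertionError("i=" + i);
    }
    // the reader must stop exactly at the writer's end of stream
    if (in.getFilePointer() != fp) throw new AssertionError();
    in.close();
    d.close();
  }
}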
Index: src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (revision 922905)
+++ src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (working copy)
   private final Comparator<BytesRef> termComp;
+  private final String segment;

   final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
@@ -80,6 +82,8 @@
     this.termComp = termComp;

+    this.segment = segment;
+
     IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_INDEX_EXTENSION));

     boolean success = false;
@@ -118,10 +122,13 @@
           System.out.println("  read field number=" + field);
         }
         final int numIndexTerms = in.readInt();
+        final long termsStart = in.readLong();
         final long indexStart = in.readLong();
+        final long packedIndexStart = in.readLong();
+        assert packedIndexStart >= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment;
         if (numIndexTerms > 0) {
           final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
-          fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart));
+          fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart));
         }
       }
       success = true;
@@ -130,7 +137,6 @@
       in.close();
       this.in = null;
       if (success) {
-        trimByteBlock();
         indexLoaded = true;
       }
     } else {
@@ -139,48 +145,6 @@
     }
   }

-  /* Called when index is fully loaded.  We know we will use
-   * no more bytes in the final byte[], so trim it down to
-   * its actual usagee.  This substantially reduces memory
-   * usage of SegmentReader searching a tiny segment. */
-  private final void trimByteBlock() {
-    if (blockOffset == 0) {
-      // There were no fields in this segment:
-      if (blocks != null) {
-        blocks[blockUpto] = null;
-      }
-    } else {
-      byte[] last = new byte[blockOffset];
-      System.arraycopy(blocks[blockUpto], 0, last, 0, blockOffset);
-      blocks[blockUpto] = last;
-    }
-  }
-
-  // TODO: we can record precisely how many bytes are
-  // required during indexing, save that into file, and be
-  // precise when we allocate the blocks; we even don't need
-  // to use blocks anymore (though my still want to, to
-  // prevent allocation failure due to mem fragmentation on
-  // 32bit)
-
-  // Fixed size byte blocks, to hold all term bytes; these
-  // blocks are shared across fields
-  private byte[][] blocks;
-  int blockUpto;
-  int blockOffset;
-
-  private static final int BYTE_BLOCK_SHIFT = 15;
-  private static final int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
-  private static final int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
-
-  static {
-    // Make sure DW can't ever write a term whose length
-    // cannot be encoded with short (because we use short[]
-    // to hold the length of each term).
-    assert IndexWriter.MAX_TERM_LENGTH < Short.MAX_VALUE;
-    assert BYTE_BLOCK_SIZE >= IndexWriter.MAX_TERM_LENGTH;
-  }
-
   private final class FieldIndexReader extends FieldReader {

     final private FieldInfo fieldInfo;
@@ -190,14 +154,18 @@
     private final IndexInput in;

     private final long indexStart;
+    private final long termsStart;
+    private final long packedIndexStart;

     private final int numIndexTerms;

-    public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart) throws IOException {
+    public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart) throws IOException {

       this.fieldInfo = fieldInfo;
       this.in = in;
+      this.termsStart = termsStart;
       this.indexStart = indexStart;
+      this.packedIndexStart = packedIndexStart;
       this.numIndexTerms = numIndexTerms;

       // We still create the indexReader when indexDivisor
@@ -210,6 +178,8 @@
         }

         coreIndex = new CoreFieldIndex(indexStart,
+                                       termsStart,
+                                       packedIndexStart,
                                        numIndexTerms);
       } else {
@@ -221,7 +191,7 @@

     public void loadTermsIndex() throws IOException {
       if (coreIndex == null) {
-        coreIndex = new CoreFieldIndex(indexStart, numIndexTerms);
+        coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, numIndexTerms);
       }
     }
@@ -263,36 +233,30 @@

     private final class CoreFieldIndex {

-      // TODO: used packed ints here
-      // Pointer into terms dict file that we are indexing
-      final long[] fileOffset;
+      // holds bytes for all terms
+      final byte[] termBytes;

-      // TODO: used packed ints here
-      // For each term, points to start of term's bytes within
-      // block.
-      // TODO: wasteful that this is always long; many terms
-      // dict indexes obviously don't require so much address
-      // space; since we know up front during indexing how
-      // much space is needed we could pack this to the
-      // precise # bits
-      final long[] blockPointer;
+      // offset into index termBytes
+      final PackedInts.Reader termOffsets;

-      // TODO: used packed ints here: we know max term
-      // length; often its small
+      // index pointers into main terms dict
+      final PackedInts.Reader termsDictOffsets;

-      // TODO: can we inline this w/ the bytes?  like
-      // DW.  vast majority of terms only need 1 byte, not 2
-
-      // Length of each term
-      final short[] termLength;

       final int numIndexTerms;

-      CoreFieldIndex(long indexStart, int numIndexTerms) throws IOException {
+      final long termsStart;
+
+      CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, int numIndexTerms) throws IOException {
+
+        this.termsStart = termsStart;
+
         IndexInput clone = (IndexInput) in.clone();
         clone.seek(indexStart);

+        // nocommit -- must get subsampling working rel. efficiently
+        //assert indexDivisor == -1 || indexDivisor == 1;
+        indexDivisor = 1;
+
         if (indexDivisor == -1) {
           // Special case: we are being loaded inside
           // IndexWriter because a SegmentReader that at
@@ -305,108 +269,23 @@

         assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor;

-        if (blocks == null) {
-          blocks = new byte[1][];
-          blocks[0] = new byte[BYTE_BLOCK_SIZE];
-        }
+        // TODO: this single block may be too big? -- eg
+        // allocation can fail in 32 bit JRE due to
+        // fragmentation
+        // nocommit: hmm this also limits total bytes of terms
+        // index's terms to 2.1B -- must switch this to
+        // paged impl?
+        final int numTermBytes = (int) (packedIndexStart - indexStart);
+        termBytes = new byte[numTermBytes];
+        clone.readBytes(termBytes, 0, numTermBytes, false);

-        byte[] lastBlock = blocks[blockUpto];
-        int lastBlockOffset = blockOffset;
+        termsDictOffsets = PackedInts.getReader(clone);
+        assert termsDictOffsets.size() == numIndexTerms;

-        fileOffset = new long[this.numIndexTerms];
-        blockPointer = new long[this.numIndexTerms];
-        termLength = new short[this.numIndexTerms];
-
-        final byte[] skipBytes;
-        if (indexDivisor != 1) {
-          // only need skipBytes (below) if we are not
-          // loading all index terms
-          skipBytes = new byte[128];
-        } else {
-          skipBytes = null;
-        }
-
-        int upto = 0;
-        long pointer = 0;
-
-        for(int i=0;i BYTE_BLOCK_SIZE) {
-            // New block
-            final byte[] newBlock = new byte[BYTE_BLOCK_SIZE];
-            if (blocks.length == blockUpto+1) {
-              final int newSize = ArrayUtil.oversize(blockUpto+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
-              final byte[][] newBlocks = new byte[newSize][];
-              System.arraycopy(blocks, 0, newBlocks, 0, blocks.length);
-              blocks = newBlocks;
-            }
-            blockUpto++;
-            blocks[blockUpto] = newBlock;
-            blockOffset = 0;
-          }
-
-          final byte[] block = blocks[blockUpto];
-
-          // Copy old prefix
-          assert lastBlock != null || start == 0;
-          assert block != null;
-          System.arraycopy(lastBlock, lastBlockOffset, block, blockOffset, start);
-
-          // Read new suffix
-          clone.readBytes(block, blockOffset+start, suffix);
-
-          // Advance file offset
-          pointer += clone.readVLong();
-
-          assert thisTermLength < Short.MAX_VALUE;
-
-          termLength[upto] = (short) thisTermLength;
-          fileOffset[upto] = pointer;
-          blockPointer[upto] = blockUpto * BYTE_BLOCK_SIZE + blockOffset;
-
-          /*
-          BytesRef tr = new BytesRef();
-          tr.bytes = blocks[blockUpto];
-          tr.offset = blockOffset;
-          tr.length = thisTermLength;
-
-          //System.out.println("  read index term=" + new String(blocks[blockUpto], blockOffset, thisTermLength, "UTF-8") + " this=" + this + " bytes=" + block + " (vs=" + blocks[blockUpto] + ") offset=" + blockOffset);
-          //System.out.println("  read index term=" + tr.toBytesString() + " this=" + this + " bytes=" + block + " (vs=" + blocks[blockUpto] + ") offset=" + blockOffset);
-          */
-
-          lastBlock = block;
-          lastBlockOffset = blockOffset;
-          blockOffset += thisTermLength;
-          upto++;
-        } else {
-          // Skip bytes
-          int toSkip = suffix;
-          while(true) {
-            if (toSkip > skipBytes.length) {
-              clone.readBytes(skipBytes, 0, skipBytes.length);
-              toSkip -= skipBytes.length;
-            } else {
-              clone.readBytes(skipBytes, 0, toSkip);
-              break;
-            }
-          }
-
-          // Advance file offset
-          pointer += clone.readVLong();
-        }
-      }
-
+        termOffsets = PackedInts.getReader(clone);
+        assert termOffsets.size() == 1+numIndexTerms;

         clone.close();

-        assert upto == this.numIndexTerms;
-
         if (Codec.DEBUG) {
           System.out.println("  done read");
         }
@@ -423,30 +302,28 @@
       }

       private final void fillResult(int idx, TermsIndexResult result) {
-        final long loc = blockPointer[idx];
-        result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)];
-        result.term.offset = (int) (loc & BYTE_BLOCK_MASK);
-        result.term.length = termLength[idx];
+        result.term.bytes = termBytes;
+        result.term.offset = (int) termOffsets.get(idx);
+        result.term.length = (int) (termOffsets.get(1+idx) - result.term.offset);
         result.position = idx * totalIndexInterval;
-        result.offset = fileOffset[idx];
+        result.offset = termsStart + termsDictOffsets.get(idx);
       }

       public final void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {

         if (Codec.DEBUG) {
-          System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term + " indexLen = " + blockPointer.length + " numIndexTerms=" + fileOffset.length + " numIndexedTerms=" + fileOffset.length);
+          System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term.utf8ToString());
         }

         int lo = 0;        // binary search
-        int hi = fileOffset.length - 1;
+        int hi = numIndexTerms - 1;
+        result.term.bytes = termBytes;

         while (hi >= lo) {
           int mid = (lo + hi) >>> 1;
-          final long loc = blockPointer[mid];
-          result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)];
-          result.term.offset = (int) (loc & BYTE_BLOCK_MASK);
-          result.term.length = termLength[mid];
+          result.term.offset = (int) termOffsets.get(mid);
+          result.term.length = (int) (termOffsets.get(1+mid) - result.term.offset);

           int delta = termComp.compare(term, result.term);
           if (delta < 0) {
@@ -456,7 +333,7 @@
           } else {
             assert mid >= 0;
             result.position = mid*totalIndexInterval;
-            result.offset = fileOffset[mid];
+            result.offset = termsStart + termsDictOffsets.get(mid);
             return;
           }
         }
@@ -465,13 +342,11 @@
           hi = 0;
         }

-        final long loc = blockPointer[hi];
-        result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)];
-        result.term.offset = (int) (loc & BYTE_BLOCK_MASK);
-        result.term.length = termLength[hi];
+        result.term.offset = (int) termOffsets.get(hi);
+        result.term.length = (int) (termOffsets.get(1+hi) - result.term.offset);
         result.position = hi*totalIndexInterval;
-        result.offset = fileOffset[hi];
+        result.offset = termsStart + termsDictOffsets.get(hi);
       }

       public final void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
@@ -498,7 +373,6 @@
     while(it.hasNext()) {
       it.next().loadTermsIndex();
     }
-    trimByteBlock();
    indexLoaded = true;
    in.close();
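The rewritten CoreFieldIndex replaces the shared byte blocks, long[] file offsets and short[] term lengths with three structures per field: one byte[] holding every indexed term's bytes back to back (the region between indexStart and packedIndexStart), a packed table of terms-dict offsets relative to termsStart, and a packed table of 1+numIndexTerms offsets that slices the byte[], so term i occupies [offset(i), offset(i+1)). A plain-array sketch of the arithmetic fillResult() and the binary search above now perform (field names and sample data are illustrative; the patch stores the two tables as PackedInts.Reader, not long[]):

public class TermsIndexLayoutSketch {
  static final byte[] termBytes = "applebananacherry".getBytes();
  static final long[] termOffsets = {0, 5, 11, 17};     // 1+numIndexTerms entries
  static final long[] termsDictOffsets = {0, 120, 260}; // deltas from termsStart
  static final long termsStart = 1024;

  // terms-dict file pointer for the greatest index term <= target
  static long seekPoint(byte[] target) {
    int lo = 0, hi = termsDictOffsets.length - 1;
    while (hi >= lo) {
      final int mid = (lo + hi) >>> 1;
      final int off = (int) termOffsets[mid];
      final int len = (int) (termOffsets[mid+1] - off);
      final int cmp = compare(target, termBytes, off, len);
      if (cmp < 0) {
        hi = mid - 1;
      } else if (cmp > 0) {
        lo = mid + 1;
      } else {
        return termsStart + termsDictOffsets[mid];
      }
    }
    if (hi < 0) hi = 0;  // clamp below the first index term, as the patch does
    return termsStart + termsDictOffsets[hi];
  }

  static int compare(byte[] a, byte[] b, int bOff, int bLen) {
    final int lim = Math.min(a.length, bLen);
    for(int i=0;i<lim;i++) {
      final int diff = (a[i] & 0xff) - (b[bOff+i] & 0xff);
      if (diff != 0) return diff;
    }
    return a.length - bLen;
  }

  public static void main(String[] args) {
    System.out.println(seekPoint("banana".getBytes()));  // 1024 + 120
  }
}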
Index: src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java (revision 922905)
+++ src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java (working copy)
@@ -29,6 +29,7 @@

   public abstract class FieldWriter {
     public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
+    public abstract void finish() throws IOException;
   }

   public abstract FieldWriter addField(FieldInfo fieldInfo);
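The new abstract finish() gives each per-field writer an end-of-field hook: checkIndexTerm() can buffer per-term state cheaply and defer writing the packed tables until the field's last term has been seen. A minimal hypothetical skeleton showing the intended contract (the real implementation is SimpleFieldWriter below; this only illustrates the protocol):

import java.io.IOException;
import org.apache.lucene.util.BytesRef;

abstract class FieldWriterSkeleton {
  private final int termIndexInterval;
  private int numTerms;

  FieldWriterSkeleton(int termIndexInterval) {
    this.termIndexInterval = termIndexInterval;
  }

  // called once per term, in term order; returns true if the term was indexed
  public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
    if (0 == (numTerms++ % termIndexInterval)) {
      // buffer this index term's bytes, length and terms-dict pointer here
      return true;
    }
    return false;
  }

  // called once after the field's last term: flush the buffered state
  public abstract void finish() throws IOException;
}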
fieldInfo.name + " term=" + term + " indexLen = " + blockPointer.length + " numIndexTerms=" + fileOffset.length + " numIndexedTerms=" + fileOffset.length); + System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term.utf8ToString()); } int lo = 0; // binary search - int hi = fileOffset.length - 1; + int hi = numIndexTerms - 1; + result.term.bytes = termBytes; while (hi >= lo) { int mid = (lo + hi) >>> 1; - final long loc = blockPointer[mid]; - result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)]; - result.term.offset = (int) (loc & BYTE_BLOCK_MASK); - result.term.length = termLength[mid]; + result.term.offset = (int) termOffsets.get(mid); + result.term.length = (int) (termOffsets.get(1+mid) - result.term.offset); int delta = termComp.compare(term, result.term); if (delta < 0) { @@ -456,7 +333,7 @@ } else { assert mid >= 0; result.position = mid*totalIndexInterval; - result.offset = fileOffset[mid]; + result.offset = termsStart + termsDictOffsets.get(mid); return; } } @@ -465,13 +342,11 @@ hi = 0; } - final long loc = blockPointer[hi]; - result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)]; - result.term.offset = (int) (loc & BYTE_BLOCK_MASK); - result.term.length = termLength[hi]; + result.term.offset = (int) termOffsets.get(hi); + result.term.length = (int) (termOffsets.get(1+hi) - result.term.offset); result.position = hi*totalIndexInterval; - result.offset = fileOffset[hi]; + result.offset = termsStart + termsDictOffsets.get(hi); } public final void getIndexOffset(long ord, TermsIndexResult result) throws IOException { @@ -498,7 +373,6 @@ while(it.hasNext()) { it.next().loadTermsIndex(); } - trimByteBlock(); indexLoaded = true; in.close(); Index: src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java =================================================================== --- src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java (revision 922905) +++ src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java (working copy) @@ -29,6 +29,7 @@ public abstract class FieldWriter { public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException; + public abstract void finish() throws IOException; } public abstract FieldWriter addField(FieldInfo fieldInfo); Index: src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java =================================================================== --- src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (revision 922905) +++ src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (working copy) @@ -372,7 +372,7 @@ indexReader.getIndexOffset(term, indexResult); if (Codec.DEBUG) { - Codec.debug(" index pos=" + indexResult.position + " termFP=" + indexResult.offset + " term=" + indexResult.term + " this=" + this); + Codec.debug(" index pos=" + indexResult.position + " termFP=" + indexResult.offset + " term=" + indexResult.term.utf8ToString() + " this=" + this); } in.seek(indexResult.offset); @@ -507,6 +507,9 @@ } if (state.ord >= numTerms-1) { + if (Codec.DEBUG) { + Codec.debug(" return null ord=" + state.ord + " vs numTerms-1=" + (numTerms-1)); + } return null; } @@ -514,7 +517,7 @@ state.docFreq = in.readVInt(); if (Codec.DEBUG) { - Codec.debug(" text=" + bytesReader.term + " freq=" + state.docFreq + " tis=" + in); + Codec.debug(" text=" + bytesReader.term.utf8ToString() + " freq=" + state.docFreq + " tis=" + in); } // TODO: would be cleaner, but space-wasting, to Index: 
Index: src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (revision 922905)
+++ src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (working copy)
@@ -25,6 +25,8 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.packed.PackedInts;

 import java.util.List;
 import java.util.ArrayList;
@@ -58,7 +60,6 @@
     // Placeholder for dir offset
     out.writeLong(0);
     out.writeInt(termIndexInterval);
-    termWriter = new DeltaBytesWriter(out);
   }
@@ -66,8 +67,6 @@
     this.termsOut = termsOut;
   }

-  final private DeltaBytesWriter termWriter;
-
   @Override
   public FieldWriter addField(FieldInfo field) {
     SimpleFieldWriter writer = new SimpleFieldWriter(field);
@@ -78,33 +77,96 @@
   private class SimpleFieldWriter extends FieldWriter {
     final FieldInfo fieldInfo;
     int numIndexTerms;
-    private long lastTermsPointer;
     final long indexStart;
+    final long termsStart;
+    long packedIndexStart;
     private int numTerms;

+    // TODO: we could conceivably make a PackedInts wrapper
+    // that auto-grows... then we wouldn't force 6 bytes RAM
+    // per index term:
+    private short[] termLengths;
+    private int[] termsPointerDeltas;
+    private long lastTermsPointer;
+    private long totTermLength;
+
     SimpleFieldWriter(FieldInfo fieldInfo) {
       this.fieldInfo = fieldInfo;
       indexStart = out.getFilePointer();
-      termWriter.reset();
+      termsStart = lastTermsPointer = termsOut.getFilePointer();
+      termLengths = new short[0];
+      termsPointerDeltas = new int[0];
     }

     @Override
     public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
       // First term is first indexed term:
       if (0 == (numTerms++ % termIndexInterval)) {
-        final long termsPointer = termsOut.getFilePointer();
+
         if (Codec.DEBUG) {
-          Codec.debug("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + text + " termsFP=" + termsPointer + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
+          Codec.debug("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + text.utf8ToString() + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
         }
-        termWriter.write(text);
-        out.writeVLong(termsPointer - lastTermsPointer);
-        lastTermsPointer = termsPointer;
+
+        // write full bytes
+        out.writeBytes(text.bytes, text.offset, text.length);
+
+        if (termLengths.length == numIndexTerms) {
+          termLengths = ArrayUtil.grow(termLengths);
+        }
+        if (termsPointerDeltas.length == numIndexTerms) {
+          termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
+        }
+
+        // save delta terms pointer
+        final long fp = termsOut.getFilePointer();
+        termsPointerDeltas[numIndexTerms] = (int) (fp - lastTermsPointer);
+        lastTermsPointer = fp;
+
+        // save term length (in bytes)
+        assert text.length <= Short.MAX_VALUE;
+        termLengths[numIndexTerms] = (short) text.length;
+
+        totTermLength += text.length;
+        numIndexTerms++;
         return true;
       } else {
         return false;
       }
     }
+
+    @Override
+    public void finish() throws IOException {
+
+      // write primary terms dict offsets
+      packedIndexStart = out.getFilePointer();
+
+      final long maxValue = termsOut.getFilePointer();
+      PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue));
+
+      // relative to our indexStart
+      long upto = 0;
+      for(int i=0;i
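The surviving half of finish() sizes a PackedInts.Writer by bitsRequired(termsOut.getFilePointer()), the largest offset an index term could reference. Given the reader-side contract (result.offset = termsStart + termsDictOffsets.get(idx)), the remainder of finish() has to convert the buffered int deltas back into running totals before adding them. A sketch of that conversion (method and parameter names are illustrative, not the patch's):

import java.io.IOException;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;

class DictOffsetsSketch {
  // Re-accumulate per-term deltas into absolute offsets (relative to
  // termsStart) and write them with just enough bits for the largest value.
  static void writeDictOffsets(IndexOutput out, int[] termsPointerDeltas,
                               int numIndexTerms, long maxValue) throws IOException {
    PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms,
                                               PackedInts.bitsRequired(maxValue));
    long upto = 0;
    for(int i=0;i<numIndexTerms;i++) {
      upto += termsPointerDeltas[i];  // delta -> running total
      w.add(upto);
    }
    w.finish();
  }
}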
Index: src/java/org/apache/lucene/util/packed/PackedInts.java
===================================================================
--- src/java/org/apache/lucene/util/packed/PackedInts.java (revision 922905)
+++ src/java/org/apache/lucene/util/packed/PackedInts.java (working copy)
     if (maxValue > 0x1FFFFFFFFFFFFFFFL) {
       return 62;
     }
-    return (int) Math.ceil(Math.log(1+maxValue)/Math.log(2.0));
+    return Math.max(1, (int) Math.ceil(Math.log(1+maxValue)/Math.log(2.0)));
   }

   /**
Index: src/java/org/apache/lucene/util/packed/Packed32.java
===================================================================
--- src/java/org/apache/lucene/util/packed/Packed32.java (revision 922905)
+++ src/java/org/apache/lucene/util/packed/Packed32.java (working copy)
@@ -129,6 +129,7 @@
     super(valueCount, bitsPerValue);
     int size = size(bitsPerValue, valueCount);
     blocks = new int[size + 1]; // +1 due to non-conditional tricks
+    // TODO: find a faster way to bulk-read ints...
     for(int i = 0 ; i < size ; i++) {
       blocks[i] = in.readInt();
     }
Index: src/java/org/apache/lucene/util/ArrayUtil.java
===================================================================
--- src/java/org/apache/lucene/util/ArrayUtil.java (revision 922905)
+++ src/java/org/apache/lucene/util/ArrayUtil.java (working copy)
@@ -232,6 +232,29 @@
     return currentSize;
   }

+  public static short[] grow(short[] array, int minSize) {
+    if (array.length < minSize) {
+      short[] newArray = new short[oversize(minSize, RamUsageEstimator.NUM_BYTES_SHORT)];
+      System.arraycopy(array, 0, newArray, 0, array.length);
+      return newArray;
+    } else
+      return array;
+  }
+
+  public static short[] grow(short[] array) {
+    return grow(array, 1 + array.length);
+  }
+
+  public static short[] shrink(short[] array, int targetSize) {
+    final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT);
+    if (newSize != array.length) {
+      short[] newArray = new short[newSize];
+      System.arraycopy(array, 0, newArray, 0, newSize);
+      return newArray;
+    } else
+      return array;
+  }
+
   public static int[] grow(int[] array, int minSize) {
     if (array.length < minSize) {
       int[] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_INT)];
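Two quick checks on the utility changes above. The Math.max(1, ...) in bitsRequired() matters because a maxValue of 0 previously yielded a 0-bits-per-value writer; every packed block now uses at least one bit per value. And the new short[] overload of ArrayUtil.grow() follows the same amortized oversizing pattern as the existing int[] one, which is what lets SimpleFieldWriter buffer term lengths cheaply. A minimal sketch:

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;

public class UtilChecks {
  public static void main(String[] args) {
    // was 0 before this patch; 0 bits per value is not writable
    System.out.println(PackedInts.bitsRequired(0));    // 1
    System.out.println(PackedInts.bitsRequired(1));    // 1
    System.out.println(PackedInts.bitsRequired(255));  // 8

    short[] termLengths = new short[0];
    for(int i=0;i<100;i++) {
      if (termLengths.length == i) {
        termLengths = ArrayUtil.grow(termLengths);  // oversizes past i+1
      }
      termLengths[i] = (short) i;
    }
    System.out.println("grew to " + termLengths.length + " slots");
  }
}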