Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1140596) +++ lucene/CHANGES.txt (working copy) @@ -489,6 +489,10 @@ * LUCENE-3251: Directory#copy failed to close target output if opening the source stream failed. (Simon Willnauer) +* LUCENE-3254: Fixed minor bug in deletes were written to disk, + causing the file to sometimes be larger than it needed to be. (Mike + McCandless) + Optimizations * LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated Index: lucene/src/test/org/apache/lucene/util/TestBitVector.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestBitVector.java (revision 1140596) +++ lucene/src/test/org/apache/lucene/util/TestBitVector.java (working copy) @@ -211,70 +211,4 @@ } return equal; } - - private static int[] subsetPattern = new int[] { 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1 }; - - /** - * Tests BitVector.subset() against the above pattern - */ - public void testSubset() { - doTestSubset(0, 0); - doTestSubset(0, 20); - doTestSubset(0, 7); - doTestSubset(0, 8); - doTestSubset(0, 9); - doTestSubset(0, 15); - doTestSubset(0, 16); - doTestSubset(0, 17); - doTestSubset(1, 7); - doTestSubset(1, 8); - doTestSubset(1, 9); - doTestSubset(1, 15); - doTestSubset(1, 16); - doTestSubset(1, 17); - doTestSubset(2, 20); - doTestSubset(3, 20); - doTestSubset(4, 20); - doTestSubset(5, 20); - doTestSubset(6, 20); - doTestSubset(7, 14); - doTestSubset(7, 15); - doTestSubset(7, 16); - doTestSubset(8, 15); - doTestSubset(9, 20); - doTestSubset(10, 20); - doTestSubset(11, 20); - doTestSubset(12, 20); - doTestSubset(13, 20); - } - - /** - * Compare a subset against the corresponding portion of the test pattern - */ - private void doTestSubset(int start, int end) { - BitVector full = createSubsetTestVector(); - BitVector subset = full.subset(start, end); - assertEquals(end - start, subset.size()); - int count = 0; - for (int i = start, j = 0; i < end; i++, j++) { - if (subsetPattern[i] == 1) { - count++; - assertTrue(subset.get(j)); - } else { - assertFalse(subset.get(j)); - } - } - assertEquals(count, subset.count()); - } - - private BitVector createSubsetTestVector() { - BitVector bv = new BitVector(subsetPattern.length); - for (int i = 0; i < subsetPattern.length; i++) { - if (subsetPattern[i] == 1) { - bv.set(i); - } - } - return bv; - } - } Index: lucene/src/java/org/apache/lucene/util/BitVector.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BitVector.java (revision 1140596) +++ lucene/src/java/org/apache/lucene/util/BitVector.java (working copy) @@ -24,14 +24,16 @@ import org.apache.lucene.store.IndexOutput; /** Optimized implementation of a vector of bits. This is more-or-less like - java.util.BitSet, but also includes the following: - - */ + * java.util.BitSet, but also includes the following: + * + * + * @lucene.internal + */ public final class BitVector implements Cloneable, Bits { private byte[] bits; @@ -41,16 +43,24 @@ /** Constructs a vector capable of holding n bits. */ public BitVector(int n) { size = n; - bits = new byte[(size >> 3) + 1]; + bits = new byte[getNumBytes(size)]; count = 0; } - + BitVector(byte[] bits, int size) { this.bits = bits; this.size = size; count = -1; } + private int getNumBytes(int size) { + int bytesLength = size >>> 3; + if ((size & 7) != 0) { + bytesLength++; + } + return bytesLength; + } + @Override public Object clone() { byte[] copyBits = new byte[bits.length]; @@ -158,13 +168,25 @@ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; + private static String CODEC = "BitVector"; + // Version before version tracking was added: + private final static int VERSION_PRE = -1; + + // First version: + private final static int VERSION_START = 0; + + // Increment version to change it: + private final static int VERSION_CURRENT = VERSION_START; + /** Writes this vector to the file name in Directory d, in a format that can be read by the constructor {@link #BitVector(Directory, String)}. */ public final void write(Directory d, String name) throws IOException { IndexOutput output = d.createOutput(name); try { + output.writeInt(-2); + CodecUtil.writeHeader(output, CODEC, VERSION_CURRENT); if (isSparse()) { writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps. } else { @@ -202,19 +224,38 @@ /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */ private boolean isSparse() { - // note: order of comparisons below set to favor smaller values (no binary range search.) - // note: adding 4 because we start with ((int) -1) to indicate d-gaps format. - // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore - // multiplying count by (8+8) or (8+16) or (8+24) etc.: - // - first 8 for writing bits[i] (1 byte vs. 1 bit), and - // - second part for writing the byte-number d-gap as vint. + + final int setCount = count(); + if (setCount == 0) { + return true; + } + + final int avgGapLength = bits.length / setCount; + + // expected number of bytes for vInt encoding of each gap + final int expectedDGapBytes; + if (avgGapLength <= (1<< 7)) { + expectedDGapBytes = 1; + } else if (avgGapLength <= (1<<14)) { + expectedDGapBytes = 2; + } else if (avgGapLength <= (1<<21)) { + expectedDGapBytes = 3; + } else if (avgGapLength <= (1<<28)) { + expectedDGapBytes = 4; + } else { + expectedDGapBytes = 5; + } + + // +1 because we write the byte itself that contains the + // set bit + final int bytesPerSetBit = expectedDGapBytes + 1; + + // note: adding 32 because we start with ((int) -1) to indicate d-gaps format. + final long expectedBits = 32 + 8 * bytesPerSetBit * count(); + // note: factor is for read/write of byte-arrays being faster than vints. - int factor = 10; - if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size(); - if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size(); - if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size(); - if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size(); - return factor * (4 + (8+40)*count()) < size(); + final long factor = 10; + return factor * expectedBits < size(); } /** Constructs a bit vector from the file name in Directory @@ -222,8 +263,18 @@ */ public BitVector(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); + try { - size = input.readInt(); // read size + final int firstInt = input.readInt(); + final int version; + if (firstInt == -2) { + // New format, with full header & version: + version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION_START); + size = input.readInt(); + } else { + version = VERSION_PRE; + size = firstInt; + } if (size == -1) { readDgaps(input); } else { @@ -237,7 +288,7 @@ /** Read as a bit set */ private void readBits(IndexInput input) throws IOException { count = input.readInt(); // read count - bits = new byte[(size >> 3) + 1]; // allocate bits + bits = new byte[getNumBytes(size)]; // allocate bits input.readBytes(bits, 0, bits.length); } @@ -254,30 +305,4 @@ n -= BYTE_COUNTS[bits[last] & 0xFF]; } } - - /** - * Retrieve a subset of this BitVector. - * - * @param start - * starting index, inclusive - * @param end - * ending index, exclusive - * @return subset - */ - public BitVector subset(int start, int end) { - if (start < 0 || end > size() || end < start) - throw new IndexOutOfBoundsException(); - // Special case -- return empty vector is start == end - if (end == start) return new BitVector(0); - byte[] bits = new byte[((end - start - 1) >>> 3) + 1]; - int s = start >>> 3; - for (int i = 0; i < bits.length; i++) { - int cur = 0xFF & this.bits[i + s]; - int next = i + s + 1 >= this.bits.length ? 0 : 0xFF & this.bits[i + s + 1]; - bits[i] = (byte) ((cur >>> (start & 7)) | ((next << (8 - (start & 7))))); - } - int bitsToClear = (bits.length * 8 - (end - start)) % 8; - bits[bits.length - 1] &= ~(0xFF << (8 - bitsToClear)); - return new BitVector(bits, end - start); - } }