Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1140596) +++ lucene/CHANGES.txt (working copy) @@ -489,6 +489,10 @@ * LUCENE-3251: Directory#copy failed to close target output if opening the source stream failed. (Simon Willnauer) +* LUCENE-3254: Fixed minor bug in how deletes were written to disk, + causing the file to sometimes be larger than it needed to be. (Mike + McCandless) + Optimizations * LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated Index: lucene/src/test/org/apache/lucene/util/TestBitVector.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestBitVector.java (revision 1140596) +++ lucene/src/test/org/apache/lucene/util/TestBitVector.java (working copy) @@ -211,70 +211,4 @@ } return equal; } - - private static int[] subsetPattern = new int[] { 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1 }; - - /** - * Tests BitVector.subset() against the above pattern - */ - public void testSubset() { - doTestSubset(0, 0); - doTestSubset(0, 20); - doTestSubset(0, 7); - doTestSubset(0, 8); - doTestSubset(0, 9); - doTestSubset(0, 15); - doTestSubset(0, 16); - doTestSubset(0, 17); - doTestSubset(1, 7); - doTestSubset(1, 8); - doTestSubset(1, 9); - doTestSubset(1, 15); - doTestSubset(1, 16); - doTestSubset(1, 17); - doTestSubset(2, 20); - doTestSubset(3, 20); - doTestSubset(4, 20); - doTestSubset(5, 20); - doTestSubset(6, 20); - doTestSubset(7, 14); - doTestSubset(7, 15); - doTestSubset(7, 16); - doTestSubset(8, 15); - doTestSubset(9, 20); - doTestSubset(10, 20); - doTestSubset(11, 20); - doTestSubset(12, 20); - doTestSubset(13, 20); - } - - /** - * Compare a subset against the corresponding portion of the test pattern - */ - private void doTestSubset(int start, int end) { - BitVector full = createSubsetTestVector(); - BitVector subset = full.subset(start, end); - assertEquals(end - start, 
subset.size()); - int count = 0; - for (int i = start, j = 0; i < end; i++, j++) { - if (subsetPattern[i] == 1) { - count++; - assertTrue(subset.get(j)); - } else { - assertFalse(subset.get(j)); - } - } - assertEquals(count, subset.count()); - } - - private BitVector createSubsetTestVector() { - BitVector bv = new BitVector(subsetPattern.length); - for (int i = 0; i < subsetPattern.length; i++) { - if (subsetPattern[i] == 1) { - bv.set(i); - } - } - return bv; - } - } Index: lucene/src/java/org/apache/lucene/util/BitVector.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BitVector.java (revision 1140596) +++ lucene/src/java/org/apache/lucene/util/BitVector.java (working copy) @@ -24,14 +24,16 @@ import org.apache.lucene.store.IndexOutput; /** Optimized implementation of a vector of bits. This is more-or-less like - java.util.BitSet, but also includes the following: -
n bits. */
public BitVector(int n) {
size = n;
- bits = new byte[(size >> 3) + 1];
+ bits = new byte[getNumBytes(size)];
count = 0;
}
-
+
BitVector(byte[] bits, int size) {
this.bits = bits;
this.size = size;
count = -1;
}
+ /** Returns the number of bytes needed to hold {@code size} bits, rounding up to a whole byte. */
+ private int getNumBytes(int size) {
+   // every complete group of 8 bits occupies exactly one byte ...
+   final int wholeBytes = size >>> 3;
+   // ... and leftover bits (size not a multiple of 8) need one extra byte
+   final boolean hasPartialByte = (size & 7) != 0;
+   return hasPartialByte ? wholeBytes + 1 : wholeBytes;
+ }
+
@Override
public Object clone() {
byte[] copyBits = new byte[bits.length];
@@ -158,13 +168,25 @@
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
+ // Codec name handed to CodecUtil.writeHeader/checkHeader to identify BitVector files:
+ private static final String CODEC = "BitVector";
+
+ // Version before version tracking was added:
+ private static final int VERSION_PRE = -1;
+
+ // First version:
+ private static final int VERSION_START = 0;
+
+ // Version written by write(); point this at a new, higher constant whenever the format changes:
+ private static final int VERSION_CURRENT = VERSION_START;
+
/** Writes this vector to the file name in Directory
d, in a format that can be read by the constructor {@link
#BitVector(Directory, String)}. */
public final void write(Directory d, String name) throws IOException {
IndexOutput output = d.createOutput(name);
try {
+ output.writeInt(-2);
+ CodecUtil.writeHeader(output, CODEC, VERSION_CURRENT);
if (isSparse()) {
writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
} else {
@@ -202,19 +224,38 @@
/** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
private boolean isSparse() {
- // note: order of comparisons below set to favor smaller values (no binary range search.)
- // note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
- // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
- // multiplying count by (8+8) or (8+16) or (8+24) etc.:
- // - first 8 for writing bits[i] (1 byte vs. 1 bit), and
- // - second part for writing the byte-number d-gap as vint.
+
+ final int setCount = count();
+ // no bits set: always treat as sparse (this also avoids dividing by zero below)
+ if (setCount == 0) {
+ return true;
+ }
+
+ // average number of bytes of the vector per set bit
+ final int avgGapLength = bits.length / setCount;
+
+ // expected number of bytes for vInt encoding of each gap
+ final int expectedDGapBytes;
+ if (avgGapLength <= (1<< 7)) {
+ expectedDGapBytes = 1;
+ } else if (avgGapLength <= (1<<14)) {
+ expectedDGapBytes = 2;
+ } else if (avgGapLength <= (1<<21)) {
+ expectedDGapBytes = 3;
+ } else if (avgGapLength <= (1<<28)) {
+ expectedDGapBytes = 4;
+ } else {
+ expectedDGapBytes = 5;
+ }
+
+ // +1 because we write the byte itself that contains the
+ // set bit
+ final int bytesPerSetBit = expectedDGapBytes + 1;
+
+ // note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
+ // The 8L literal forces 64-bit arithmetic: in int math the product wraps once
+ // 8 * bytesPerSetBit * setCount exceeds Integer.MAX_VALUE (roughly 2^27 set bits),
+ // which could make a dense vector look sparse.
+ final long expectedBits = 32 + 8L * bytesPerSetBit * setCount;
+
// note: factor is for read/write of byte-arrays being faster than vints.
- int factor = 10;
- if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size();
- if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size();
- if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size();
- if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size();
- return factor * (4 + (8+40)*count()) < size();
+ final long factor = 10;
+ return factor * expectedBits < size();
}
/** Constructs a bit vector from the file name in Directory
@@ -222,8 +263,18 @@
*/
public BitVector(Directory d, String name) throws IOException {
IndexInput input = d.openInput(name);
+
try {
- size = input.readInt(); // read size
+ // The first int tells the formats apart: -2 marks a file that carries a codec header.
+ final int firstInt = input.readInt();
+ final int version;
+ if (firstInt == -2) {
+ // New format, with full header & version:
+ // accept every version from the first one up to the one this code writes, so that
+ // bumping VERSION_CURRENT does not make freshly written files unreadable
+ version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION_CURRENT);
+ size = input.readInt();
+ } else {
+ // Pre-versioning format: the int just read is already the size field
+ version = VERSION_PRE;
+ size = firstInt;
+ }
if (size == -1) {
readDgaps(input);
} else {
@@ -237,7 +288,7 @@
/** Read as a bit set */
private void readBits(IndexInput input) throws IOException {
count = input.readInt(); // read count
- bits = new byte[(size >> 3) + 1]; // allocate bits
+ bits = new byte[getNumBytes(size)]; // allocate bits
input.readBytes(bits, 0, bits.length);
}
@@ -254,30 +305,4 @@
n -= BYTE_COUNTS[bits[last] & 0xFF];
}
}
-
- /**
- * Retrieve a subset of this BitVector.
- *
- * @param start
- * starting index, inclusive
- * @param end
- * ending index, exclusive
- * @return subset
- */
- public BitVector subset(int start, int end) {
- if (start < 0 || end > size() || end < start)
- throw new IndexOutOfBoundsException();
- // Special case -- return empty vector is start == end
- if (end == start) return new BitVector(0);
- byte[] bits = new byte[((end - start - 1) >>> 3) + 1];
- int s = start >>> 3;
- for (int i = 0; i < bits.length; i++) {
- int cur = 0xFF & this.bits[i + s];
- int next = i + s + 1 >= this.bits.length ? 0 : 0xFF & this.bits[i + s + 1];
- bits[i] = (byte) ((cur >>> (start & 7)) | ((next << (8 - (start & 7)))));
- }
- int bitsToClear = (bits.length * 8 - (end - start)) % 8;
- bits[bits.length - 1] &= ~(0xFF << (8 - bitsToClear));
- return new BitVector(bits, end - start);
- }
}