Index: src/test/org/apache/lucene/util/TestBitVector.java =================================================================== --- src/test/org/apache/lucene/util/TestBitVector.java (revision 480218) +++ src/test/org/apache/lucene/util/TestBitVector.java (working copy) @@ -17,6 +17,8 @@ * limitations under the License. */ +import java.io.IOException; + import junit.framework.TestCase; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; @@ -159,6 +161,47 @@ } /** + * Test r/w when size/count cause switching between bit-set and d-gaps file formats. + * @throws Exception + */ + public void testDgaps() throws IOException { + doTestDgaps(1,0,1); + doTestDgaps(10,0,1); + doTestDgaps(100,0,1); + doTestDgaps(1000,4,7); + doTestDgaps(10000,40,43); + doTestDgaps(100000,415,418); + doTestDgaps(1000000,3123,3126); + } + + private void doTestDgaps(int size, int count1, int count2) throws IOException { + Directory d = new RAMDirectory(); + BitVector bv = new BitVector(size); + for (int i=0; i=count1; i--) { + BitVector bv2 = new BitVector(d, "TESTBV"); + assertTrue(doCompare(bv,bv2)); + bv = bv2; + bv.clear(i); + assertEquals(i,bv.count()); + bv.write(d, "TESTBV"); + } + } + /** * Compare two BitVectors. * This should really be an equals method on the BitVector itself. * @param bv One bit vector Index: src/java/org/apache/lucene/util/BitVector.java =================================================================== --- src/java/org/apache/lucene/util/BitVector.java (revision 480218) +++ src/java/org/apache/lucene/util/BitVector.java (working copy) @@ -29,6 +29,7 @@
  • a count() method, which efficiently computes the number of one bits;
  • optimized read from and write to disk;
  • inlinable get() method;
  • +
  • store and load, as bit set or d-gaps, depending on sparseness;
  • @author Doug Cutting @@ -111,27 +112,94 @@ public final void write(Directory d, String name) throws IOException { IndexOutput output = d.createOutput(name); try { - output.writeInt(size()); // write size - output.writeInt(count()); // write count - output.writeBytes(bits, bits.length); // write bits + if (isSparse()) { + writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps. + } else { + writeBits(output); + } } finally { output.close(); } } + + /** Write as a bit set */ + private void writeBits(IndexOutput output) throws IOException { + output.writeInt(size()); // write size + output.writeInt(count()); // write count + output.writeBytes(bits, bits.length); + } + + /** Write as a d-gaps list */ + private void writeDgaps(IndexOutput output) throws IOException { + output.writeInt(-1); // mark using d-gaps + output.writeInt(size()); // write size + output.writeInt(count()); // write count + int last=0; + int n = count(); + int m = bits.length; + for (int i=0; i0; i++) { + if (bits[i]!=0) { + output.writeVInt(i-last); + output.writeByte(bits[i]); + last = i; + n -= BYTE_COUNTS[bits[i] & 0xFF]; + } + } + } + /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or desnse, and should be saved as a bit set. */ + private boolean isSparse() { + // note: order of comparisons below set to favor smaller values (no binary range search.) + // note: adding 4 because we start with ((int) -1) to indicate d-gaps format. + // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore + // multiplying count by (8+8) or (8+16) or (8+24) etc.: + // - first 8 for writing bits[i] (1 byte vs. 1 bit), and + // - second part for writing the byte-number d-gap as vint. + // note: factor is for read/write of byte-arrays being faster than vints. + int factor = 10; + if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size(); + if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size(); + if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size(); + if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size(); + return factor * (4 + (8+40)*count()) < size(); + } + /** Constructs a bit vector from the file name in Directory d, as written by the {@link #write} method. */ public BitVector(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); try { - size = input.readInt(); // read size - count = input.readInt(); // read count - bits = new byte[(size >> 3) + 1]; // allocate bits - input.readBytes(bits, 0, bits.length); // read bits + size = input.readInt(); // read size + if (size == -1) { + readDgaps(input); + } else { + readBits(input); + } } finally { input.close(); } } + /** Read as a bit set */ + private void readBits(IndexInput input) throws IOException { + count = input.readInt(); // read count + bits = new byte[(size >> 3) + 1]; // allocate bits + input.readBytes(bits, 0, bits.length); + } + + /** read as a d-gaps list */ + private void readDgaps(IndexInput input) throws IOException { + size = input.readInt(); // (re)read size + count = input.readInt(); // read count + bits = new byte[(size >> 3) + 1]; // allocate bits + int last=0; + int n = count(); + while (n>0) { + last += input.readVInt(); + bits[last] = input.readByte(); + n -= BYTE_COUNTS[bits[last] & 0xFF]; + } + } + }