diff --git a/metastore/src/gen/protobuf/gen-java/org/apache/hadoop/hive/metastore/hbase/HbaseMetastoreProto.java b/metastore/src/gen/protobuf/gen-java/org/apache/hadoop/hive/metastore/hbase/HbaseMetastoreProto.java
index 03e492ec3d..7011f6ff4b 100644
--- a/metastore/src/gen/protobuf/gen-java/org/apache/hadoop/hive/metastore/hbase/HbaseMetastoreProto.java
+++ b/metastore/src/gen/protobuf/gen-java/org/apache/hadoop/hive/metastore/hbase/HbaseMetastoreProto.java
@@ -1056,19 +1056,15 @@ public AggrStatsBloomFilter parsePartialFrom(
      */
     int getNumFuncs();
 
-    // repeated int64 bits = 3;
+    // required bytes bits = 3;
     /**
-     * <code>repeated int64 bits = 3;</code>
+     * <code>required bytes bits = 3;</code>
      */
-    java.util.List<java.lang.Long> getBitsList();
+    boolean hasBits();
     /**
-     * <code>repeated int64 bits = 3;</code>
+     * <code>required bytes bits = 3;</code>
      */
-    int getBitsCount();
-    /**
-     * <code>repeated int64 bits = 3;</code>
-     */
-    long getBits(int index);
+    com.google.protobuf.ByteString getBits();
   }
   /**
    * Protobuf type {@code org.apache.hadoop.hive.metastore.hbase.AggrStatsBloomFilter.BloomFilter}
@@ -1131,25 +1127,9 @@ private BloomFilter(
               numFuncs_ = input.readInt32();
               break;
             }
-            case 24: {
-              if (!((mutable_bitField0_ & 0x00000004) == 0x00000004)) {
-                bits_ = new java.util.ArrayList<java.lang.Long>();
-                mutable_bitField0_ |= 0x00000004;
-              }
-              bits_.add(input.readInt64());
-              break;
-            }
             case 26: {
-              int length = input.readRawVarint32();
-              int limit = input.pushLimit(length);
-              if (!((mutable_bitField0_ & 0x00000004) == 0x00000004) && input.getBytesUntilLimit() > 0) {
-                bits_ = new java.util.ArrayList<java.lang.Long>();
-                mutable_bitField0_ |= 0x00000004;
-              }
-              while (input.getBytesUntilLimit() > 0) {
-                bits_.add(input.readInt64());
-              }
-              input.popLimit(limit);
+              bitField0_ |= 0x00000004;
+              bits_ = input.readBytes();
               break;
             }
           }
@@ -1160,9 +1140,6 @@ private BloomFilter(
         throw new com.google.protobuf.InvalidProtocolBufferException(
             e.getMessage()).setUnfinishedMessage(this);
       } finally {
-        if (((mutable_bitField0_ & 0x00000004) == 0x00000004)) {
-          bits_ = java.util.Collections.unmodifiableList(bits_);
-        }
         this.unknownFields = unknownFields.build();
         makeExtensionsImmutable();
       }
@@ -1227,33 +1204,26 @@ public int getNumFuncs() {
       return numFuncs_;
     }
 
-    // repeated int64 bits = 3;
+    // required bytes bits = 3;
     public static final int BITS_FIELD_NUMBER = 3;
-    private java.util.List<java.lang.Long> bits_;
-    /**
-     * <code>repeated int64 bits = 3;</code>
-     */
-    public java.util.List<java.lang.Long>
-        getBitsList() {
-      return bits_;
-    }
+    private com.google.protobuf.ByteString bits_;
     /**
-     * <code>repeated int64 bits = 3;</code>
+     * <code>required bytes bits = 3;</code>
      */
-    public int getBitsCount() {
-      return bits_.size();
+    public boolean hasBits() {
+      return ((bitField0_ & 0x00000004) == 0x00000004);
     }
     /**
-     * <code>repeated int64 bits = 3;</code>
+     * <code>required bytes bits = 3;</code>
      */
-    public long getBits(int index) {
-      return bits_.get(index);
+    public com.google.protobuf.ByteString getBits() {
+      return bits_;
     }
 
     private void initFields() {
       numBits_ = 0;
       numFuncs_ = 0;
-      bits_ = java.util.Collections.emptyList();
+      bits_ = com.google.protobuf.ByteString.EMPTY;
     }
     private byte memoizedIsInitialized = -1;
     public final boolean isInitialized() {
@@ -1268,6 +1238,10 @@ public final boolean isInitialized() {
         memoizedIsInitialized = 0;
         return false;
       }
+      if (!hasBits()) {
+        memoizedIsInitialized = 0;
+        return false;
+      }
       memoizedIsInitialized = 1;
       return true;
     }
@@ -1281,8 +1255,8 @@ public void writeTo(com.google.protobuf.CodedOutputStream output)
       if (((bitField0_ & 0x00000002) == 0x00000002)) {
         output.writeInt32(2, numFuncs_);
       }
-      for (int i = 0; i < bits_.size(); i++) {
-        output.writeInt64(3, bits_.get(i));
+      if (((bitField0_ & 0x00000004) == 0x00000004)) {
+        output.writeBytes(3, bits_);
       }
       getUnknownFields().writeTo(output);
     }
@@ -1301,14 +1275,9 @@ public int getSerializedSize() {
         size += com.google.protobuf.CodedOutputStream
           .computeInt32Size(2, numFuncs_);
       }
-      {
-        int dataSize = 0;
-        for (int i = 0; i < bits_.size(); i++) {
-          dataSize += com.google.protobuf.CodedOutputStream
-            .computeInt64SizeNoTag(bits_.get(i));
-        }
-        size += dataSize;
-        size += 1 * getBitsList().size();
+      if (((bitField0_ & 0x00000004) == 0x00000004)) {
+        size += com.google.protobuf.CodedOutputStream
+          .computeBytesSize(3, bits_);
       }
       size += getUnknownFields().getSerializedSize();
       memoizedSerializedSize = size;
@@ -1430,7 +1399,7 @@ public Builder clear() {
         bitField0_ = (bitField0_ & ~0x00000001);
         numFuncs_ = 0;
         bitField0_ = (bitField0_ & ~0x00000002);
-        bits_ = java.util.Collections.emptyList();
+        bits_ = com.google.protobuf.ByteString.EMPTY;
         bitField0_ = (bitField0_ & ~0x00000004);
         return this;
       }
@@ -1468,9 +1437,8 @@ public Builder clone() {
           to_bitField0_ |= 0x00000002;
         }
         result.numFuncs_ = numFuncs_;
-        if (((bitField0_ & 0x00000004) == 0x00000004)) {
-          bits_ = java.util.Collections.unmodifiableList(bits_);
-          bitField0_ = (bitField0_ & ~0x00000004);
+        if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
+          to_bitField0_ |= 0x00000004;
         }
         result.bits_ = bits_;
         result.bitField0_ = to_bitField0_;
@@ -1495,15 +1463,8 @@ public Builder mergeFrom(org.apache.hadoop.hive.metastore.hbase.HbaseMetastorePr
         if (other.hasNumFuncs()) {
           setNumFuncs(other.getNumFuncs());
         }
-        if (!other.bits_.isEmpty()) {
-          if (bits_.isEmpty()) {
-            bits_ = other.bits_;
-            bitField0_ = (bitField0_ & ~0x00000004);
-          } else {
-            ensureBitsIsMutable();
-            bits_.addAll(other.bits_);
-          }
-          onChanged();
+        if (other.hasBits()) {
+          setBits(other.getBits());
         }
         this.mergeUnknownFields(other.getUnknownFields());
         return this;
@@ -1518,6 +1479,10 @@ public final boolean isInitialized() {
 
           return false;
         }
+        if (!hasBits()) {
+
+          return false;
+        }
         return true;
       }
@@ -1606,68 +1571,38 @@ public Builder clearNumFuncs() {
         return this;
       }
 
-      // repeated int64 bits = 3;
-      private java.util.List<java.lang.Long> bits_ = java.util.Collections.emptyList();
-      private void ensureBitsIsMutable() {
-        if (!((bitField0_ & 0x00000004) == 0x00000004)) {
-          bits_ = new java.util.ArrayList<java.lang.Long>(bits_);
-          bitField0_ |= 0x00000004;
-        }
-      }
-      /**
-       * <code>repeated int64 bits = 3;</code>
-       */
-      public java.util.List<java.lang.Long>
-          getBitsList() {
-        return java.util.Collections.unmodifiableList(bits_);
-      }
+      // required bytes bits = 3;
+      private com.google.protobuf.ByteString bits_ = com.google.protobuf.ByteString.EMPTY;
       /**
-       * <code>repeated int64 bits = 3;</code>
+       * <code>required bytes bits = 3;</code>
        */
-      public int getBitsCount() {
-        return bits_.size();
-      }
-      /**
-       * <code>repeated int64 bits = 3;</code>
-       */
-      public long getBits(int index) {
-        return bits_.get(index);
-      }
-      /**
-       * <code>repeated int64 bits = 3;</code>
-       */
-      public Builder setBits(
-          int index, long value) {
-        ensureBitsIsMutable();
-        bits_.set(index, value);
-        onChanged();
-        return this;
+      public boolean hasBits() {
+        return ((bitField0_ & 0x00000004) == 0x00000004);
       }
       /**
-       * <code>repeated int64 bits = 3;</code>
+       * <code>required bytes bits = 3;</code>
        */
-      public Builder addBits(long value) {
-        ensureBitsIsMutable();
-        bits_.add(value);
-        onChanged();
-        return this;
+      public com.google.protobuf.ByteString getBits() {
+        return bits_;
       }
       /**
-       * <code>repeated int64 bits = 3;</code>
+       * <code>required bytes bits = 3;</code>
        */
-      public Builder addAllBits(
-          java.lang.Iterable<? extends java.lang.Long> values) {
-        ensureBitsIsMutable();
-        super.addAll(values, bits_);
+      public Builder setBits(com.google.protobuf.ByteString value) {
+        if (value == null) {
+          throw new NullPointerException();
+        }
+        bitField0_ |= 0x00000004;
+        bits_ = value;
         onChanged();
         return this;
       }
       /**
-       * <code>repeated int64 bits = 3;</code>
+       * <code>required bytes bits = 3;</code>
        */
       public Builder clearBits() {
-        bits_ = java.util.Collections.emptyList();
         bitField0_ = (bitField0_ & ~0x00000004);
+        bits_ = getDefaultInstance().getBits();
         onChanged();
         return this;
       }
@@ -41545,7 +41480,7 @@ public Builder removeFks(int index) {
       "hive.metastore.hbase.AggrStatsBloomFilte" +
       "r.BloomFilter\022\025\n\raggregated_at\030\004 \002(\003\032@\n\013" +
       "BloomFilter\022\020\n\010num_bits\030\001 \002(\005\022\021\n\tnum_fun",
-      "cs\030\002 \002(\005\022\014\n\004bits\030\003 \003(\003\"\357\001\n\032AggrStatsInva" +
+      "cs\030\002 \002(\005\022\014\n\004bits\030\003 \002(\014\"\357\001\n\032AggrStatsInva" +
       "lidatorFilter\022_\n\rto_invalidate\030\001 \003(\0132H.o" +
       "rg.apache.hadoop.hive.metastore.hbase.Ag" +
       "grStatsInvalidatorFilter.Entry\022\021\n\trun_ev" +
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/AggrStatsInvalidatorFilter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/AggrStatsInvalidatorFilter.java
index 4ca4229acd..6ae0266843 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/AggrStatsInvalidatorFilter.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/AggrStatsInvalidatorFilter.java
@@ -101,10 +101,11 @@ public ReturnCode filterKeyValue(Cell cell) throws IOException {
           entry.getTableName().equals(fromCol.getTableName())) {
         if (bloom == null) {
           // Now, reconstitute the bloom filter and probe it with each of our partition names
-          bloom = new BloomFilter(
-              fromCol.getBloomFilter().getBitsList(),
-              fromCol.getBloomFilter().getNumBits(),
-              fromCol.getBloomFilter().getNumFuncs());
+
+          bloom = new BloomFilter(
+              fromCol.getBloomFilter().getBits().toByteArray(),
+              fromCol.getBloomFilter().getNumBits(),
+              fromCol.getBloomFilter().getNumFuncs());
         }
         if (bloom.test(entry.getPartName().toByteArray())) {
           // This is most likely a match, so mark it and quit looking.
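For readers new to the library, here is a minimal, self-contained sketch (not part of the patch) of the byte-level round trip carried by the new required bytes "bits" field: a RoaringBitmap is serialized into the proto message on the writer side and reconstituted on the reader side, mirroring HBaseUtils.serializeBloomFilter below and the filter reconstruction above. The class name, the sample values, and the standalone main() are illustrative assumptions only; the proto builder calls come from this diff and the RoaringBitmap calls from version 0.6.42 as added in the pom files.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import com.google.protobuf.ByteString;
import org.apache.hadoop.hive.metastore.hbase.HbaseMetastoreProto;
import org.roaringbitmap.RoaringBitmap;

public class RoaringBitsRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // Writer side: serialize a RoaringBitmap to a byte[], as the patched getBitSet() does.
    RoaringBitmap bitmap = RoaringBitmap.bitmapOf(1, 64, 10000);
    bitmap.runOptimize();
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    bitmap.serialize(new DataOutputStream(bytes));

    // Carry the bytes in the new bits field; the num_bits/num_funcs values are arbitrary here.
    HbaseMetastoreProto.AggrStatsBloomFilter.BloomFilter proto =
        HbaseMetastoreProto.AggrStatsBloomFilter.BloomFilter.newBuilder()
            .setNumBits(10048)
            .setNumFuncs(4)
            .setBits(ByteString.copyFrom(bytes.toByteArray()))
            .build();

    // Reader side: reconstitute the bitmap from the proto bytes, as the byte[]-based
    // BloomFilter constructor does internally via RoaringBitmap.deserialize().
    RoaringBitmap restored = new RoaringBitmap();
    restored.deserialize(new DataInputStream(
        new ByteArrayInputStream(proto.getBits().toByteArray())));
    System.out.println(restored.contains(10000));  // prints true
  }
}

The compact serialized form produced after runOptimize() is presumably also why the expected sizeInBytes() values in TestBloomFilter drop from 7800 to roughly a hundred bytes in the hunks further down.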
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/HBaseUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/HBaseUtils.java
index 94087b164e..6a97a43a27 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/HBaseUtils.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/HBaseUtils.java
@@ -1214,14 +1214,13 @@ static StorageDescriptorParts deserializeIndex(String dbName, String origTableNa
   }
 
   static byte[] serializeBloomFilter(String dbName, String tableName, BloomFilter bloom) {
-    long[] bitSet = bloom.getBitSet();
-    List<Long> bits = new ArrayList<>(bitSet.length);
-    for (int i = 0; i < bitSet.length; i++) bits.add(bitSet[i]);
+    byte[] bits = bloom.getBitSet();
+    ByteString bstr = ByteString.copyFrom(bits);
     HbaseMetastoreProto.AggrStatsBloomFilter.BloomFilter protoBloom =
         HbaseMetastoreProto.AggrStatsBloomFilter.BloomFilter.newBuilder()
             .setNumBits(bloom.getBitSize())
             .setNumFuncs(bloom.getNumHashFunctions())
-            .addAllBits(bits)
+            .setBits(bstr)
             .build();
 
     HbaseMetastoreProto.AggrStatsBloomFilter proto =
diff --git a/metastore/src/protobuf/org/apache/hadoop/hive/metastore/hbase/hbase_metastore_proto.proto b/metastore/src/protobuf/org/apache/hadoop/hive/metastore/hbase/hbase_metastore_proto.proto
index 6499ac6d23..9cee8cfd0f 100644
--- a/metastore/src/protobuf/org/apache/hadoop/hive/metastore/hbase/hbase_metastore_proto.proto
+++ b/metastore/src/protobuf/org/apache/hadoop/hive/metastore/hbase/hbase_metastore_proto.proto
@@ -31,7 +31,7 @@ message AggrStatsBloomFilter {
   message BloomFilter {
     required int32 num_bits = 1;
     required int32 num_funcs = 2;
-    repeated int64 bits = 3;
+    required bytes bits = 3;
   }
   required bytes db_name = 1;
   required bytes table_name = 2;
diff --git a/pom.xml b/pom.xml
index 5ec6befee6..d470423a26 100644
--- a/pom.xml
+++ b/pom.xml
@@ -187,7 +187,7 @@
     1.0.1
     1.7.10
     4.0.4
-    2.3.0-SNAPSHOT
+    3.0.0-SNAPSHOT
     0.8.4
     0.90.2-incubating
     2.2.0
@@ -850,6 +850,11 @@
         <artifactId>jamon-runtime</artifactId>
         <version>${jamon-runtime.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.roaringbitmap</groupId>
+        <artifactId>RoaringBitmap</artifactId>
+        <version>0.6.42</version>
+      </dependency>
diff --git a/storage-api/pom.xml b/storage-api/pom.xml
index d0bf08813e..a6c3b4700d 100644
--- a/storage-api/pom.xml
+++ b/storage-api/pom.xml
@@ -115,6 +115,11 @@
       <version>${junit.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.roaringbitmap</groupId>
+      <artifactId>RoaringBitmap</artifactId>
+      <version>0.6.42</version>
+    </dependency>
diff --git a/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java b/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
index e9f419dcbe..178172f57f 100644
--- a/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
+++ b/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
@@ -22,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import org.roaringbitmap.RoaringBitmap;
 
 /**
  * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are
BloomFilters are @@ -43,7 +44,7 @@ */ public class BloomFilter { public static final double DEFAULT_FPP = 0.05; - protected BitSet bitSet; + protected RoaringBitmap bitMap; protected int numBits; protected int numHashFunctions; @@ -67,7 +68,7 @@ public BloomFilter(long expectedEntries, double fpp) { // make 'm' multiple of 64 this.numBits = nb + (Long.SIZE - (nb % Long.SIZE)); this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits); - this.bitSet = new BitSet(numBits); + this.bitMap = new RoaringBitmap(); } /** @@ -76,14 +77,37 @@ public BloomFilter(long expectedEntries, double fpp) { * @param numBits * @param numFuncs */ - public BloomFilter(List bits, int numBits, int numFuncs) { + public BloomFilter(final byte[] bits, int numBits, int numFuncs) { super(); - long[] copied = new long[bits.size()]; - for (int i = 0; i < bits.size(); i++) copied[i] = bits.get(i); - bitSet = new BitSet(copied); - this.numBits = numBits; - numHashFunctions = numFuncs; - } + bitMap = new RoaringBitmap(); + try { + bitMap.deserialize( new DataInputStream(new InputStream () { + int idx = 0; + public int read() { + return bits[idx++]; + } + public int read(byte[] b) { + return read(b, 0, b.length); + } + public int read(byte[] b, int off, int l) { + for (int i = off; i < (off + l); i++) {b[i] = bits[idx++];} + return l; + } + })); + } catch (IOException e) { + throw new RuntimeException("unexpected error while deserializing from a byte array"); + } + this.numBits = numBits; + this.numHashFunctions = numFuncs; + } + + public BloomFilter(DataInputStream in, int numBits, int numFuncs) throws IOException { + super(); + bitMap = new RoaringBitmap(); + bitMap.deserialize(in); + this.numBits = numBits; + this.numHashFunctions = numFuncs; + } static int optimalNumOfHashFunctions(long n, long m) { return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); @@ -124,7 +148,7 @@ private void addHash(long hash64) { combinedHash = ~combinedHash; } int pos = combinedHash % numBits; - bitSet.set(pos); + bitMap.add(pos); } } @@ -168,7 +192,7 @@ private boolean testHash(long hash64) { combinedHash = ~combinedHash; } int pos = combinedHash % numBits; - if (!bitSet.get(pos)) { + if (!bitMap.contains(pos)) { return false; } } @@ -209,16 +233,35 @@ public long sizeInBytes() { } public int getBitSize() { - return bitSet.getData().length * Long.SIZE; + return bitMap.serializedSizeInBytes(); } public int getNumHashFunctions() { return numHashFunctions; } - public long[] getBitSet() { - return bitSet.getData(); - } + public byte[] getBitSet() { + bitMap.runOptimize(); + final byte[] ret = new byte[bitMap.serializedSizeInBytes()]; + try { + bitMap.serialize(new DataOutputStream(new OutputStream() { + int idx = 0; + public void close() {} + public void flush() {} + public void write(int b) { + ret[idx++] = ((byte) b);} + public void write(byte[] b) { + for (int i = 0; i < b.length; i++) {ret[idx++] = b[i];} + } + public void write(byte[] b, int off, int l) { + for (int i = off; i < (off + l); i++) {ret[idx++] = b[i];} + } + })); + } catch (IOException e) { + throw new RuntimeException("unexpected error while serializing a bit map"); + } + return ret; + } @Override public String toString() { @@ -232,7 +275,7 @@ public String toString() { */ public void merge(BloomFilter that) { if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) { - this.bitSet.putAll(that.bitSet); + this.bitMap.or(that.bitMap); } else { throw new IllegalArgumentException("BloomFilters are not 
     }
@@ -240,7 +283,7 @@ public void merge(BloomFilter that) {
   }
 
   public void reset() {
-    this.bitSet.clear();
+    this.bitMap.clear();
   }
 
   /**
@@ -258,9 +301,8 @@ public static void serialize(OutputStream out, BloomFilter bloomFilter) throws I
     DataOutputStream dataOutputStream = new DataOutputStream(out);
     dataOutputStream.writeByte(bloomFilter.numHashFunctions);
     dataOutputStream.writeInt(bloomFilter.numBits);
-    for (long value : bloomFilter.getBitSet()) {
-      dataOutputStream.writeLong(value);
-    }
+    bloomFilter.bitMap.runOptimize();
+    bloomFilter.bitMap.serialize(dataOutputStream);
   }
 
   /**
@@ -280,11 +322,7 @@ public static BloomFilter deserialize(InputStream in) throws IOException {
       int numHashFunc = dataInputStream.readByte();
       int numBits = dataInputStream.readInt();
       int sz = (numBits/Long.SIZE);
-      List<Long> data = new ArrayList<Long>();
-      for (int i = 0; i < sz; i++) {
-        data.add(dataInputStream.readLong());
-      }
-      return new BloomFilter(data, numBits, numHashFunc);
+      return new BloomFilter(dataInputStream, numBits, numHashFunc);
     } catch (RuntimeException e) {
       IOException io = new IOException("Unable to deserialize BloomFilter");
       io.initCause(e);
@@ -321,80 +359,16 @@ public static void mergeBloomFilterBytes(
       }
     }
 
-    // Just bitwise-OR the bits together - size/# functions should be the same,
-    // rest of the data is serialized long values for the bitset which are supposed to be bitwise-ORed.
-    for (int idx = START_OF_SERIALIZED_LONGS; idx < bf1Length; ++idx) {
-      bf1Bytes[bf1Start + idx] |= bf2Bytes[bf2Start + idx];
-    }
-  }
-
-  /**
-   * Bare metal bit set implementation. For performance reasons, this implementation does not check
-   * for index bounds nor expand the bit set size if the specified index is greater than the size.
-   */
-  public class BitSet {
-    private final long[] data;
-
-    public BitSet(long bits) {
-      this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
-    }
-
-    /**
-     * Deserialize long array as bit set.
-     *
-     * @param data - bit array
-     */
-    public BitSet(long[] data) {
-      assert data.length > 0 : "data length is zero!";
-      this.data = data;
-    }
-
-    /**
-     * Sets the bit at specified index.
-     *
-     * @param index - position
-     */
-    public void set(int index) {
-      data[index >>> 6] |= (1L << index);
-    }
-
-    /**
-     * Returns true if the bit is set in the specified index.
-     *
-     * @param index - position
-     * @return - value at the bit position
-     */
-    public boolean get(int index) {
-      return (data[index >>> 6] & (1L << index)) != 0;
-    }
-
-    /**
-     * Number of bits
-     */
-    public long bitSize() {
-      return (long) data.length * Long.SIZE;
-    }
+    ByteArrayInputStream in1 = new ByteArrayInputStream(bf1Bytes);
+    ByteArrayInputStream in2 = new ByteArrayInputStream(bf2Bytes);
 
-    public long[] getData() {
-      return data;
-    }
-
-    /**
-     * Combines the two BitArrays using bitwise OR.
-     */
-    public void putAll(BitSet array) {
-      assert data.length == array.data.length :
-        "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
-      for (int i = 0; i < data.length; i++) {
-        data[i] |= array.data[i];
-      }
-    }
-
-    /**
-     * Clear the bit set.
-     */
-    public void clear() {
-      Arrays.fill(data, 0);
+    try {
+      BloomFilter bf1 = deserialize(in1);
+      BloomFilter bf2 = deserialize(in2);
+      bf1.merge(bf2);
+    } catch (IOException e) {
+      throw new IllegalArgumentException("Unable to deserialize BloomFilter");
+    }
   }
-  }
 }
+
diff --git a/storage-api/src/test/org/apache/hive/common/util/TestBloomFilter.java b/storage-api/src/test/org/apache/hive/common/util/TestBloomFilter.java
index e4ee93a014..a1ae201660 100644
--- a/storage-api/src/test/org/apache/hive/common/util/TestBloomFilter.java
+++ b/storage-api/src/test/org/apache/hive/common/util/TestBloomFilter.java
@@ -144,7 +144,7 @@ public void testBloomFilterBytes() {
     randVal[4] = 0;
     assertEquals(false, bf.test(randVal));
 
-    assertEquals(7800, bf.sizeInBytes());
+    assertEquals(105, bf.sizeInBytes());
   }
 
   @Test
@@ -190,7 +190,7 @@ public void testBloomFilterByte() {
     // most likely this value should not exist
     assertEquals(false, bf.testLong((byte) -120));
 
-    assertEquals(7800, bf.sizeInBytes());
+    assertEquals(74, bf.sizeInBytes());
   }
 
   @Test
@@ -236,7 +236,7 @@ public void testBloomFilterInt() {
     // most likely this value should not exist
     assertEquals(false, bf.testLong(-120));
 
-    assertEquals(7800, bf.sizeInBytes());
+    assertEquals(105, bf.sizeInBytes());
   }
 
   @Test
@@ -282,7 +282,7 @@ public void testBloomFilterLong() {
     // most likely this value should not exist
     assertEquals(false, bf.testLong(-120));
 
-    assertEquals(7800, bf.sizeInBytes());
+    assertEquals(105, bf.sizeInBytes());
   }
 
   @Test
@@ -328,7 +328,7 @@ public void testBloomFilterFloat() {
     // most likely this value should not exist
     assertEquals(false, bf.testDouble(-120.2f));
 
-    assertEquals(7800, bf.sizeInBytes());
+    assertEquals(105, bf.sizeInBytes());
   }
 
   @Test
@@ -374,7 +374,7 @@ public void testBloomFilterDouble() {
     // most likely this value should not exist
     assertEquals(false, bf.testDouble(-120.2d));
 
-    assertEquals(7800, bf.sizeInBytes());
+    assertEquals(105, bf.sizeInBytes());
   }
 
   @Test
@@ -420,7 +420,7 @@ public void testBloomFilterString() {
     // most likely this value should not exist
     assertEquals(false, bf.testString(Long.toString(-120)));
 
-    assertEquals(77944, bf.sizeInBytes());
+    assertEquals(115, bf.sizeInBytes());
   }
 
   @Test
@@ -547,12 +547,12 @@ public void testMergeBloomFilterBytesFailureCases() throws Exception {
     BloomFilter bf1 = new BloomFilter(1000);
     BloomFilter bf2 = new BloomFilter(200);
     // Create bloom filter with same number of bits, but different # hash functions
-    ArrayList<Long> bits = new ArrayList<Long>();
-    for (int idx = 0; idx < bf1.getBitSet().length; ++idx) {
-      bits.add(0L);
+    byte[] bf1Bits = bf1.getBitSet();
+    byte[] bits = new byte[bf1Bits.length];
+    for (int idx = 0; idx < bf1Bits.length; ++idx) {
+      bits[idx] = bf1Bits[idx];
     }
     BloomFilter bf3 = new BloomFilter(bits, bf1.getBitSize(), bf1.getNumHashFunctions() + 1);
-
     // Serialize to bytes
     ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
     BloomFilter.serialize(bytesOut, bf1);