diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 65ec1b9..ebe2341 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -167,7 +167,17 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { HiveConf.ConfVars.HIVE_TXN_MANAGER, HiveConf.ConfVars.HIVE_TXN_TIMEOUT, HiveConf.ConfVars.HIVE_TXN_MAX_OPEN_BATCH, - HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION + HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_ENABLED, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_SIZE, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_PARTITIONS, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_FPP, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_VARIANCE, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_TTL, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_WRITER_WAIT, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_READER_WAIT, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_FULL, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_CLEAN_UNTIL }; /** @@ -606,6 +616,29 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { "Choose whether dropping partitions with HCatClient pushes the partition-predicate to the metastore, " + "or drops partitions iteratively"), + METASTORE_AGGREGATE_STATS_CACHE_ENABLED("hive.metastore.aggregate.stats.cache.enabled", true, + "Whether aggregate stats caching is enabled or not."), + METASTORE_AGGREGATE_STATS_CACHE_SIZE("hive.metastore.aggregate.stats.cache.size", 10000, + "Maximum number of aggregate stats nodes that we will place in the metastore aggregate stats cache."), + METASTORE_AGGREGATE_STATS_CACHE_MAX_PARTITIONS("hive.metastore.aggregate.stats.cache.max.partitions", 10000, + "Maximum number of partitions that are aggregated per cache node."), + METASTORE_AGGREGATE_STATS_CACHE_FPP("hive.metastore.aggregate.stats.cache.fpp", (float) 0.01, + "Maximum false positive probability for the Bloom Filter used in each aggregate stats cache node (default 1%)."), + METASTORE_AGGREGATE_STATS_CACHE_MAX_VARIANCE("hive.metastore.aggregate.stats.cache.max.variance", (float) 0.01, + "Maximum tolerable variance in number of partitions between a cached node and our request (default 1%)."), + METASTORE_AGGREGATE_STATS_CACHE_TTL("hive.metastore.aggregate.stats.cache.ttl", "600s", new TimeValidator(TimeUnit.SECONDS), + "Number of seconds for a cached node to be active in the cache before they become stale."), + METASTORE_AGGREGATE_STATS_CACHE_MAX_WRITER_WAIT("hive.metastore.aggregate.stats.cache.max.writer.wait", "5000ms", + new TimeValidator(TimeUnit.MILLISECONDS), + "Number of milliseconds a writer will wait to acquire the writelock before giving up."), + METASTORE_AGGREGATE_STATS_CACHE_MAX_READER_WAIT("hive.metastore.aggregate.stats.cache.max.reader.wait", "1000ms", + new TimeValidator(TimeUnit.MILLISECONDS), + "Number of milliseconds a reader will wait to acquire the readlock before giving up."), + METASTORE_AGGREGATE_STATS_CACHE_MAX_FULL("hive.metastore.aggregate.stats.cache.max.full", (float) 0.9, + "Maximum cache full % after which the cache cleaner thread kicks in."), + METASTORE_AGGREGATE_STATS_CACHE_CLEAN_UNTIL("hive.metastore.aggregate.stats.cache.clean.until", (float) 0.8, + "The cleaner thread cleans until cache reaches this % full size."), + // Parameters for 
exporting metadata on table drop (requires the use of the) // org.apache.hadoop.hive.ql.parse.MetaDataExportListener preevent listener METADATA_EXPORT_LOCATION("hive.metadata.export.location", "", diff --git a/common/src/java/org/apache/hive/common/util/BloomFilter.java b/common/src/java/org/apache/hive/common/util/BloomFilter.java new file mode 100644 index 0000000..656ba8a --- /dev/null +++ b/common/src/java/org/apache/hive/common/util/BloomFilter.java @@ -0,0 +1,291 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +import static com.google.common.base.Preconditions.checkArgument; + +import java.util.Arrays; + +/** + * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are + * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of + * bloom filter false positive (element not present in bloom filter but test() says true) are + * possible but false negatives are not possible (if element is present then test() will never + * say false). The false positive probability is configurable (default: 5%) depending on which + * storage requirement may increase or decrease. Lower the false positive probability greater + * is the space requirement. + * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. + * During the creation of bloom filter expected number of entries must be specified. If the number + * of insertions exceed the specified initial number of entries then false positive probability will + * increase accordingly. + * + * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash + * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash + * collisions for specific sequence of repeating bytes. 
Check the following link for more info + * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw + */ +public class BloomFilter { + public static final double DEFAULT_FPP = 0.05; + protected BitSet bitSet; + protected int numBits; + protected int numHashFunctions; + + public BloomFilter() { + } + + public BloomFilter(long expectedEntries) { + this(expectedEntries, DEFAULT_FPP); + } + + public BloomFilter(long expectedEntries, double fpp) { + checkArgument(expectedEntries > 0, "expectedEntries should be > 0"); + checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0"); + int nb = optimalNumOfBits(expectedEntries, fpp); + // make 'm' multiple of 64 + this.numBits = nb + (Long.SIZE - (nb % Long.SIZE)); + this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits); + this.bitSet = new BitSet(numBits); + } + + static int optimalNumOfHashFunctions(long n, long m) { + return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); + } + + static int optimalNumOfBits(long n, double p) { + return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); + } + + public void add(byte[] val) { + if (val == null) { + addBytes(val, -1); + } else { + addBytes(val, val.length); + } + } + + public void addBytes(byte[] val, int length) { + // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" + // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively + // implement a Bloom filter without any loss in the asymptotic false positive probability' + + // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned + // in the above paper + long hash64 = val == null ? Murmur3.NULL_HASHCODE : Murmur3.hash64(val, length); + addHash(hash64); + } + + private void addHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + bitSet.set(pos); + } + } + + public void addString(String val) { + if (val == null) { + add(null); + } else { + add(val.getBytes()); + } + } + + public void addLong(long val) { + addHash(getLongHash(val)); + } + + public void addDouble(double val) { + addLong(Double.doubleToLongBits(val)); + } + + public boolean test(byte[] val) { + if (val == null) { + return testBytes(val, -1); + } + return testBytes(val, val.length); + } + + public boolean testBytes(byte[] val, int length) { + long hash64 = val == null ? 
Murmur3.NULL_HASHCODE : Murmur3.hash64(val, length); + return testHash(hash64); + } + + private boolean testHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + if (!bitSet.get(pos)) { + return false; + } + } + return true; + } + + public boolean testString(String val) { + if (val == null) { + return test(null); + } else { + return test(val.getBytes()); + } + } + + public boolean testLong(long val) { + return testHash(getLongHash(val)); + } + + // Thomas Wang's integer hash function + // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm + private long getLongHash(long key) { + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ (key >> 28); + key = key + (key << 31); + return key; + } + + public boolean testDouble(double val) { + return testLong(Double.doubleToLongBits(val)); + } + + public long sizeInBytes() { + return getBitSize() / 8; + } + + public int getBitSize() { + return bitSet.getData().length * Long.SIZE; + } + + public int getNumHashFunctions() { + return numHashFunctions; + } + + public long[] getBitSet() { + return bitSet.getData(); + } + + @Override + public String toString() { + return "m: " + numBits + " k: " + numHashFunctions; + } + + /** + * Merge the specified bloom filter with current bloom filter. + * + * @param that - bloom filter to merge + */ + public void merge(BloomFilter that) { + if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) { + this.bitSet.putAll(that.bitSet); + } else { + throw new IllegalArgumentException("BloomFilters are not compatible for merging." + + " this - " + this.toString() + " that - " + that.toString()); + } + } + + public void reset() { + this.bitSet.clear(); + } + + /** + * Bare metal bit set implementation. For performance reasons, this implementation does not check + * for index bounds nor expand the bit set size if the specified index is greater than the size. + */ + public class BitSet { + private final long[] data; + + public BitSet(long bits) { + this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]); + } + + /** + * Deserialize long array as bit set. + * + * @param data - bit array + */ + public BitSet(long[] data) { + assert data.length > 0 : "data length is zero!"; + this.data = data; + } + + /** + * Sets the bit at specified index. + * + * @param index - position + */ + public void set(int index) { + data[index >>> 6] |= (1L << index); + } + + /** + * Returns true if the bit is set in the specified index. + * + * @param index - position + * @return - value at the bit position + */ + public boolean get(int index) { + return (data[index >>> 6] & (1L << index)) != 0; + } + + /** + * Number of bits + */ + public long bitSize() { + return (long) data.length * Long.SIZE; + } + + public long[] getData() { + return data; + } + + /** + * Combines the two BitArrays using bitwise OR. 
+ */ + public void putAll(BitSet array) { + assert data.length == array.data.length : + "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")"; + for (int i = 0; i < data.length; i++) { + data[i] |= array.data[i]; + } + } + + /** + * Clear the bit set. + */ + public void clear() { + Arrays.fill(data, 0); + } + } +} diff --git a/common/src/java/org/apache/hive/common/util/Murmur3.java b/common/src/java/org/apache/hive/common/util/Murmur3.java new file mode 100644 index 0000000..087407a --- /dev/null +++ b/common/src/java/org/apache/hive/common/util/Murmur3.java @@ -0,0 +1,334 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +/** + * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms. + * + * Murmur3 32 and 128 bit variants. + * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 + * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 + * + * This is a public domain code with no copyrights. + * From homepage of MurmurHash (https://code.google.com/p/smhasher/), + * "All MurmurHash versions are public domain software, and the author disclaims all copyright + * to their code." + */ +public class Murmur3 { + // from 64-bit linear congruential generator + public static final long NULL_HASHCODE = 2862933555777941757L; + + // Constants for 32 bit variant + private static final int C1_32 = 0xcc9e2d51; + private static final int C2_32 = 0x1b873593; + private static final int R1_32 = 15; + private static final int R2_32 = 13; + private static final int M_32 = 5; + private static final int N_32 = 0xe6546b64; + + // Constants for 128 bit variant + private static final long C1 = 0x87c37b91114253d5L; + private static final long C2 = 0x4cf5ad432745937fL; + private static final int R1 = 31; + private static final int R2 = 27; + private static final int R3 = 33; + private static final int M = 5; + private static final int N1 = 0x52dce729; + private static final int N2 = 0x38495ab5; + + private static final int DEFAULT_SEED = 104729; + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static int hash32(byte[] data) { + return hash32(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default 0) + * @return - hashcode + */ + public static int hash32(byte[] data, int length, int seed) { + int hash = seed; + final int nblocks = length >> 2; + + // body + for (int i = 0; i < nblocks; i++) { + int i_4 = i << 2; + int k = (data[i_4] & 0xff) + | ((data[i_4 + 1] & 0xff) << 8) + | ((data[i_4 + 2] & 0xff) << 16) + | ((data[i_4 + 3] & 0xff) << 24); + + // mix functions + k *= C1_32; + k = Integer.rotateLeft(k, R1_32); + k *= C2_32; + hash ^= k; + hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; + } + + // tail + int idx = nblocks << 2; + int k1 = 0; + switch (length - idx) { + case 3: + k1 ^= data[idx + 2] << 16; + case 2: + k1 ^= data[idx + 1] << 8; + case 1: + k1 ^= data[idx]; + + // mix functions + k1 *= C1_32; + k1 = Integer.rotateLeft(k1, R1_32); + k1 *= C2_32; + hash ^= k1; + } + + // finalization + hash ^= length; + hash ^= (hash >>> 16); + hash *= 0x85ebca6b; + hash ^= (hash >>> 13); + hash *= 0xc2b2ae35; + hash ^= (hash >>> 16); + + return hash; + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static long hash64(byte[] data) { + return hash64(data, data.length, DEFAULT_SEED); + } + + public static long hash64(byte[] data, int length) { + return hash64(data, length, DEFAULT_SEED); + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. (default is 0) + * @return - hashcode + */ + public static long hash64(byte[] data, int length, int seed) { + long hash = seed; + final int nblocks = length >> 3; + + // body + for (int i = 0; i < nblocks; i++) { + final int i8 = i << 3; + long k = ((long) data[i8] & 0xff) + | (((long) data[i8 + 1] & 0xff) << 8) + | (((long) data[i8 + 2] & 0xff) << 16) + | (((long) data[i8 + 3] & 0xff) << 24) + | (((long) data[i8 + 4] & 0xff) << 32) + | (((long) data[i8 + 5] & 0xff) << 40) + | (((long) data[i8 + 6] & 0xff) << 48) + | (((long) data[i8 + 7] & 0xff) << 56); + + // mix functions + k *= C1; + k = Long.rotateLeft(k, R1); + k *= C2; + hash ^= k; + hash = Long.rotateLeft(hash, R2) * M + N1; + } + + // tail + long k1 = 0; + int tailStart = nblocks << 3; + switch (length - tailStart) { + case 7: + k1 ^= ((long) data[tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= ((long) data[tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + hash ^= k1; + } + + // finalization + hash ^= length; + hash = fmix64(hash); + + return hash; + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data) { + return hash128(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default is 0) + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data, int length, int seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int i16 = i << 4; + long k1 = ((long) data[i16] & 0xff) + | (((long) data[i16 + 1] & 0xff) << 8) + | (((long) data[i16 + 2] & 0xff) << 16) + | (((long) data[i16 + 3] & 0xff) << 24) + | (((long) data[i16 + 4] & 0xff) << 32) + | (((long) data[i16 + 5] & 0xff) << 40) + | (((long) data[i16 + 6] & 0xff) << 48) + | (((long) data[i16 + 7] & 0xff) << 56); + + long k2 = ((long) data[i16 + 8] & 0xff) + | (((long) data[i16 + 9] & 0xff) << 8) + | (((long) data[i16 + 10] & 0xff) << 16) + | (((long) data[i16 + 11] & 0xff) << 24) + | (((long) data[i16 + 12] & 0xff) << 32) + | (((long) data[i16 + 13] & 0xff) << 40) + | (((long) data[i16 + 14] & 0xff) << 48) + | (((long) data[i16 + 15] & 0xff) << 56); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + int tailStart = nblocks << 4; + switch (length - tailStart) { + case 15: + k2 ^= (long) (data[tailStart + 14] & 0xff) << 48; + case 14: + k2 ^= (long) (data[tailStart + 13] & 0xff) << 40; + case 13: + k2 ^= (long) (data[tailStart + 12] & 0xff) << 32; + case 12: + k2 ^= (long) (data[tailStart + 11] & 0xff) << 24; + case 11: + k2 ^= (long) (data[tailStart + 10] & 0xff) << 16; + case 10: + k2 ^= (long) (data[tailStart + 9] & 0xff) << 8; + case 9: + k2 ^= (long) (data[tailStart + 8] & 0xff); + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= (long) (data[tailStart + 7] & 0xff) << 56; + case 7: + k1 ^= (long) (data[tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= (long) (data[tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= (long) (data[tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= (long) (data[tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= (long) (data[tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= (long) (data[tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= (long) (data[tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return new long[]{h1, h2}; + } + + private static long fmix64(long h) { + h ^= (h >>> 33); + h *= 0xff51afd7ed558ccdL; + h ^= (h >>> 33); + h *= 0xc4ceb9fe1a85ec53L; + h ^= (h >>> 33); + return h; + } +} diff --git a/common/src/test/org/apache/hive/common/util/TestBloomFilter.java b/common/src/test/org/apache/hive/common/util/TestBloomFilter.java new file mode 100644 index 0000000..7c2a941 --- /dev/null +++ b/common/src/test/org/apache/hive/common/util/TestBloomFilter.java @@ -0,0 +1,458 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +import static org.junit.Assert.assertEquals; + +import java.util.Random; + +import org.junit.Test; + +/** + * + */ +public class TestBloomFilter { + private static final int COUNT = 100; + Random rand = new Random(123); + + @Test(expected = IllegalArgumentException.class) + public void testBloomIllegalArg1() { + BloomFilter bf = new BloomFilter(0, 0); + } + + @Test(expected = IllegalArgumentException.class) + public void testBloomIllegalArg2() { + BloomFilter bf = new BloomFilter(0, 0.1); + } + + @Test(expected = IllegalArgumentException.class) + public void testBloomIllegalArg3() { + BloomFilter bf = new BloomFilter(1, 0.0); + } + + @Test(expected = IllegalArgumentException.class) + public void testBloomIllegalArg4() { + BloomFilter bf = new BloomFilter(1, 1.0); + } + + @Test(expected = IllegalArgumentException.class) + public void testBloomIllegalArg5() { + BloomFilter bf = new BloomFilter(-1, -1); + } + + + @Test + public void testBloomNumBits() { + assertEquals(0, BloomFilter.optimalNumOfBits(0, 0)); + assertEquals(0, BloomFilter.optimalNumOfBits(0, 1)); + assertEquals(0, BloomFilter.optimalNumOfBits(1, 1)); + assertEquals(7, BloomFilter.optimalNumOfBits(1, 0.03)); + assertEquals(72, BloomFilter.optimalNumOfBits(10, 0.03)); + assertEquals(729, BloomFilter.optimalNumOfBits(100, 0.03)); + assertEquals(7298, BloomFilter.optimalNumOfBits(1000, 0.03)); + assertEquals(72984, BloomFilter.optimalNumOfBits(10000, 0.03)); + assertEquals(729844, BloomFilter.optimalNumOfBits(100000, 0.03)); + assertEquals(7298440, BloomFilter.optimalNumOfBits(1000000, 0.03)); + assertEquals(6235224, BloomFilter.optimalNumOfBits(1000000, 0.05)); + } + + @Test + public void testBloomNumHashFunctions() { + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(-1, -1)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(0, 0)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(10, 0)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(10, 10)); + assertEquals(7, BloomFilter.optimalNumOfHashFunctions(10, 100)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(100, 100)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(1000, 100)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(10000, 100)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(100000, 100)); + assertEquals(1, BloomFilter.optimalNumOfHashFunctions(1000000, 100)); + } + + @Test + public void testBloomFilterBytes() { + BloomFilter bf = new BloomFilter(10000); + byte[] val = new byte[]{1, 2, 3}; + byte[] val1 = new byte[]{1, 2, 3, 4}; + byte[] val2 = new byte[]{1, 2, 3, 4, 5}; + byte[] val3 = new byte[]{1, 2, 3, 4, 5, 6}; + + assertEquals(false, bf.test(val)); + assertEquals(false, bf.test(val1)); + assertEquals(false, bf.test(val2)); + assertEquals(false, bf.test(val3)); + bf.add(val); + assertEquals(true, bf.test(val)); + assertEquals(false, bf.test(val1)); + assertEquals(false, bf.test(val2)); + assertEquals(false, bf.test(val3)); + bf.add(val1); + assertEquals(true, bf.test(val)); + assertEquals(true, bf.test(val1)); + assertEquals(false, 
bf.test(val2)); + assertEquals(false, bf.test(val3)); + bf.add(val2); + assertEquals(true, bf.test(val)); + assertEquals(true, bf.test(val1)); + assertEquals(true, bf.test(val2)); + assertEquals(false, bf.test(val3)); + bf.add(val3); + assertEquals(true, bf.test(val)); + assertEquals(true, bf.test(val1)); + assertEquals(true, bf.test(val2)); + assertEquals(true, bf.test(val3)); + + byte[] randVal = new byte[COUNT]; + for (int i = 0; i < COUNT; i++) { + rand.nextBytes(randVal); + bf.add(randVal); + } + // last value should be present + assertEquals(true, bf.test(randVal)); + // most likely this value should not exist + randVal[0] = 0; + randVal[1] = 0; + randVal[2] = 0; + randVal[3] = 0; + randVal[4] = 0; + assertEquals(false, bf.test(randVal)); + + assertEquals(7800, bf.sizeInBytes()); + } + + @Test + public void testBloomFilterByte() { + BloomFilter bf = new BloomFilter(10000); + byte val = Byte.MIN_VALUE; + byte val1 = 1; + byte val2 = 2; + byte val3 = Byte.MAX_VALUE; + + assertEquals(false, bf.testLong(val)); + assertEquals(false, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val); + assertEquals(true, bf.testLong(val)); + assertEquals(false, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val1); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val2); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(true, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val3); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(true, bf.testLong(val2)); + assertEquals(true, bf.testLong(val3)); + + byte randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = (byte) rand.nextInt(Byte.MAX_VALUE); + bf.addLong(randVal); + } + // last value should be present + assertEquals(true, bf.testLong(randVal)); + // most likely this value should not exist + assertEquals(false, bf.testLong((byte) -120)); + + assertEquals(7800, bf.sizeInBytes()); + } + + @Test + public void testBloomFilterInt() { + BloomFilter bf = new BloomFilter(10000); + int val = Integer.MIN_VALUE; + int val1 = 1; + int val2 = 2; + int val3 = Integer.MAX_VALUE; + + assertEquals(false, bf.testLong(val)); + assertEquals(false, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val); + assertEquals(true, bf.testLong(val)); + assertEquals(false, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val1); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val2); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(true, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val3); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(true, bf.testLong(val2)); + assertEquals(true, bf.testLong(val3)); + + int randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextInt(); + bf.addLong(randVal); + } + // last value should be present + assertEquals(true, bf.testLong(randVal)); + // most likely this value 
should not exist + assertEquals(false, bf.testLong(-120)); + + assertEquals(7800, bf.sizeInBytes()); + } + + @Test + public void testBloomFilterLong() { + BloomFilter bf = new BloomFilter(10000); + long val = Long.MIN_VALUE; + long val1 = 1; + long val2 = 2; + long val3 = Long.MAX_VALUE; + + assertEquals(false, bf.testLong(val)); + assertEquals(false, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val); + assertEquals(true, bf.testLong(val)); + assertEquals(false, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val1); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(false, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val2); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(true, bf.testLong(val2)); + assertEquals(false, bf.testLong(val3)); + bf.addLong(val3); + assertEquals(true, bf.testLong(val)); + assertEquals(true, bf.testLong(val1)); + assertEquals(true, bf.testLong(val2)); + assertEquals(true, bf.testLong(val3)); + + long randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextLong(); + bf.addLong(randVal); + } + // last value should be present + assertEquals(true, bf.testLong(randVal)); + // most likely this value should not exist + assertEquals(false, bf.testLong(-120)); + + assertEquals(7800, bf.sizeInBytes()); + } + + @Test + public void testBloomFilterFloat() { + BloomFilter bf = new BloomFilter(10000); + float val = Float.MIN_VALUE; + float val1 = 1.1f; + float val2 = 2.2f; + float val3 = Float.MAX_VALUE; + + assertEquals(false, bf.testDouble(val)); + assertEquals(false, bf.testDouble(val1)); + assertEquals(false, bf.testDouble(val2)); + assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val); + assertEquals(true, bf.testDouble(val)); + assertEquals(false, bf.testDouble(val1)); + assertEquals(false, bf.testDouble(val2)); + assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val1); + assertEquals(true, bf.testDouble(val)); + assertEquals(true, bf.testDouble(val1)); + assertEquals(false, bf.testDouble(val2)); + assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val2); + assertEquals(true, bf.testDouble(val)); + assertEquals(true, bf.testDouble(val1)); + assertEquals(true, bf.testDouble(val2)); + assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val3); + assertEquals(true, bf.testDouble(val)); + assertEquals(true, bf.testDouble(val1)); + assertEquals(true, bf.testDouble(val2)); + assertEquals(true, bf.testDouble(val3)); + + float randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextFloat(); + bf.addDouble(randVal); + } + // last value should be present + assertEquals(true, bf.testDouble(randVal)); + // most likely this value should not exist + assertEquals(false, bf.testDouble(-120.2f)); + + assertEquals(7800, bf.sizeInBytes()); + } + + @Test + public void testBloomFilterDouble() { + BloomFilter bf = new BloomFilter(10000); + double val = Double.MIN_VALUE; + double val1 = 1.1d; + double val2 = 2.2d; + double val3 = Double.MAX_VALUE; + + assertEquals(false, bf.testDouble(val)); + assertEquals(false, bf.testDouble(val1)); + assertEquals(false, bf.testDouble(val2)); + assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val); + assertEquals(true, bf.testDouble(val)); + assertEquals(false, bf.testDouble(val1)); + assertEquals(false, bf.testDouble(val2)); + 
assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val1); + assertEquals(true, bf.testDouble(val)); + assertEquals(true, bf.testDouble(val1)); + assertEquals(false, bf.testDouble(val2)); + assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val2); + assertEquals(true, bf.testDouble(val)); + assertEquals(true, bf.testDouble(val1)); + assertEquals(true, bf.testDouble(val2)); + assertEquals(false, bf.testDouble(val3)); + bf.addDouble(val3); + assertEquals(true, bf.testDouble(val)); + assertEquals(true, bf.testDouble(val1)); + assertEquals(true, bf.testDouble(val2)); + assertEquals(true, bf.testDouble(val3)); + + double randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextDouble(); + bf.addDouble(randVal); + } + // last value should be present + assertEquals(true, bf.testDouble(randVal)); + // most likely this value should not exist + assertEquals(false, bf.testDouble(-120.2d)); + + assertEquals(7800, bf.sizeInBytes()); + } + + @Test + public void testBloomFilterString() { + BloomFilter bf = new BloomFilter(100000); + String val = "bloo"; + String val1 = "bloom fil"; + String val2 = "bloom filter"; + String val3 = "cuckoo filter"; + + assertEquals(false, bf.testString(val)); + assertEquals(false, bf.testString(val1)); + assertEquals(false, bf.testString(val2)); + assertEquals(false, bf.testString(val3)); + bf.addString(val); + assertEquals(true, bf.testString(val)); + assertEquals(false, bf.testString(val1)); + assertEquals(false, bf.testString(val2)); + assertEquals(false, bf.testString(val3)); + bf.addString(val1); + assertEquals(true, bf.testString(val)); + assertEquals(true, bf.testString(val1)); + assertEquals(false, bf.testString(val2)); + assertEquals(false, bf.testString(val3)); + bf.addString(val2); + assertEquals(true, bf.testString(val)); + assertEquals(true, bf.testString(val1)); + assertEquals(true, bf.testString(val2)); + assertEquals(false, bf.testString(val3)); + bf.addString(val3); + assertEquals(true, bf.testString(val)); + assertEquals(true, bf.testString(val1)); + assertEquals(true, bf.testString(val2)); + assertEquals(true, bf.testString(val3)); + + long randVal = 0; + for (int i = 0; i < COUNT; i++) { + randVal = rand.nextLong(); + bf.addString(Long.toString(randVal)); + } + // last value should be present + assertEquals(true, bf.testString(Long.toString(randVal))); + // most likely this value should not exist + assertEquals(false, bf.testString(Long.toString(-120))); + + assertEquals(77944, bf.sizeInBytes()); + } + + @Test + public void testMerge() { + BloomFilter bf = new BloomFilter(10000); + String val = "bloo"; + String val1 = "bloom fil"; + String val2 = "bloom filter"; + String val3 = "cuckoo filter"; + bf.addString(val); + bf.addString(val1); + bf.addString(val2); + bf.addString(val3); + + BloomFilter bf2 = new BloomFilter(10000); + String v = "2_bloo"; + String v1 = "2_bloom fil"; + String v2 = "2_bloom filter"; + String v3 = "2_cuckoo filter"; + bf2.addString(v); + bf2.addString(v1); + bf2.addString(v2); + bf2.addString(v3); + + assertEquals(true, bf.testString(val)); + assertEquals(true, bf.testString(val1)); + assertEquals(true, bf.testString(val2)); + assertEquals(true, bf.testString(val3)); + assertEquals(false, bf.testString(v)); + assertEquals(false, bf.testString(v1)); + assertEquals(false, bf.testString(v2)); + assertEquals(false, bf.testString(v3)); + + bf.merge(bf2); + + assertEquals(true, bf.testString(val)); + assertEquals(true, bf.testString(val1)); + assertEquals(true, bf.testString(val2)); + assertEquals(true, 
bf.testString(val3)); + assertEquals(true, bf.testString(v)); + assertEquals(true, bf.testString(v1)); + assertEquals(true, bf.testString(v2)); + assertEquals(true, bf.testString(v3)); + } +} diff --git a/common/src/test/org/apache/hive/common/util/TestMurmur3.java b/common/src/test/org/apache/hive/common/util/TestMurmur3.java new file mode 100644 index 0000000..e506f71 --- /dev/null +++ b/common/src/test/org/apache/hive/common/util/TestMurmur3.java @@ -0,0 +1,189 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +import static org.junit.Assert.assertEquals; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; + +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Random; + +/** + * Tests for Murmur3 variants. + */ +public class TestMurmur3 { + + @Test + public void testHashCodesM3_32_string() { + String key = "test"; + int seed = 123; + HashFunction hf = Hashing.murmur3_32(seed); + int hc1 = hf.hashBytes(key.getBytes()).asInt(); + int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed); + assertEquals(hc1, hc2); + + key = "testkey"; + hc1 = hf.hashBytes(key.getBytes()).asInt(); + hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed); + assertEquals(hc1, hc2); + } + + @Test + public void testHashCodesM3_32_ints() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_32(seed); + for (int i = 0; i < 1000; i++) { + int val = rand.nextInt(); + byte[] data = ByteBuffer.allocate(4).putInt(val).array(); + int hc1 = hf.hashBytes(data).asInt(); + int hc2 = Murmur3.hash32(data, data.length, seed); + assertEquals(hc1, hc2); + } + } + + @Test + public void testHashCodesM3_32_longs() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_32(seed); + for (int i = 0; i < 1000; i++) { + long val = rand.nextLong(); + byte[] data = ByteBuffer.allocate(8).putLong(val).array(); + int hc1 = hf.hashBytes(data).asInt(); + int hc2 = Murmur3.hash32(data, data.length, seed); + assertEquals(hc1, hc2); + } + } + + @Test + public void testHashCodesM3_32_double() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_32(seed); + for (int i = 0; i < 1000; i++) { + double val = rand.nextDouble(); + byte[] data = ByteBuffer.allocate(8).putDouble(val).array(); + int hc1 = hf.hashBytes(data).asInt(); + int hc2 = Murmur3.hash32(data, data.length, seed); + assertEquals(hc1, hc2); + } + } + + @Test + public void testHashCodesM3_128_string() { + String key = "test"; + int seed = 123; + HashFunction hf = Hashing.murmur3_128(seed); + // guava stores the hashcodes in little endian order + ByteBuffer buf = 
ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(key.getBytes()).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(key.getBytes(), key.getBytes().length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + + key = "testkey128_testkey128"; + buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(key.getBytes()).asBytes()); + buf.flip(); + gl1 = buf.getLong(); + gl2 = buf.getLong(8); + hc = Murmur3.hash128(key.getBytes(), key.getBytes().length, seed); + m1 = hc[0]; + m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + } + + @Test + public void testHashCodesM3_128_ints() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_128(seed); + for (int i = 0; i < 1000; i++) { + int val = rand.nextInt(); + byte[] data = ByteBuffer.allocate(4).putInt(val).array(); + // guava stores the hashcodes in little endian order + ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(data).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(data, data.length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + } + } + + @Test + public void testHashCodesM3_128_longs() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_128(seed); + for (int i = 0; i < 1000; i++) { + long val = rand.nextLong(); + byte[] data = ByteBuffer.allocate(8).putLong(val).array(); + // guava stores the hashcodes in little endian order + ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(data).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(data, data.length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + } + } + + @Test + public void testHashCodesM3_128_double() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_128(seed); + for (int i = 0; i < 1000; i++) { + double val = rand.nextDouble(); + byte[] data = ByteBuffer.allocate(8).putDouble(val).array(); + // guava stores the hashcodes in little endian order + ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(data).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(data, data.length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + } + } +} diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/AggregateStatsCache.java b/metastore/src/java/org/apache/hadoop/hive/metastore/AggregateStatsCache.java new file mode 100644 index 0000000..6a85936 --- /dev/null +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/AggregateStatsCache.java @@ -0,0 +1,575 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hive.common.util.BloomFilter; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +public class AggregateStatsCache { + + private static final Log LOG = LogFactory.getLog(AggregateStatsCache.class.getName()); + private static AggregateStatsCache self = null; + + // Backing store for this cache + private final ConcurrentHashMap cacheStore; + // Cache size + private final int maxCacheNodes; + // Current nodes in the cache + private AtomicInteger currentNodes = new AtomicInteger(0); + // Run the cleaner thread when the cache is maxFull% full + private final float maxFull; + // Run the cleaner thread until cache is cleanUntil% occupied + private final float cleanUntil; + // Nodes go stale after this + private final long timeToLive; + // Max time when waiting for write locks on node list + private final long maxWriterWaitTime; + // Max time when waiting for read locks on node list + private final long maxReaderWaitTime; + // Maximum number of paritions aggregated per cache node + private final int maxPartsPerCacheNode; + // Bloom filter false positive probability + private final float falsePositiveProbability; + // Max tolerable variance for matches + private final float maxVariance; + // Used to determine if cleaner thread is already running + private boolean isCleaning = false; + private AtomicLong cacheHits = new AtomicLong(0); + private AtomicLong cacheMisses = new AtomicLong(0); + // To track cleaner metrics + int numRemovedTTL = 0, numRemovedLRU = 0; + + private AggregateStatsCache(int maxCacheNodes, int maxPartsPerCacheNode, long timeToLive, + float falsePositiveProbability, float maxVariance, long maxWriterWaitTime, + long maxReaderWaitTime, float maxFull, float cleanUntil) { + this.maxCacheNodes = maxCacheNodes; + this.maxPartsPerCacheNode = maxPartsPerCacheNode; + this.timeToLive = timeToLive; + this.falsePositiveProbability = falsePositiveProbability; + this.maxVariance = maxVariance; + this.maxWriterWaitTime = maxWriterWaitTime; + this.maxReaderWaitTime = maxReaderWaitTime; + this.maxFull = maxFull; + this.cleanUntil = cleanUntil; + this.cacheStore = new ConcurrentHashMap(); + } + + public static synchronized AggregateStatsCache getInstance(Configuration conf) { + if (self == null) { + int maxCacheNodes = + HiveConf.getIntVar(conf, HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_SIZE); + // The number of partitions aggregated per cache node + 
// If the number of partitions requested is > this value, we'll fetch directly from Metastore + int maxPartitionsPerCacheNode = + HiveConf + .getIntVar(conf, HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_PARTITIONS); + long timeToLive = + HiveConf.getTimeVar(conf, HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_TTL, + TimeUnit.SECONDS); + // False positives probability we are ready to tolerate for the underlying bloom filter + float falsePositiveProbability = + HiveConf.getFloatVar(conf, HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_FPP); + // Maximum tolerable variance in number of partitions between cached node and our request + float maxVariance = + HiveConf + .getFloatVar(conf, HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_VARIANCE); + long maxWriterWaitTime = + HiveConf.getTimeVar(conf, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_WRITER_WAIT, + TimeUnit.MILLISECONDS); + long maxReaderWaitTime = + HiveConf.getTimeVar(conf, + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_READER_WAIT, + TimeUnit.MILLISECONDS); + float maxFull = + HiveConf.getFloatVar(conf, HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_FULL); + float cleanUntil = + HiveConf.getFloatVar(conf, HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_CLEAN_UNTIL); + self = + new AggregateStatsCache(maxCacheNodes, maxPartitionsPerCacheNode, timeToLive, + falsePositiveProbability, maxVariance, maxWriterWaitTime, maxReaderWaitTime, maxFull, + cleanUntil); + } + return self; + } + + public int getMaxCacheNodes() { + return maxCacheNodes; + } + + public int getCurrentNodes() { + return currentNodes.intValue(); + } + + public float getFullPercent() { + return (currentNodes.intValue() / (float) maxCacheNodes) * 100; + } + + public int getMaxPartsPerCacheNode() { + return maxPartsPerCacheNode; + } + + public float getFalsePositiveProbability() { + return falsePositiveProbability; + } + + public Float getHitRatio() { + if (cacheHits.longValue() + cacheMisses.longValue() > 0) { + return (float) (cacheHits.longValue()) / (cacheHits.longValue() + cacheMisses.longValue()); + } + return null; + } + + /** + * Return aggregate stats for a column from the cache or null. + * While reading from the nodelist for a key, we wait maxReaderWaitTime to acquire the lock, + * failing which we return a cache miss (i.e. 
null) + * + * @param dbName + * @param tblName + * @param colName + * @param partNames + * @return + */ + public AggrColStats get(String dbName, String tblName, String colName, List partNames) { + // Cache key + Key key = new Key(dbName, tblName, colName); + AggrColStatsList candidateList = cacheStore.get(key); + // No key, or no nodes in candidate list + if ((candidateList == null) || (candidateList.nodes.size() == 0)) { + LOG.info("No aggregate stats cached for " + key.toString()); + return null; + } + // Find the value object + // Update the timestamp of the key,value if value matches the criteria + // Return the value + AggrColStats match = null; + boolean isLocked = false; + try { + // Try to readlock the candidateList; timeout after maxReaderWaitTime + isLocked = candidateList.readLock.tryLock(maxReaderWaitTime, TimeUnit.MILLISECONDS); + if (isLocked) { + match = findBestMatch(partNames, candidateList.nodes); + } + if (match != null) { + // Ok to not lock the list for this and use a volatile lastAccessTime instead + candidateList.updateLastAccessTime(); + cacheHits.incrementAndGet(); + LOG.info("Returning aggregate stats from the cache; total hits: " + cacheHits.longValue() + + ", total misses: " + cacheMisses.longValue() + ", hit ratio: " + getHitRatio()); + } + else { + cacheMisses.incrementAndGet(); + } + } catch (InterruptedException e) { + LOG.debug(e); + } finally { + if (isLocked) { + candidateList.readLock.unlock(); + } + } + return match; + } + + /** + * Find the best match using the configurable error tolerance and time to live value + * + * @param partNames + * @param candidates + * @return best matched node or null + */ + private AggrColStats findBestMatch(List partNames, List candidates) { + // Hits, misses, shouldSkip for a node + MatchStats matchStats; + // MatchStats for each candidate + Map candidateMatchStats = new HashMap(); + // The final match we intend to return + AggrColStats bestMatch = null; + // To compare among potentially multiple matches + int bestMatchHits = 0; + int numPartsRequested = partNames.size(); + // 1st pass at marking invalid candidates + // Checks based on variance and TTL + // Note: we're not creating a copy of the list for saving memory + for (AggrColStats candidate : candidates) { + // Variance check + if ((float) Math.abs((candidate.getNumPartsCached() - numPartsRequested) + / numPartsRequested) > maxVariance) { + continue; + } + // TTL check + if (isExpired(candidate)) { + continue; + } + else { + candidateMatchStats.put(candidate, new MatchStats(0, 0)); + } + } + // We'll count misses as we iterate + int maxMisses = (int) maxVariance * numPartsRequested; + for (String partName : partNames) { + for (AggrColStats candidate : candidates) { + matchStats = candidateMatchStats.get(candidate); + if (matchStats == null) { + continue; + } + if (candidate.getBloomFilter().test(partName.getBytes())) { + ++matchStats.hits; + } else { + ++matchStats.misses; + } + // 2nd pass at removing invalid candidates + // If misses so far exceed max tolerable misses + if (matchStats.misses > maxMisses) { + candidateMatchStats.remove(candidate); + continue; + } + // Check if this is the best match so far + if (matchStats.hits > bestMatchHits) { + bestMatch = candidate; + } + } + } + if (bestMatch != null) { + // Update the last access time for this node + bestMatch.updateLastAccessTime(); + } + return bestMatch; + } + + /** + * Add a new node to the cache; may trigger the cleaner thread if the cache is near full capacity. 
+ * We'll however add the node even if we temporaily exceed maxCacheNodes, because the cleaner + * will eventually create space from expired nodes or by removing LRU nodes. + * + * @param dbName + * @param tblName + * @param colName + * @param numPartsCached + * @param colStats + * @param bloomFilter + */ + // TODO: make add asynchronous: add shouldn't block the higher level calls + public void add(String dbName, String tblName, String colName, long numPartsCached, + ColumnStatisticsObj colStats, BloomFilter bloomFilter) { + // If we have no space in the cache, run cleaner thread + if (getCurrentNodes() / maxCacheNodes > maxFull) { + spawnCleaner(); + } + // Cache key + Key key = new Key(dbName, tblName, colName); + // Add new node to the cache + AggrColStats node = new AggrColStats(numPartsCached, bloomFilter, colStats); + AggrColStatsList nodeList; + AggrColStatsList newNodeList = new AggrColStatsList(); + newNodeList.nodes = new ArrayList(); + nodeList = cacheStore.putIfAbsent(key, newNodeList); + if (nodeList == null) { + nodeList = newNodeList; + } + boolean isLocked = false; + try { + isLocked = nodeList.writeLock.tryLock(maxWriterWaitTime, TimeUnit.MILLISECONDS); + if (isLocked) { + nodeList.nodes.add(node); + node.updateLastAccessTime(); + nodeList.updateLastAccessTime(); + currentNodes.getAndIncrement(); + } + } catch (InterruptedException e) { + LOG.debug(e); + } finally { + if (isLocked) { + nodeList.writeLock.unlock(); + } + } + } + + /** + * Cleans the expired nodes or removes LRU nodes of the cache, + * until the cache size reduces to cleanUntil% full. + */ + private void spawnCleaner() { + // This spawns a separate thread to walk through the cache and removes expired nodes. + // Only one cleaner thread should be running at any point. 
+ synchronized (this) { + if (isCleaning) { + return; + } + isCleaning = true; + } + Thread cleaner = new Thread("AggregateStatsCache-CleanerThread") { + @Override + public void run() { + numRemovedTTL = 0; + numRemovedLRU = 0; + long cleanerStartTime = System.currentTimeMillis(); + LOG.info("AggregateStatsCache is " + getFullPercent() + "% full, with " + + getCurrentNodes() + " nodes; starting cleaner thread"); + try { + Iterator> mapIterator = cacheStore.entrySet().iterator(); + while (mapIterator.hasNext()) { + Map.Entry pair = + (Map.Entry) mapIterator.next(); + AggrColStats node; + AggrColStatsList candidateList = (AggrColStatsList) pair.getValue(); + List nodes = candidateList.nodes; + if (nodes.size() == 0) { + mapIterator.remove(); + continue; + } + boolean isLocked = false; + try { + isLocked = candidateList.writeLock.tryLock(maxWriterWaitTime, TimeUnit.MILLISECONDS); + if (isLocked) { + for (Iterator listIterator = nodes.iterator(); listIterator.hasNext();) { + node = listIterator.next(); + // Remove the node if it has expired + if (isExpired(node)) { + listIterator.remove(); + numRemovedTTL++; + currentNodes.getAndDecrement(); + } + } + } + } catch (InterruptedException e) { + LOG.debug(e); + } finally { + if (isLocked) { + candidateList.writeLock.unlock(); + } + } + // We want to make sure this runs at a low priority in the background + Thread.yield(); + } + // If the expired nodes did not result in cache being cleanUntil% in size, + // start removing LRU nodes + while (getCurrentNodes() / maxCacheNodes > cleanUntil) { + evictOneNode(); + } + } finally { + isCleaning = false; + LOG.info("Stopping cleaner thread; AggregateStatsCache is now " + getFullPercent() + + "% full, with " + getCurrentNodes() + " nodes"); + LOG.info("Number of expired nodes removed: " + numRemovedTTL); + LOG.info("Number of LRU nodes removed: " + numRemovedLRU); + LOG.info("Cleaner ran for: " + (System.currentTimeMillis() - cleanerStartTime) + "ms"); + } + } + }; + cleaner.setPriority(Thread.MIN_PRIORITY); + cleaner.setDaemon(true); + cleaner.start(); + } + + /** + * Evict an LRU node or expired node whichever we find first + */ + private void evictOneNode() { + // Get the LRU key, value + Key lruKey = null; + AggrColStatsList lruValue = null; + for (Map.Entry entry : cacheStore.entrySet()) { + Key key = entry.getKey(); + AggrColStatsList value = entry.getValue(); + if (lruKey == null) { + lruKey = key; + lruValue = value; + continue; + } + if ((value.lastAccessTime < lruValue.lastAccessTime) && !(value.nodes.isEmpty())) { + lruKey = key; + lruValue = value; + } + } + // Now delete a node for this key's list + AggrColStatsList candidateList = cacheStore.get(lruKey); + boolean isLocked = false; + try { + isLocked = candidateList.writeLock.tryLock(maxWriterWaitTime, TimeUnit.MILLISECONDS); + if (isLocked) { + AggrColStats candidate; + AggrColStats lruNode = null; + int currentIndex = 0; + int deleteIndex = 0; + for (Iterator iterator = candidateList.nodes.iterator(); iterator.hasNext();) { + candidate = iterator.next(); + // Since we have to create space for 1, if we find an expired node we will remove it & + // return + if (isExpired(candidate)) { + iterator.remove(); + currentNodes.getAndDecrement(); + numRemovedTTL++; + return; + } + // Sorry, too many ifs but this form looks optimal + // Update the LRU node from what we've seen so far + if (lruNode == null) { + lruNode = candidate; + ++currentIndex; + continue; + } + if (lruNode != null) { + if (candidate.lastAccessTime < lruNode.lastAccessTime) { + 
lruNode = candidate; + deleteIndex = currentIndex; + } + } + } + candidateList.nodes.remove(deleteIndex); + currentNodes.getAndDecrement(); + numRemovedLRU++; + } + } catch (InterruptedException e) { + LOG.debug(e); + } finally { + if (isLocked) { + candidateList.writeLock.unlock(); + } + } + } + + private boolean isExpired(AggrColStats aggrColStats) { + return System.currentTimeMillis() - aggrColStats.lastAccessTime > timeToLive; + } + + /** + * Key object for the stats cache hashtable + */ + static class Key { + private final String dbName; + private final String tblName; + private final String colName; + + Key(String db, String table, String col) { + // Don't construct an illegal cache key + if ((db == null) || (table == null) || (col == null)) { + throw new IllegalArgumentException("dbName, tblName, colName can't be null"); + } + dbName = db; + tblName = table; + colName = col; + } + + @Override + public boolean equals(Object other) { + if ((other == null) || !(other instanceof Key)) { + return false; + } + Key that = (Key) other; + return dbName.equals(that.dbName) && tblName.equals(that.tblName) + && colName.equals(that.colName); + } + + @Override + public int hashCode() { + return dbName.hashCode() * 31 + tblName.hashCode() * 31 + colName.hashCode(); + } + + @Override + public String toString() { + return "Database: " + dbName + ", Table: " + tblName + ", Column: " + colName; + } + + } + + static class AggrColStatsList { + // TODO: figure out a better data structure for node list(?) + private List<AggrColStats> nodes = new ArrayList<AggrColStats>(); + private ReadWriteLock lock = new ReentrantReadWriteLock(); + // Read lock for get operation + private Lock readLock = lock.readLock(); + // Write lock for add, evict and clean operation + private Lock writeLock = lock.writeLock(); + // Using volatile instead of locking updates to this variable, + // since we can rely on approx lastAccessTime but don't want a performance hit + private volatile long lastAccessTime = 0; + + List<AggrColStats> getNodes() { + return nodes; + } + + void updateLastAccessTime() { + this.lastAccessTime = System.currentTimeMillis(); + } + } + + public static class AggrColStats { + private final long numPartsCached; + private final BloomFilter bloomFilter; + private final ColumnStatisticsObj colStats; + private volatile long lastAccessTime; + + public AggrColStats(long numPartsCached, BloomFilter bloomFilter, + ColumnStatisticsObj colStats) { + this.numPartsCached = numPartsCached; + this.bloomFilter = bloomFilter; + this.colStats = colStats; + this.lastAccessTime = System.currentTimeMillis(); + } + + public long getNumPartsCached() { + return numPartsCached; + } + + public ColumnStatisticsObj getColStats() { + return colStats; + } + + public BloomFilter getBloomFilter() { + return bloomFilter; + } + + void updateLastAccessTime() { + this.lastAccessTime = System.currentTimeMillis(); + } + } + + /** + * Intermediate object, used to collect hits & misses for each cache node that is evaluated for an + * incoming request + */ + private static class MatchStats { + private int hits = 0; + private int misses = 0; + + MatchStats(int hits, int misses) { + this.hits = hits; + this.misses = misses; + } + } +} diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java index bf169c9..5ef3b9a 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java @@
-43,6 +43,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats; import org.apache.hadoop.hive.metastore.api.AggrStats; import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; @@ -69,6 +70,7 @@ import org.apache.hadoop.hive.metastore.parser.ExpressionTree.TreeNode; import org.apache.hadoop.hive.metastore.parser.ExpressionTree.TreeVisitor; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hive.common.util.BloomFilter; import org.datanucleus.store.rdbms.query.ForwardQueryResult; import com.google.common.collect.Lists; @@ -112,7 +114,8 @@ * Whether direct SQL can be used with the current datastore backing {@link #pm}. */ private final boolean isCompatibleDatastore; - + private final boolean isAggregateStatsCacheEnabled; + private AggregateStatsCache aggrStatsCache; public MetaStoreDirectSql(PersistenceManager pm, Configuration conf) { this.pm = pm; this.dbType = determineDbType(); @@ -129,6 +132,10 @@ public MetaStoreDirectSql(PersistenceManager pm, Configuration conf) { if (isCompatibleDatastore) { LOG.info("Using direct SQL, underlying DB is " + dbType); } + isAggregateStatsCacheEnabled = HiveConf.getBoolVar(conf, ConfVars.METASTORE_AGGREGATE_STATS_CACHE_ENABLED); + if (isAggregateStatsCacheEnabled) { + aggrStatsCache = AggregateStatsCache.getInstance(conf); + } } private DB determineDbType() { @@ -460,7 +467,7 @@ private boolean isViewTable(String dbName, String tblName) throws MetaException } else { result = getPartitionsFromPartitionIds(dbName, tblName, isView, sqlResult); } - + timingTrace(doTrace, queryText, start, queryTime); query.closeAll(); return result; @@ -803,7 +810,7 @@ private String extractSqlString(Object value) { if (value == null) return null; return value.toString(); } - + static Double extractSqlDouble(Object obj) throws MetaException { if (obj == null) return null; @@ -1091,14 +1098,54 @@ public ColumnStatistics getTableStats( } public AggrStats aggrColStatsForPartitions(String dbName, String tableName, - List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation) throws MetaException { + List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation) + throws MetaException { long partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames); - List<ColumnStatisticsObj> stats = columnStatisticsObjForPartitions(dbName, - tableName, partNames, colNames, partsFound, useDensityFunctionForNDVEstimation); + List<ColumnStatisticsObj> colStatsList; + // Try to read from the cache first + if (isAggregateStatsCacheEnabled) { + AggrColStats colStatsAggrCached; + List<ColumnStatisticsObj> colStatsAggrFromDB; + int maxPartitionsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode(); + float falsePositiveProbability = aggrStatsCache.getFalsePositiveProbability(); + int partitionsRequested = partNames.size(); + if (partitionsRequested > maxPartitionsPerCacheNode) { + colStatsList = + columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound, + useDensityFunctionForNDVEstimation); + } else { + colStatsList = new ArrayList<ColumnStatisticsObj>(); + for (String colName : colNames) { + // Check the cache first + colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames); + if (colStatsAggrCached != null) { + colStatsList.add(colStatsAggrCached.getColStats()); + } else { + // Bloom filter for the new node that we will
eventually add to the cache + BloomFilter bloomFilter = + new BloomFilter(maxPartitionsPerCacheNode, falsePositiveProbability); + List colNamesForDB = new ArrayList(); + colNamesForDB.add(colName); + // Read aggregated stats for one column + colStatsAggrFromDB = + columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB, + partsFound, useDensityFunctionForNDVEstimation); + ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0); + colStatsList.add(colStatsAggr); + // Update the cache to add this new aggregate node + aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter); + } + } + } + } else { + colStatsList = + columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound, + useDensityFunctionForNDVEstimation); + } LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " - + Arrays.toString(stats.toArray())); - return new AggrStats(stats, partsFound); + + Arrays.toString(colStatsList.toArray())); + return new AggrStats(colStatsList, partsFound); } private long partsFoundForPartitions(String dbName, String tableName, @@ -1421,7 +1468,7 @@ private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i, avgDecimal, sumDist, useDensityFunctionForNDVEstimation); return cso; } - + private Object[] prepareParams(String dbName, String tableName, List partNames, List colNames) throws MetaException { @@ -1438,7 +1485,7 @@ private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i, return params; } - + public List getPartitionStats(String dbName, String tableName, List partNames, List colNames) throws MetaException { if (colNames.isEmpty() || partNames.isEmpty()) { diff --git a/metastore/src/test/org/apache/hadoop/hive/metastore/TestAggregateStatsCache.java b/metastore/src/test/org/apache/hadoop/hive/metastore/TestAggregateStatsCache.java new file mode 100644 index 0000000..40700da --- /dev/null +++ b/metastore/src/test/org/apache/hadoop/hive/metastore/TestAggregateStatsCache.java @@ -0,0 +1,266 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hadoop.hive.metastore; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; +import org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats; +import org.apache.hadoop.hive.metastore.AggregateStatsCache.Key; +import org.apache.hive.common.util.BloomFilter; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TestAggregateStatsCache { + static String DB_NAME = "db"; + static String TAB_PREFIX = "tab"; + static String PART_PREFIX = "part"; + static String COL_PREFIX = "col"; + static int NUM_TABS = 2; + static int NUM_PARTS = 20; + static int NUM_COLS = 5; + static int MAX_CACHE_NODES = 10; + static int MAX_PARTITIONS_PER_CACHE_NODE = 10; + static String TIME_TO_LIVE = "20s"; + static String MAX_WRITER_WAIT = "1s"; + static String MAX_READER_WAIT = "1s"; + static float FALSE_POSITIVE_PROBABILITY = (float) 0.01; + static float MAX_VARIANCE = (float) 0.5; + static AggregateStatsCache cache; + static List<String> tables = new ArrayList<String>(); + static List<String> tabParts = new ArrayList<String>(); + static List<String> tabCols = new ArrayList<String>(); + + @BeforeClass + public static void beforeTest() { + // All data initializations + initializeTables(); + initializePartitions(); + initializeColumns(); + } + + // tab1, tab2 + private static void initializeTables() { + for (int i = 1; i <= NUM_TABS; i++) { + tables.add(TAB_PREFIX + i); + } + } + + // part1 ... part20 + private static void initializePartitions() { + for (int i = 1; i <= NUM_PARTS; i++) { + tabParts.add(PART_PREFIX + i); + } + } + + // col1 ...
col5 + private static void initializeColumns() { + for (int i = 1; i <= NUM_COLS; i++) { + tabCols.add(COL_PREFIX + i); + } + } + + @AfterClass + public static void afterTest() { + } + + @Before + public void setUp() { + HiveConf hiveConf = new HiveConf(); + hiveConf.setIntVar(HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_SIZE, + MAX_CACHE_NODES); + hiveConf.setIntVar(HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_PARTITIONS, + MAX_PARTITIONS_PER_CACHE_NODE); + hiveConf.setFloatVar( + HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_FPP, + FALSE_POSITIVE_PROBABILITY); + hiveConf.setFloatVar(HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_VARIANCE, + MAX_VARIANCE); + hiveConf.setVar(HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_TTL, TIME_TO_LIVE); + hiveConf.setVar(HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_WRITER_WAIT, MAX_WRITER_WAIT); + hiveConf.setVar(HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_READER_WAIT, MAX_READER_WAIT); + cache = AggregateStatsCache.getInstance(hiveConf); + } + + @After + public void tearDown() { + } + + @Test + public void testCacheKey() { + Key k1 = new Key("db", "tbl1", "col"); + Key k2 = new Key("db", "tbl1", "col"); + // k1 equals k2 + Assert.assertEquals(k1, k2); + Key k3 = new Key("db", "tbl2", "col"); + // k1 not equals k3 + Assert.assertNotEquals(k1, k3); + } + + @Test + public void testBasicAddAndGet() throws Exception { + // Partnames: [tab1part1...tab1part9] + List<String> partNames = preparePartNames(tables.get(0), 1, 9); + // Prepare the bloom filter + BloomFilter bloomFilter = prepareBloomFilter(partNames); + // Add a dummy aggregate stats object for the above parts (part1...part9) of tab1 for col1 + String tblName = tables.get(0); + String colName = tabCols.get(0); + int highVal = 100, lowVal = 10, numDVs = 50, numNulls = 5; + // We'll treat this as the aggregate col stats for part1...part9 of tab1, col1 + ColumnStatisticsObj aggrColStats = + getDummyLongColStat(colName, highVal, lowVal, numDVs, numNulls); + // Now add to cache the dummy colstats for these 10 partitions + cache.add(DB_NAME, tblName, colName, 10, aggrColStats, bloomFilter); + // Now get from cache + AggrColStats aggrStatsCached = cache.get(DB_NAME, tblName, colName, partNames); + Assert.assertNotNull(aggrStatsCached); + + ColumnStatisticsObj aggrColStatsCached = aggrStatsCached.getColStats(); + Assert.assertEquals(aggrColStats, aggrColStatsCached); + + // Now get a non-existent entry + aggrStatsCached = cache.get("dbNotThere", tblName, colName, partNames); + Assert.assertNull(aggrStatsCached); + } + + @Test + public void testAddGetWithVariance() throws Exception { + // Partnames: [tab1part1...tab1part9] + List<String> partNames = preparePartNames(tables.get(0), 1, 9); + // Prepare the bloom filter + BloomFilter bloomFilter = prepareBloomFilter(partNames); + // Add a dummy aggregate stats object for the above parts (part1...part9) of tab1 for col1 + String tblName = tables.get(0); + String colName = tabCols.get(0); + int highVal = 100, lowVal = 10, numDVs = 50, numNulls = 5; + // We'll treat this as the aggregate col stats for part1...part9 of tab1, col1 + ColumnStatisticsObj aggrColStats = + getDummyLongColStat(colName, highVal, lowVal, numDVs, numNulls); + // Now add to cache + cache.add(DB_NAME, tblName, colName, 10, aggrColStats, bloomFilter); + + // Now prepare partnames with only 5 partitions: [tab1part1...tab1part5] + partNames = preparePartNames(tables.get(0), 1, 5); + // This get should fail because its variance ((10-5)/5) is way past MAX_VARIANCE
(0.5) + AggrColStats aggrStatsCached = cache.get(DB_NAME, tblName, colName, partNames); + Assert.assertNull(aggrStatsCached); + + // Now prepare partnames with 10 partitions: [tab1part11...tab1part20], but with no overlap + partNames = preparePartNames(tables.get(0), 11, 20); + // This get should fail because its variance ((10-0)/10) is way past MAX_VARIANCE (0.5) + aggrStatsCached = cache.get(DB_NAME, tblName, colName, partNames); + Assert.assertNull(aggrStatsCached); + + // Now prepare partnames with 8 partitions: [tab1part1...tab1part8], which are contained in the + // object that we added to the cache + partNames = preparePartNames(tables.get(0), 1, 8); + // This get should succeed because its variance ((10-8)/8) is within MAX_VARIANCE (0.5) + aggrStatsCached = cache.get(DB_NAME, tblName, colName, partNames); + Assert.assertNotNull(aggrStatsCached); + ColumnStatisticsObj aggrColStatsCached = aggrStatsCached.getColStats(); + Assert.assertEquals(aggrColStats, aggrColStatsCached); + } + + @Test + public void testTimeToLive() throws Exception { + // Add a dummy node to cache + // Partnames: [tab1part1...tab1part9] + List<String> partNames = preparePartNames(tables.get(0), 1, 9); + // Prepare the bloom filter + BloomFilter bloomFilter = prepareBloomFilter(partNames); + // Add a dummy aggregate stats object for the above parts (part1...part9) of tab1 for col1 + String tblName = tables.get(0); + String colName = tabCols.get(0); + int highVal = 100, lowVal = 10, numDVs = 50, numNulls = 5; + // We'll treat this as the aggregate col stats for part1...part9 of tab1, col1 + ColumnStatisticsObj aggrColStats = + getDummyLongColStat(colName, highVal, lowVal, numDVs, numNulls); + // Now add to cache + cache.add(DB_NAME, tblName, colName, 10, aggrColStats, bloomFilter); + + // Sleep for 30 seconds + Thread.sleep(30000); + + // Get should fail now, since the TTL is 20s and we've slept for 30 seconds + AggrColStats aggrStatsCached = cache.get(DB_NAME, tblName, colName, partNames); + Assert.assertNull(aggrStatsCached); + } + + /** + * Prepares a list of partition names by taking partitions minPart ... maxPart and + * prepending the table name + * Example: [tab1part1, tab1part2 ...]
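+ * For instance, preparePartNames("tab1", 1, 3) returns [tab1part1, tab1part2, tab1part3].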
+ * + * @param tabName + * @param minPart + * @param maxPart + * @return + * @throws Exception + */ + private List<String> preparePartNames(String tabName, int minPart, int maxPart) throws Exception { + if ((minPart < 1) || (maxPart > NUM_PARTS)) { + throw new Exception("tabParts does not have these partition numbers"); + } + List<String> partNames = new ArrayList<String>(); + for (int i = minPart; i <= maxPart; i++) { + String partName = tabParts.get(i-1); + partNames.add(tabName + partName); + } + return partNames; + } + + /** + * Prepares a bloom filter from the list of partition names + * @param partNames + * @return + */ + private BloomFilter prepareBloomFilter(List<String> partNames) { + BloomFilter bloomFilter = + new BloomFilter(MAX_PARTITIONS_PER_CACHE_NODE, FALSE_POSITIVE_PROBABILITY); + for (String partName: partNames) { + bloomFilter.add(partName.getBytes()); + } + return bloomFilter; + } + + private ColumnStatisticsObj getDummyLongColStat(String colName, int highVal, int lowVal, int numDVs, int numNulls) { + ColumnStatisticsObj aggrColStats = new ColumnStatisticsObj(); + aggrColStats.setColName(colName); + aggrColStats.setColType("long"); + LongColumnStatsData longStatsData = new LongColumnStatsData(); + longStatsData.setHighValue(highVal); + longStatsData.setLowValue(lowVal); + longStatsData.setNumDVs(numDVs); + longStatsData.setNumNulls(numNulls); + ColumnStatisticsData aggrColStatsData = new ColumnStatisticsData(); + aggrColStatsData.setLongStats(longStatsData); + aggrColStats.setStatsData(aggrColStatsData); + return aggrColStats; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/filters/BloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/filters/BloomFilter.java deleted file mode 100644 index 6ab0270..0000000 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/filters/BloomFilter.java +++ /dev/null @@ -1,298 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.filters; - -import static com.google.common.base.Preconditions.checkArgument; - -import java.util.Arrays; - -import org.apache.hadoop.hive.ql.io.orc.OrcProto; - -import com.google.common.primitives.Longs; - -/** - * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are - * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of - * bloom filter false positive (element not present in bloom filter but test() says true) are - * possible but false negatives are not possible (if element is present then test() will never - * say false). The false positive probability is configurable (default: 5%) depending on which - * storage requirement may increase or decrease. Lower the false positive probability greater - * is the space requirement.
- * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. - * During the creation of bloom filter expected number of entries must be specified. If the number - * of insertions exceed the specified initial number of entries then false positive probability will - * increase accordingly. - * - * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash - * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash - * collisions for specific sequence of repeating bytes. Check the following link for more info - * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw - */ -public class BloomFilter { - public static final double DEFAULT_FPP = 0.05; - private BitSet bitSet; - private int m; - private int k; - - public BloomFilter(long expectedEntries) { - this(expectedEntries, DEFAULT_FPP); - } - - public BloomFilter(long expectedEntries, double fpp) { - checkArgument(expectedEntries > 0, "expectedEntries should be > 0"); - checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0"); - int nb = optimalNumOfBits(expectedEntries, fpp); - // make 'm' multiple of 64 - this.m = nb + (Long.SIZE - (nb % Long.SIZE)); - this.k = optimalNumOfHashFunctions(expectedEntries, m); - this.bitSet = new BitSet(m); - } - - public BloomFilter(OrcProto.BloomFilter bloomFilter) { - this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList())); - this.k = bloomFilter.getNumHashFunctions(); - this.m = (int) this.bitSet.bitSize(); - } - - static int optimalNumOfHashFunctions(long n, long m) { - return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); - } - - static int optimalNumOfBits(long n, double p) { - return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); - } - - public void add(byte[] val) { - if (val == null) { - addBytes(val, -1); - } else { - addBytes(val, val.length); - } - } - - public void addBytes(byte[] val, int length) { - // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" - // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively - // implement a Bloom filter without any loss in the asymptotic false positive probability' - - // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned - // in the above paper - long hash64 = val == null ? Murmur3.NULL_HASHCODE : Murmur3.hash64(val, length); - addHash(hash64); - } - - private void addHash(long hash64) { - int hash1 = (int) hash64; - int hash2 = (int) (hash64 >>> 32); - - for (int i = 1; i <= k; i++) { - int combinedHash = hash1 + (i * hash2); - // hashcode should be positive, flip all the bits if it's negative - if (combinedHash < 0) { - combinedHash = ~combinedHash; - } - int pos = combinedHash % m; - bitSet.set(pos); - } - } - - public void addString(String val) { - if (val == null) { - add(null); - } else { - add(val.getBytes()); - } - } - - public void addLong(long val) { - addHash(getLongHash(val)); - } - - public void addDouble(double val) { - addLong(Double.doubleToLongBits(val)); - } - - public boolean test(byte[] val) { - if (val == null) { - return testBytes(val, -1); - } - return testBytes(val, val.length); - } - - public boolean testBytes(byte[] val, int length) { - long hash64 = val == null ? 
Murmur3.NULL_HASHCODE : Murmur3.hash64(val, length); - return testHash(hash64); - } - - private boolean testHash(long hash64) { - int hash1 = (int) hash64; - int hash2 = (int) (hash64 >>> 32); - - for (int i = 1; i <= k; i++) { - int combinedHash = hash1 + (i * hash2); - // hashcode should be positive, flip all the bits if it's negative - if (combinedHash < 0) { - combinedHash = ~combinedHash; - } - int pos = combinedHash % m; - if (!bitSet.get(pos)) { - return false; - } - } - return true; - } - - public boolean testString(String val) { - if (val == null) { - return test(null); - } else { - return test(val.getBytes()); - } - } - - public boolean testLong(long val) { - return testHash(getLongHash(val)); - } - - // Thomas Wang's integer hash function - // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm - private long getLongHash(long key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ (key >> 28); - key = key + (key << 31); - return key; - } - - public boolean testDouble(double val) { - return testLong(Double.doubleToLongBits(val)); - } - - public long sizeInBytes() { - return getBitSize() / 8; - } - - public int getBitSize() { - return bitSet.getData().length * Long.SIZE; - } - - public int getNumHashFunctions() { - return k; - } - - public long[] getBitSet() { - return bitSet.getData(); - } - - @Override - public String toString() { - return "m: " + m + " k: " + k; - } - - /** - * Merge the specified bloom filter with current bloom filter. - * - * @param that - bloom filter to merge - */ - public void merge(BloomFilter that) { - if (this != that && this.m == that.m && this.k == that.k) { - this.bitSet.putAll(that.bitSet); - } else { - throw new IllegalArgumentException("BloomFilters are not compatible for merging." + - " this - " + this.toString() + " that - " + that.toString()); - } - } - - public void reset() { - this.bitSet.clear(); - } - - /** - * Bare metal bit set implementation. For performance reasons, this implementation does not check - * for index bounds nor expand the bit set size if the specified index is greater than the size. - */ - private class BitSet { - final long[] data; - - BitSet(long bits) { - this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]); - } - - /** - * Deserialize long array as bit set. - * - * @param data - bit array - */ - BitSet(long[] data) { - assert data.length > 0 : "data length is zero!"; - this.data = data; - } - - /** - * Sets the bit at specified index. - * - * @param index - position - */ - void set(int index) { - data[index >>> 6] |= (1L << index); - } - - /** - * Returns true if the bit is set in the specified index. - * - * @param index - position - * @return - value at the bit position - */ - boolean get(int index) { - return (data[index >>> 6] & (1L << index)) != 0; - } - - /** - * Number of bits - */ - long bitSize() { - return (long) data.length * Long.SIZE; - } - - long[] getData() { - return data; - } - - /** - * Combines the two BitArrays using bitwise OR. - */ - void putAll(BitSet array) { - assert data.length == array.data.length : - "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")"; - for (int i = 0; i < data.length; i++) { - data[i] |= array.data[i]; - } - } - - /** - * Clear the bit set. 
- */ - public void clear() { - Arrays.fill(data, 0); - } - } -} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/filters/BloomFilterIO.java b/ql/src/java/org/apache/hadoop/hive/ql/io/filters/BloomFilterIO.java new file mode 100644 index 0000000..56aec9f --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/filters/BloomFilterIO.java @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.filters; + +import org.apache.hadoop.hive.ql.io.orc.OrcProto; +import org.apache.hive.common.util.BloomFilter; + +import com.google.common.primitives.Longs; + +public class BloomFilterIO extends BloomFilter { + + public BloomFilterIO(long expectedEntries) { + super(expectedEntries, DEFAULT_FPP); + } + + public BloomFilterIO(long expectedEntries, double fpp) { + super(expectedEntries, fpp); + } + +/** + * Initializes the BloomFilter from the given Orc BloomFilter + */ + public BloomFilterIO(OrcProto.BloomFilter bloomFilter) { + this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList())); + this.numHashFunctions = bloomFilter.getNumHashFunctions(); + this.numBits = (int) this.bitSet.bitSize(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/filters/Murmur3.java b/ql/src/java/org/apache/hadoop/hive/ql/io/filters/Murmur3.java deleted file mode 100644 index e733892..0000000 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/filters/Murmur3.java +++ /dev/null @@ -1,334 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.filters; - -/** - * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms. - * - * Murmur3 32 and 128 bit variants. - * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 - * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 - * - * This is a public domain code with no copyrights. 
- * From homepage of MurmurHash (https://code.google.com/p/smhasher/), - * "All MurmurHash versions are public domain software, and the author disclaims all copyright - * to their code." - */ -public class Murmur3 { - // from 64-bit linear congruential generator - public static final long NULL_HASHCODE = 2862933555777941757L; - - // Constants for 32 bit variant - private static final int C1_32 = 0xcc9e2d51; - private static final int C2_32 = 0x1b873593; - private static final int R1_32 = 15; - private static final int R2_32 = 13; - private static final int M_32 = 5; - private static final int N_32 = 0xe6546b64; - - // Constants for 128 bit variant - private static final long C1 = 0x87c37b91114253d5L; - private static final long C2 = 0x4cf5ad432745937fL; - private static final int R1 = 31; - private static final int R2 = 27; - private static final int R3 = 33; - private static final int M = 5; - private static final int N1 = 0x52dce729; - private static final int N2 = 0x38495ab5; - - private static final int DEFAULT_SEED = 104729; - - /** - * Murmur3 32-bit variant. - * - * @param data - input byte array - * @return - hashcode - */ - public static int hash32(byte[] data) { - return hash32(data, data.length, DEFAULT_SEED); - } - - /** - * Murmur3 32-bit variant. - * - * @param data - input byte array - * @param length - length of array - * @param seed - seed. (default 0) - * @return - hashcode - */ - public static int hash32(byte[] data, int length, int seed) { - int hash = seed; - final int nblocks = length >> 2; - - // body - for (int i = 0; i < nblocks; i++) { - int i_4 = i << 2; - int k = (data[i_4] & 0xff) - | ((data[i_4 + 1] & 0xff) << 8) - | ((data[i_4 + 2] & 0xff) << 16) - | ((data[i_4 + 3] & 0xff) << 24); - - // mix functions - k *= C1_32; - k = Integer.rotateLeft(k, R1_32); - k *= C2_32; - hash ^= k; - hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; - } - - // tail - int idx = nblocks << 2; - int k1 = 0; - switch (length - idx) { - case 3: - k1 ^= data[idx + 2] << 16; - case 2: - k1 ^= data[idx + 1] << 8; - case 1: - k1 ^= data[idx]; - - // mix functions - k1 *= C1_32; - k1 = Integer.rotateLeft(k1, R1_32); - k1 *= C2_32; - hash ^= k1; - } - - // finalization - hash ^= length; - hash ^= (hash >>> 16); - hash *= 0x85ebca6b; - hash ^= (hash >>> 13); - hash *= 0xc2b2ae35; - hash ^= (hash >>> 16); - - return hash; - } - - /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. - * - * @param data - input byte array - * @return - hashcode - */ - public static long hash64(byte[] data) { - return hash64(data, data.length, DEFAULT_SEED); - } - - public static long hash64(byte[] data, int length) { - return hash64(data, length, DEFAULT_SEED); - } - - /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. - * - * @param data - input byte array - * @param length - length of array - * @param seed - seed. 
(default is 0) - * @return - hashcode - */ - public static long hash64(byte[] data, int length, int seed) { - long hash = seed; - final int nblocks = length >> 3; - - // body - for (int i = 0; i < nblocks; i++) { - final int i8 = i << 3; - long k = ((long) data[i8] & 0xff) - | (((long) data[i8 + 1] & 0xff) << 8) - | (((long) data[i8 + 2] & 0xff) << 16) - | (((long) data[i8 + 3] & 0xff) << 24) - | (((long) data[i8 + 4] & 0xff) << 32) - | (((long) data[i8 + 5] & 0xff) << 40) - | (((long) data[i8 + 6] & 0xff) << 48) - | (((long) data[i8 + 7] & 0xff) << 56); - - // mix functions - k *= C1; - k = Long.rotateLeft(k, R1); - k *= C2; - hash ^= k; - hash = Long.rotateLeft(hash, R2) * M + N1; - } - - // tail - long k1 = 0; - int tailStart = nblocks << 3; - switch (length - tailStart) { - case 7: - k1 ^= ((long) data[tailStart + 6] & 0xff) << 48; - case 6: - k1 ^= ((long) data[tailStart + 5] & 0xff) << 40; - case 5: - k1 ^= ((long) data[tailStart + 4] & 0xff) << 32; - case 4: - k1 ^= ((long) data[tailStart + 3] & 0xff) << 24; - case 3: - k1 ^= ((long) data[tailStart + 2] & 0xff) << 16; - case 2: - k1 ^= ((long) data[tailStart + 1] & 0xff) << 8; - case 1: - k1 ^= ((long) data[tailStart] & 0xff); - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - hash ^= k1; - } - - // finalization - hash ^= length; - hash = fmix64(hash); - - return hash; - } - - /** - * Murmur3 128-bit variant. - * - * @param data - input byte array - * @return - hashcode (2 longs) - */ - public static long[] hash128(byte[] data) { - return hash128(data, data.length, DEFAULT_SEED); - } - - /** - * Murmur3 128-bit variant. - * - * @param data - input byte array - * @param length - length of array - * @param seed - seed. (default is 0) - * @return - hashcode (2 longs) - */ - public static long[] hash128(byte[] data, int length, int seed) { - long h1 = seed; - long h2 = seed; - final int nblocks = length >> 4; - - // body - for (int i = 0; i < nblocks; i++) { - final int i16 = i << 4; - long k1 = ((long) data[i16] & 0xff) - | (((long) data[i16 + 1] & 0xff) << 8) - | (((long) data[i16 + 2] & 0xff) << 16) - | (((long) data[i16 + 3] & 0xff) << 24) - | (((long) data[i16 + 4] & 0xff) << 32) - | (((long) data[i16 + 5] & 0xff) << 40) - | (((long) data[i16 + 6] & 0xff) << 48) - | (((long) data[i16 + 7] & 0xff) << 56); - - long k2 = ((long) data[i16 + 8] & 0xff) - | (((long) data[i16 + 9] & 0xff) << 8) - | (((long) data[i16 + 10] & 0xff) << 16) - | (((long) data[i16 + 11] & 0xff) << 24) - | (((long) data[i16 + 12] & 0xff) << 32) - | (((long) data[i16 + 13] & 0xff) << 40) - | (((long) data[i16 + 14] & 0xff) << 48) - | (((long) data[i16 + 15] & 0xff) << 56); - - // mix functions for k1 - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - h1 ^= k1; - h1 = Long.rotateLeft(h1, R2); - h1 += h2; - h1 = h1 * M + N1; - - // mix functions for k2 - k2 *= C2; - k2 = Long.rotateLeft(k2, R3); - k2 *= C1; - h2 ^= k2; - h2 = Long.rotateLeft(h2, R1); - h2 += h1; - h2 = h2 * M + N2; - } - - // tail - long k1 = 0; - long k2 = 0; - int tailStart = nblocks << 4; - switch (length - tailStart) { - case 15: - k2 ^= (long) (data[tailStart + 14] & 0xff) << 48; - case 14: - k2 ^= (long) (data[tailStart + 13] & 0xff) << 40; - case 13: - k2 ^= (long) (data[tailStart + 12] & 0xff) << 32; - case 12: - k2 ^= (long) (data[tailStart + 11] & 0xff) << 24; - case 11: - k2 ^= (long) (data[tailStart + 10] & 0xff) << 16; - case 10: - k2 ^= (long) (data[tailStart + 9] & 0xff) << 8; - case 9: - k2 ^= (long) (data[tailStart + 8] & 0xff); - k2 *= C2; - k2 = Long.rotateLeft(k2, 
R3); - k2 *= C1; - h2 ^= k2; - - case 8: - k1 ^= (long) (data[tailStart + 7] & 0xff) << 56; - case 7: - k1 ^= (long) (data[tailStart + 6] & 0xff) << 48; - case 6: - k1 ^= (long) (data[tailStart + 5] & 0xff) << 40; - case 5: - k1 ^= (long) (data[tailStart + 4] & 0xff) << 32; - case 4: - k1 ^= (long) (data[tailStart + 3] & 0xff) << 24; - case 3: - k1 ^= (long) (data[tailStart + 2] & 0xff) << 16; - case 2: - k1 ^= (long) (data[tailStart + 1] & 0xff) << 8; - case 1: - k1 ^= (long) (data[tailStart] & 0xff); - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - h1 ^= k1; - } - - // finalization - h1 ^= length; - h2 ^= length; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return new long[]{h1, h2}; - } - - private static long fmix64(long h) { - h ^= (h >>> 33); - h *= 0xff51afd7ed558ccdL; - h ^= (h >>> 33); - h *= 0xc4ceb9fe1a85ec53L; - h ^= (h >>> 33); - return h; - } -} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java index 7bfd781..cd4db75 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java @@ -33,7 +33,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.filters.BloomFilter; +import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; import org.apache.hadoop.hive.serde2.io.ByteWritable; @@ -197,12 +197,12 @@ private static void printMetaData(List files, Configuration conf, private static String getFormattedBloomFilters(int col, OrcProto.BloomFilterIndex[] bloomFilterIndex) { StringBuilder buf = new StringBuilder(); - BloomFilter stripeLevelBF = null; + BloomFilterIO stripeLevelBF = null; if (bloomFilterIndex != null && bloomFilterIndex[col] != null) { int idx = 0; buf.append("\n Bloom filters for column ").append(col).append(":"); for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) { - BloomFilter toMerge = new BloomFilter(bf); + BloomFilterIO toMerge = new BloomFilterIO(bf); buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge)); if (stripeLevelBF == null) { stripeLevelBF = toMerge; @@ -216,7 +216,7 @@ private static String getFormattedBloomFilters(int col, return buf.toString(); } - private static String getBloomFilterStats(BloomFilter bf) { + private static String getBloomFilterStats(BloomFilterIO bf) { StringBuilder sb = new StringBuilder(); int bitCount = bf.getBitSize(); int popCount = 0; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java index 49a8e80..61ee8b9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java @@ -32,7 +32,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.io.filters.BloomFilter; +import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; /** @@ -301,7 +301,7 @@ public static Reader createReader(Path path, paddingTolerance = conf.getFloat(HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname, 
HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal); - bloomFilterFpp = BloomFilter.DEFAULT_FPP; + bloomFilterFpp = BloomFilterIO.DEFAULT_FPP; } /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index bde9fc2..9e7ac4b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -45,7 +45,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.io.filters.BloomFilter; +import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO; import org.apache.hadoop.hive.ql.io.orc.RecordReaderUtils.ByteBufferAllocatorPool; import org.apache.hadoop.hive.ql.io.orc.TreeReaderFactory.TreeReader; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; @@ -333,9 +333,9 @@ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto, ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto); Object minValue = getMin(cs); Object maxValue = getMax(cs); - BloomFilter bf = null; + BloomFilterIO bf = null; if (bloomFilter != null) { - bf = new BloomFilter(bloomFilter); + bf = new BloomFilterIO(bloomFilter); } return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf); } @@ -349,14 +349,14 @@ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto, * predicate. */ static TruthValue evaluatePredicate(ColumnStatistics stats, - PredicateLeaf predicate, BloomFilter bloomFilter) { + PredicateLeaf predicate, BloomFilterIO bloomFilter) { Object minValue = getMin(stats); Object maxValue = getMax(stats); return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter); } static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min, - Object max, boolean hasNull, BloomFilter bloomFilter) { + Object max, boolean hasNull, BloomFilterIO bloomFilter) { // if we didn't have any values, everything must have been null if (min == null) { if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) { @@ -490,7 +490,7 @@ private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Objec } private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate, Object predObj, - BloomFilter bloomFilter, boolean hasNull) { + BloomFilterIO bloomFilter, boolean hasNull) { switch (predicate.getOperator()) { case NULL_SAFE_EQUALS: // null safe equals does not return *_NULL variant. So set hasNull to false @@ -511,7 +511,7 @@ private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate, } } - private static TruthValue checkInBloomFilter(BloomFilter bf, Object predObj, boolean hasNull) { + private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) { TruthValue result = hasNull ? 
TruthValue.NO_NULL : TruthValue.NO; if (predObj instanceof Long) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java index a319204..d8ecf88 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java @@ -42,7 +42,7 @@ import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.IOConstants; -import org.apache.hadoop.hive.ql.io.filters.BloomFilter; +import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO; import org.apache.hadoop.hive.ql.io.orc.CompressionCodec.Modifier; import org.apache.hadoop.hive.ql.io.orc.OrcFile.CompressionStrategy; import org.apache.hadoop.hive.ql.io.orc.OrcFile.EncodingStrategy; @@ -638,7 +638,7 @@ public Configuration getConfiguration() { private final OrcProto.RowIndexEntry.Builder rowIndexEntry; private final PositionedOutputStream rowIndexStream; private final PositionedOutputStream bloomFilterStream; - protected final BloomFilter bloomFilter; + protected final BloomFilterIO bloomFilter; protected final boolean createBloomFilter; private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex; private final OrcProto.BloomFilter.Builder bloomFilterEntry; @@ -686,7 +686,7 @@ public Configuration getConfiguration() { bloomFilterEntry = OrcProto.BloomFilter.newBuilder(); bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder(); bloomFilterStream = streamFactory.createStream(id, OrcProto.Stream.Kind.BLOOM_FILTER); - bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(), + bloomFilter = new BloomFilterIO(streamFactory.getRowIndexStride(), streamFactory.getBloomFilterFPP()); } else { bloomFilterEntry = null; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/filters/TestBloomFilter.java b/ql/src/test/org/apache/hadoop/hive/ql/io/filters/TestBloomFilter.java deleted file mode 100644 index 32b95ab..0000000 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/filters/TestBloomFilter.java +++ /dev/null @@ -1,458 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.io.filters; - -import static org.junit.Assert.assertEquals; - -import java.util.Random; - -import org.junit.Test; - -/** - * - */ -public class TestBloomFilter { - private static final int COUNT = 100; - Random rand = new Random(123); - - @Test(expected = IllegalArgumentException.class) - public void testBloomIllegalArg1() { - BloomFilter bf = new BloomFilter(0, 0); - } - - @Test(expected = IllegalArgumentException.class) - public void testBloomIllegalArg2() { - BloomFilter bf = new BloomFilter(0, 0.1); - } - - @Test(expected = IllegalArgumentException.class) - public void testBloomIllegalArg3() { - BloomFilter bf = new BloomFilter(1, 0.0); - } - - @Test(expected = IllegalArgumentException.class) - public void testBloomIllegalArg4() { - BloomFilter bf = new BloomFilter(1, 1.0); - } - - @Test(expected = IllegalArgumentException.class) - public void testBloomIllegalArg5() { - BloomFilter bf = new BloomFilter(-1, -1); - } - - - @Test - public void testBloomNumBits() { - assertEquals(0, BloomFilter.optimalNumOfBits(0, 0)); - assertEquals(0, BloomFilter.optimalNumOfBits(0, 1)); - assertEquals(0, BloomFilter.optimalNumOfBits(1, 1)); - assertEquals(7, BloomFilter.optimalNumOfBits(1, 0.03)); - assertEquals(72, BloomFilter.optimalNumOfBits(10, 0.03)); - assertEquals(729, BloomFilter.optimalNumOfBits(100, 0.03)); - assertEquals(7298, BloomFilter.optimalNumOfBits(1000, 0.03)); - assertEquals(72984, BloomFilter.optimalNumOfBits(10000, 0.03)); - assertEquals(729844, BloomFilter.optimalNumOfBits(100000, 0.03)); - assertEquals(7298440, BloomFilter.optimalNumOfBits(1000000, 0.03)); - assertEquals(6235224, BloomFilter.optimalNumOfBits(1000000, 0.05)); - } - - @Test - public void testBloomNumHashFunctions() { - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(-1, -1)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(0, 0)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(10, 0)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(10, 10)); - assertEquals(7, BloomFilter.optimalNumOfHashFunctions(10, 100)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(100, 100)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(1000, 100)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(10000, 100)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(100000, 100)); - assertEquals(1, BloomFilter.optimalNumOfHashFunctions(1000000, 100)); - } - - @Test - public void testBloomFilterBytes() { - BloomFilter bf = new BloomFilter(10000); - byte[] val = new byte[]{1, 2, 3}; - byte[] val1 = new byte[]{1, 2, 3, 4}; - byte[] val2 = new byte[]{1, 2, 3, 4, 5}; - byte[] val3 = new byte[]{1, 2, 3, 4, 5, 6}; - - assertEquals(false, bf.test(val)); - assertEquals(false, bf.test(val1)); - assertEquals(false, bf.test(val2)); - assertEquals(false, bf.test(val3)); - bf.add(val); - assertEquals(true, bf.test(val)); - assertEquals(false, bf.test(val1)); - assertEquals(false, bf.test(val2)); - assertEquals(false, bf.test(val3)); - bf.add(val1); - assertEquals(true, bf.test(val)); - assertEquals(true, bf.test(val1)); - assertEquals(false, bf.test(val2)); - assertEquals(false, bf.test(val3)); - bf.add(val2); - assertEquals(true, bf.test(val)); - assertEquals(true, bf.test(val1)); - assertEquals(true, bf.test(val2)); - assertEquals(false, bf.test(val3)); - bf.add(val3); - assertEquals(true, bf.test(val)); - assertEquals(true, bf.test(val1)); - assertEquals(true, bf.test(val2)); - assertEquals(true, bf.test(val3)); - - byte[] randVal = new byte[COUNT]; - 
for (int i = 0; i < COUNT; i++) { - rand.nextBytes(randVal); - bf.add(randVal); - } - // last value should be present - assertEquals(true, bf.test(randVal)); - // most likely this value should not exist - randVal[0] = 0; - randVal[1] = 0; - randVal[2] = 0; - randVal[3] = 0; - randVal[4] = 0; - assertEquals(false, bf.test(randVal)); - - assertEquals(7800, bf.sizeInBytes()); - } - - @Test - public void testBloomFilterByte() { - BloomFilter bf = new BloomFilter(10000); - byte val = Byte.MIN_VALUE; - byte val1 = 1; - byte val2 = 2; - byte val3 = Byte.MAX_VALUE; - - assertEquals(false, bf.testLong(val)); - assertEquals(false, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val); - assertEquals(true, bf.testLong(val)); - assertEquals(false, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val1); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val2); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(true, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val3); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(true, bf.testLong(val2)); - assertEquals(true, bf.testLong(val3)); - - byte randVal = 0; - for (int i = 0; i < COUNT; i++) { - randVal = (byte) rand.nextInt(Byte.MAX_VALUE); - bf.addLong(randVal); - } - // last value should be present - assertEquals(true, bf.testLong(randVal)); - // most likely this value should not exist - assertEquals(false, bf.testLong((byte) -120)); - - assertEquals(7800, bf.sizeInBytes()); - } - - @Test - public void testBloomFilterInt() { - BloomFilter bf = new BloomFilter(10000); - int val = Integer.MIN_VALUE; - int val1 = 1; - int val2 = 2; - int val3 = Integer.MAX_VALUE; - - assertEquals(false, bf.testLong(val)); - assertEquals(false, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val); - assertEquals(true, bf.testLong(val)); - assertEquals(false, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val1); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val2); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(true, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val3); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(true, bf.testLong(val2)); - assertEquals(true, bf.testLong(val3)); - - int randVal = 0; - for (int i = 0; i < COUNT; i++) { - randVal = rand.nextInt(); - bf.addLong(randVal); - } - // last value should be present - assertEquals(true, bf.testLong(randVal)); - // most likely this value should not exist - assertEquals(false, bf.testLong(-120)); - - assertEquals(7800, bf.sizeInBytes()); - } - - @Test - public void testBloomFilterLong() { - BloomFilter bf = new BloomFilter(10000); - long val = Long.MIN_VALUE; - long val1 = 1; - long val2 = 2; - long val3 = Long.MAX_VALUE; - - assertEquals(false, bf.testLong(val)); - assertEquals(false, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - 
assertEquals(false, bf.testLong(val3)); - bf.addLong(val); - assertEquals(true, bf.testLong(val)); - assertEquals(false, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val1); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(false, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val2); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(true, bf.testLong(val2)); - assertEquals(false, bf.testLong(val3)); - bf.addLong(val3); - assertEquals(true, bf.testLong(val)); - assertEquals(true, bf.testLong(val1)); - assertEquals(true, bf.testLong(val2)); - assertEquals(true, bf.testLong(val3)); - - long randVal = 0; - for (int i = 0; i < COUNT; i++) { - randVal = rand.nextLong(); - bf.addLong(randVal); - } - // last value should be present - assertEquals(true, bf.testLong(randVal)); - // most likely this value should not exist - assertEquals(false, bf.testLong(-120)); - - assertEquals(7800, bf.sizeInBytes()); - } - - @Test - public void testBloomFilterFloat() { - BloomFilter bf = new BloomFilter(10000); - float val = Float.MIN_VALUE; - float val1 = 1.1f; - float val2 = 2.2f; - float val3 = Float.MAX_VALUE; - - assertEquals(false, bf.testDouble(val)); - assertEquals(false, bf.testDouble(val1)); - assertEquals(false, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); - bf.addDouble(val); - assertEquals(true, bf.testDouble(val)); - assertEquals(false, bf.testDouble(val1)); - assertEquals(false, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); - bf.addDouble(val1); - assertEquals(true, bf.testDouble(val)); - assertEquals(true, bf.testDouble(val1)); - assertEquals(false, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); - bf.addDouble(val2); - assertEquals(true, bf.testDouble(val)); - assertEquals(true, bf.testDouble(val1)); - assertEquals(true, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); - bf.addDouble(val3); - assertEquals(true, bf.testDouble(val)); - assertEquals(true, bf.testDouble(val1)); - assertEquals(true, bf.testDouble(val2)); - assertEquals(true, bf.testDouble(val3)); - - float randVal = 0; - for (int i = 0; i < COUNT; i++) { - randVal = rand.nextFloat(); - bf.addDouble(randVal); - } - // last value should be present - assertEquals(true, bf.testDouble(randVal)); - // most likely this value should not exist - assertEquals(false, bf.testDouble(-120.2f)); - - assertEquals(7800, bf.sizeInBytes()); - } - - @Test - public void testBloomFilterDouble() { - BloomFilter bf = new BloomFilter(10000); - double val = Double.MIN_VALUE; - double val1 = 1.1d; - double val2 = 2.2d; - double val3 = Double.MAX_VALUE; - - assertEquals(false, bf.testDouble(val)); - assertEquals(false, bf.testDouble(val1)); - assertEquals(false, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); - bf.addDouble(val); - assertEquals(true, bf.testDouble(val)); - assertEquals(false, bf.testDouble(val1)); - assertEquals(false, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); - bf.addDouble(val1); - assertEquals(true, bf.testDouble(val)); - assertEquals(true, bf.testDouble(val1)); - assertEquals(false, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); - bf.addDouble(val2); - assertEquals(true, bf.testDouble(val)); - assertEquals(true, bf.testDouble(val1)); - assertEquals(true, bf.testDouble(val2)); - assertEquals(false, bf.testDouble(val3)); 
-    bf.addDouble(val3);
-    assertEquals(true, bf.testDouble(val));
-    assertEquals(true, bf.testDouble(val1));
-    assertEquals(true, bf.testDouble(val2));
-    assertEquals(true, bf.testDouble(val3));
-
-    double randVal = 0;
-    for (int i = 0; i < COUNT; i++) {
-      randVal = rand.nextDouble();
-      bf.addDouble(randVal);
-    }
-    // last value should be present
-    assertEquals(true, bf.testDouble(randVal));
-    // most likely this value should not exist
-    assertEquals(false, bf.testDouble(-120.2d));
-
-    assertEquals(7800, bf.sizeInBytes());
-  }
-
-  @Test
-  public void testBloomFilterString() {
-    BloomFilter bf = new BloomFilter(100000);
-    String val = "bloo";
-    String val1 = "bloom fil";
-    String val2 = "bloom filter";
-    String val3 = "cuckoo filter";
-
-    assertEquals(false, bf.testString(val));
-    assertEquals(false, bf.testString(val1));
-    assertEquals(false, bf.testString(val2));
-    assertEquals(false, bf.testString(val3));
-    bf.addString(val);
-    assertEquals(true, bf.testString(val));
-    assertEquals(false, bf.testString(val1));
-    assertEquals(false, bf.testString(val2));
-    assertEquals(false, bf.testString(val3));
-    bf.addString(val1);
-    assertEquals(true, bf.testString(val));
-    assertEquals(true, bf.testString(val1));
-    assertEquals(false, bf.testString(val2));
-    assertEquals(false, bf.testString(val3));
-    bf.addString(val2);
-    assertEquals(true, bf.testString(val));
-    assertEquals(true, bf.testString(val1));
-    assertEquals(true, bf.testString(val2));
-    assertEquals(false, bf.testString(val3));
-    bf.addString(val3);
-    assertEquals(true, bf.testString(val));
-    assertEquals(true, bf.testString(val1));
-    assertEquals(true, bf.testString(val2));
-    assertEquals(true, bf.testString(val3));
-
-    long randVal = 0;
-    for (int i = 0; i < COUNT; i++) {
-      randVal = rand.nextLong();
-      bf.addString(Long.toString(randVal));
-    }
-    // last value should be present
-    assertEquals(true, bf.testString(Long.toString(randVal)));
-    // most likely this value should not exist
-    assertEquals(false, bf.testString(Long.toString(-120)));
-
-    assertEquals(77944, bf.sizeInBytes());
-  }
-
-  @Test
-  public void testMerge() {
-    BloomFilter bf = new BloomFilter(10000);
-    String val = "bloo";
-    String val1 = "bloom fil";
-    String val2 = "bloom filter";
-    String val3 = "cuckoo filter";
-    bf.addString(val);
-    bf.addString(val1);
-    bf.addString(val2);
-    bf.addString(val3);
-
-    BloomFilter bf2 = new BloomFilter(10000);
-    String v = "2_bloo";
-    String v1 = "2_bloom fil";
-    String v2 = "2_bloom filter";
-    String v3 = "2_cuckoo filter";
-    bf2.addString(v);
-    bf2.addString(v1);
-    bf2.addString(v2);
-    bf2.addString(v3);
-
-    assertEquals(true, bf.testString(val));
-    assertEquals(true, bf.testString(val1));
-    assertEquals(true, bf.testString(val2));
-    assertEquals(true, bf.testString(val3));
-    assertEquals(false, bf.testString(v));
-    assertEquals(false, bf.testString(v1));
-    assertEquals(false, bf.testString(v2));
-    assertEquals(false, bf.testString(v3));
-
-    bf.merge(bf2);
-
-    assertEquals(true, bf.testString(val));
-    assertEquals(true, bf.testString(val1));
-    assertEquals(true, bf.testString(val2));
-    assertEquals(true, bf.testString(val3));
-    assertEquals(true, bf.testString(v));
-    assertEquals(true, bf.testString(v1));
-    assertEquals(true, bf.testString(v2));
-    assertEquals(true, bf.testString(v3));
-  }
-}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/filters/TestMurmur3.java b/ql/src/test/org/apache/hadoop/hive/ql/io/filters/TestMurmur3.java
deleted file mode 100644
index d92a3ce..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/filters/TestMurmur3.java
+++ /dev/null
@@ -1,189 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.filters;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Random;
-
-/**
- * Tests for Murmur3 variants.
- */
-public class TestMurmur3 {
-
-  @Test
-  public void testHashCodesM3_32_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_32(seed);
-    int hc1 = hf.hashBytes(key.getBytes()).asInt();
-    int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-
-    key = "testkey";
-    hc1 = hf.hashBytes(key.getBytes()).asInt();
-    hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-  }
-
-  @Test
-  public void testHashCodesM3_32_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_128(seed);
-    // guava stores the hashcodes in little endian order
-    ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    long gl1 = buf.getLong();
-    long gl2 = buf.getLong(8);
-    long[] hc = Murmur3.hash128(key.getBytes(), key.getBytes().length, seed);
-    long m1 = hc[0];
-    long m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    key = "testkey128_testkey128";
-    buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    gl1 = buf.getLong();
-    gl2 = buf.getLong(8);
-    hc = Murmur3.hash128(key.getBytes(), key.getBytes().length, seed);
-    m1 = hc[0];
-    m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-  }
-
-  @Test
-  public void testHashCodesM3_128_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java
index d0f3a5e..afbbb02 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java
@@ -37,7 +37,7 @@ import org.apache.hadoop.fs.Seekable;
 import org.apache.hadoop.hive.common.DiskRangeList;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.io.filters.BloomFilter;
+import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO;
 import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.Location;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
@@ -1148,7 +1148,7 @@ public void testPartialPlanString() throws Exception {
   public void testIntNullSafeEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.INTEGER, "x", 15L, null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong(i);
     }
@@ -1163,7 +1163,7 @@ public void testIntEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.INTEGER, "x", 15L, null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong(i);
     }
@@ -1182,7 +1182,7 @@ public void testIntInBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.INTEGER, "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong(i);
     }
@@ -1200,7 +1200,7 @@ public void testIntInBloomFilter() throws Exception {
   public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addDouble(i);
     }
@@ -1215,7 +1215,7 @@ public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
   public void testDoubleEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addDouble(i);
     }
@@ -1234,7 +1234,7 @@ public void testDoubleInBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT, "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addDouble(i);
     }
@@ -1252,7 +1252,7 @@ public void testDoubleInBloomFilter() throws Exception {
   public void testStringNullSafeEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString("str_" + i);
     }
@@ -1267,7 +1267,7 @@ public void testStringNullSafeEqualsBloomFilter() throws Exception {
   public void testStringEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString("str_" + i);
     }
@@ -1286,7 +1286,7 @@ public void testStringInBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING, "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString("str_" + i);
     }
@@ -1305,7 +1305,7 @@ public void testDateWritableNullSafeEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE,
         "x", new DateWritable(15), null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new DateWritable(i)).getDays());
     }
@@ -1320,7 +1320,7 @@ public void testDateWritableNullSafeEqualsBloomFilter() throws Exception {
   public void testDateWritableEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(15), null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new DateWritable(i)).getDays());
     }
@@ -1339,7 +1339,7 @@ public void testDateWritableInBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE, "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new DateWritable(i)).getDays());
     }
@@ -1359,7 +1359,7 @@ public void testTimestampNullSafeEqualsBloomFilter() throws Exception {
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP,
         "x", new Timestamp(15), null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new Timestamp(i)).getTime());
     }
@@ -1374,7 +1374,7 @@ public void testTimestampNullSafeEqualsBloomFilter() throws Exception {
   public void testTimestampEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new Timestamp(i)).getTime());
     }
@@ -1393,7 +1393,7 @@ public void testTimestampInBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP, "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new Timestamp(i)).getTime());
     }
@@ -1413,7 +1413,7 @@ public void testDecimalNullSafeEqualsBloomFilter() throws Exception {
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL,
         "x", HiveDecimal.create(15), null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
@@ -1429,7 +1429,7 @@ public void testDecimalEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x", HiveDecimal.create(15), null);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
@@ -1448,7 +1448,7 @@ public void testDecimalInBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL, "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
@@ -1471,7 +1471,7 @@ public void testNullsInBloomFilter() throws Exception {
     PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL, "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
+    BloomFilterIO bf = new BloomFilterIO(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
diff --git a/ql/src/test/queries/clientpositive/annotate_stats_part.q b/ql/src/test/queries/clientpositive/annotate_stats_part.q
index fcfe566..f0a68a0 100644
--- a/ql/src/test/queries/clientpositive/annotate_stats_part.q
+++ b/ql/src/test/queries/clientpositive/annotate_stats_part.q
@@ -2,6 +2,7 @@ set hive.stats.fetch.column.stats=true;
 set hive.stats.autogather=false;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.metastore.aggregate.stats.cache.enabled=false;
 
 create table if not exists loc_staging (
   state string,