diff --git standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java
index d5ac54a..d007cc5 100644
--- standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java
+++ standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java
@@ -104,21 +104,21 @@
/**
*
* Input: 64 bit hashcode
- *
+ *
* |---------w-------------| |------------p'----------|
* 10101101.......1010101010 10101010101 01010101010101
* |------p-----|
- *
+ *
* Output: 32 bit int
- *
+ *
* |b| |-q'-| |------------p'----------|
* 1 010101 01010101010 10101010101010
* |------p-----|
- *
- *
+ *
+ *
* The default values of p', q' and b are 25, 6, 1 (total 32 bits) respectively.
* This function will return an int encoded in the following format
- *
+ *
* p - LSB p bits represent the register index
* p' - LSB p' bits are used for increased accuracy in estimation
* q' - q' bits after p' are left as such from the hashcode if b = 0 else
@@ -148,8 +148,12 @@
}
}
- public int getSize() {
- return sparseMap.size() + tempListIdx;
+ public boolean isSizeGreaterThan(int s) { // true only if, after folding the temp list in, the sparse map holds more than s entries
+ if (sparseMap.size() + tempListIdx > s) { // assumed upper bound on the merged size; when it is <= s the answer is already false, so skip the merge
+ mergeTempListToSparseMap();
+ return sparseMap.size() > s;
+ }
+ return false;
}
public void merge(HLLRegister hllRegister) {
@@ -195,7 +199,7 @@
byte lr = entry.getValue(); // this can be a max of 65, never > 127
if (lr != 0) {
// should be a no-op for sparse
- dest.add((long) ((1 << (p + lr - 1)) | idx));
+ dest.add((1 << (p + lr - 1)) | idx);
}
}
}
diff --git standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java
index 91a6865..edf587f 100644
--- standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java
+++ standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java
@@ -20,7 +20,6 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
-import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.TreeMap;
@@ -30,17 +29,19 @@
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hive.common.util.Murmur3;
+import com.google.common.annotations.VisibleForTesting;
+
/**
*
* This is an implementation of the following variants of hyperloglog (HLL)
- * algorithm
+ * algorithm
* Original - Original HLL algorithm from Flajolet et. al from
* http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
* HLLNoBias - Google's implementation of bias correction based on lookup table
* http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
* HLL++ - Google's implementation of HLL++ algorithm that uses SPARSE registers
* http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
- *
+ *
* Following are the constructor parameters that determines which algorithm is
* used
* numRegisterIndexBits - number of LSB hashcode bits to be used as register index.
@@ -194,7 +195,7 @@
} else if (hashBits <= 64) {
alphaMM = 0.709f;
} else {
- alphaMM = 0.7213f / (float) (1 + 1.079f / m);
+ alphaMM = 0.7213f / (1 + 1.079f / m);
}
// For efficiency alpha is multiplied by m^2
@@ -258,7 +259,7 @@
// if size of sparse map excess the threshold convert the sparse map to
// dense register and switch to DENSE encoding
- if (sparseRegister.getSize() > encodingSwitchThreshold) {
+ if (sparseRegister.isSizeGreaterThan(encodingSwitchThreshold)) {
encoding = EncodingType.DENSE;
denseRegister = sparseToDenseRegister(sparseRegister);
sparseRegister = null;
@@ -386,7 +387,7 @@
}
private long linearCount(int mVal, long numZeros) {
- return (long) (Math.round(mVal * Math.log(mVal / ((double) numZeros))));
+ return Math.round(mVal * Math.log((double) mVal / numZeros)); // Math.round(double) already yields a long
}
// refer paper
@@ -459,7 +460,7 @@
sparseRegister.merge(hll.getHLLSparseRegister());
// if after merge the sparse switching threshold is exceeded then change
// to dense encoding
- if (sparseRegister.getSize() > encodingSwitchThreshold) {
+ if (sparseRegister.isSizeGreaterThan(encodingSwitchThreshold)) {
encoding = EncodingType.DENSE;
denseRegister = sparseToDenseRegister(sparseRegister);
sparseRegister = null;
@@ -481,7 +482,7 @@
/**
* Reduces the accuracy of the HLL provided to a smaller size
- * @param p0
+ * @param p0
* - new p size for the new HyperLogLog (smaller or no change)
* @return reduced (or same) HyperLogLog instance
*/
@@ -661,4 +662,9 @@
return o instanceof HyperLogLog;
}
+ @VisibleForTesting
+ public int getEncodingSwitchThreshold() { // read-only accessor so tests can size their inputs against the switch threshold
+ return this.encodingSwitchThreshold;
+ }
+
}
diff --git standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
index e014fb5..e720ec8 100644
--- standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
+++ standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hive.common.ndv.hll;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.EncodingType;
import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
@@ -49,27 +50,27 @@
double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
double delta = threshold * size / 100;
double delta4 = threshold * (4*size) / 100;
- assertEquals((double) size, (double) hll.count(), delta);
- assertEquals((double) size, (double) hll2.count(), delta);
+ assertEquals(size, hll.count(), delta);
+ assertEquals(size, hll2.count(), delta);
// merge
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// merge should update registers and hence the count
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// new merge
hll.merge(hll3);
- assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals((double) 3 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
- // valid merge -- register set size gets bigger (also 4k items
+ // valid merge -- register set size gets bigger (also 4k items)
hll.merge(hll4);
- assertEquals((double) 4 * size, (double) hll.count(), delta4);
+ assertEquals((double) 4 * size, hll.count(), delta4);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// invalid merge -- smaller register merge to bigger
@@ -95,27 +96,27 @@
double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
double delta = threshold * size / 100;
double delta4 = threshold * (4*size) / 100;
- assertEquals((double) size, (double) hll.count(), delta);
- assertEquals((double) size, (double) hll2.count(), delta);
+ assertEquals(size, hll.count(), delta);
+ assertEquals(size, hll2.count(), delta);
// merge
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.SPARSE, hll.getEncoding());
// merge should update registers and hence the count
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.SPARSE, hll.getEncoding());
// new merge
hll.merge(hll3);
- assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals((double) 3 * size, hll.count(), delta);
assertEquals(EncodingType.SPARSE, hll.getEncoding());
// valid merge -- register set size gets bigger & dense automatically
hll.merge(hll4);
- assertEquals((double) 4 * size, (double) hll.count(), delta4);
+ assertEquals((double) 4 * size, hll.count(), delta4);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// invalid merge -- smaller register merge to bigger
@@ -140,27 +141,27 @@
}
double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
double delta = threshold * size / 100;
- assertEquals((double) size, (double) hll.count(), delta);
- assertEquals((double) size, (double) hll2.count(), delta);
+ assertEquals(size, hll.count(), delta);
+ assertEquals(size, hll2.count(), delta);
// sparse-sparse merge
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.SPARSE, hll.getEncoding());
// merge should update registers and hence the count
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.SPARSE, hll.getEncoding());
// sparse-dense merge
hll.merge(hll3);
- assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals((double) 3 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// merge should convert hll2 to DENSE
hll2.merge(hll4);
- assertEquals((double) 2 * size, (double) hll2.count(), delta);
+ assertEquals((double) 2 * size, hll2.count(), delta);
assertEquals(EncodingType.DENSE, hll2.getEncoding());
// invalid merge -- smaller register merge to bigger
@@ -185,27 +186,27 @@
}
double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
double delta = threshold * size / 100;
- assertEquals((double) size, (double) hll.count(), delta);
- assertEquals((double) size, (double) hll2.count(), delta);
+ assertEquals(size, hll.count(), delta);
+ assertEquals(size, hll2.count(), delta);
// sparse-sparse merge
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// merge should update registers and hence the count
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// sparse-dense merge
hll.merge(hll3);
- assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals((double) 3 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// merge should convert hll3 to DENSE
hll3.merge(hll4);
- assertEquals((double) 2 * size, (double) hll3.count(), delta);
+ assertEquals((double) 2 * size, hll3.count(), delta);
assertEquals(EncodingType.DENSE, hll3.getEncoding());
// invalid merge -- smaller register merge to bigger
@@ -231,27 +232,27 @@
}
double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
double delta = threshold * size / 100;
- assertEquals((double) size, (double) hll.count(), delta);
- assertEquals((double) size, (double) hll2.count(), delta);
+ assertEquals(size, hll.count(), delta);
+ assertEquals(size, hll2.count(), delta);
// sparse-sparse merge
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.SPARSE, hll.getEncoding());
// merge should update registers and hence the count
hll.merge(hll2);
- assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals((double) 2 * size, hll.count(), delta);
assertEquals(EncodingType.SPARSE, hll.getEncoding());
// sparse-sparse overload to dense
hll.merge(hll3);
- assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals((double) 3 * size, hll.count(), delta);
assertEquals(EncodingType.DENSE, hll.getEncoding());
// merge should convert hll2 to DENSE
hll2.merge(hll4);
- assertEquals((double) 2 * size, (double) hll2.count(), delta);
+ assertEquals((double) 2 * size, hll2.count(), delta);
assertEquals(EncodingType.DENSE, hll2.getEncoding());
// invalid merge -- smaller register merge to bigger
@@ -268,7 +269,7 @@
}
double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
double delta = threshold * size / 100;
- assertEquals((double) size, (double) hll.count(), delta);
+ assertEquals(size, hll.count(), delta);
}
@Test
@@ -296,7 +297,7 @@
.squash(small.getNumRegisterIndexBits());
assertEquals(small.count(), mush.count(), 0);
double delta = Math.ceil(small.getStandardError()*size);
- assertEquals((double) size, (double) mush.count(), delta);
+ assertEquals(size, mush.count(), delta);
}
}
}
@@ -316,7 +317,7 @@
}
p14HLL.squash(p10HLL.getNumRegisterIndexBits());
- assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0);
+ assertEquals(size, p14HLL.count(), longRangeTolerance * size / 100.0);
}
@Test
@@ -333,6 +334,26 @@
}
p14HLL.squash(p10HLL.getNumRegisterIndexBits());
- assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0);
+ assertEquals(size, p14HLL.count(), longRangeTolerance * size / 100.0);
}
+
+ @Test
+ public void testAbletoRetainAccuracyUpToSwitchThreshold() { // checks three cardinalities: 70, half the threshold, and the threshold itself
+ final int switchPoint = HyperLogLog.builder().setSizeOptimized().build().getEncodingSwitchThreshold();
+ for (int cardinality : new int[] {70, switchPoint / 2, switchPoint}) {
+ testRetainAccuracy(cardinality);
+ }
+ }
+
+ private void testRetainAccuracy(int numElements) { // precondition (asserted below): numElements <= encoding switch threshold
+ final HyperLogLog sketch = HyperLogLog.builder().setSizeOptimized().build();
+ assertTrue(numElements <= sketch.getEncodingSwitchThreshold());
+ for (int round = 0; round < 11; round++) { // feed the same values 11 times so every value is seen repeatedly
+ for (int value = 1; value <= numElements; value++) {
+ sketch.addLong(value);
+ }
+ }
+ assertEquals(numElements, sketch.estimateNumDistinctValues()); // the estimate is expected to be exact here
+ }
+
}