diff --git standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java index d5ac54a..d454e37 100644 --- standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java +++ standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java @@ -104,21 +104,21 @@ /** *
    * Input: 64 bit hashcode
-   * 
+   *
    * |---------w-------------| |------------p'----------|
    * 10101101.......1010101010 10101010101 01010101010101
    *                                       |------p-----|
-   *                                       
+   *
    * Output: 32 bit int
-   * 
+   *
    * |b| |-q'-|  |------------p'----------|
    *  1  010101  01010101010 10101010101010
    *                         |------p-----|
-   *                    
-   * 
+   *
+   *
    * The default values of p', q' and b are 25, 6, 1 (total 32 bits) respectively.
    * This function will return an int encoded in the following format
-   * 
+   *
    * p  - LSB p bits represent the register index
    * p' - LSB p' bits are used for increased accuracy in estimation
    * q' - q' bits after p' are left as such from the hashcode if b = 0 else
@@ -148,8 +148,12 @@
     }
   }
 
-  public int getSize() {
-    return sparseMap.size() + tempListIdx;
+  public boolean isSizeGreaterThan(int s) { // side effect: may merge the temp list into sparseMap
+    if (sparseMap.size() + tempListIdx <= s) { // map size plus tempListIdx within s: no merge needed
+      return false;
+    }
+    mergeTempListToSparseMap(); // merge first, then judge by the map's own size
+    return sparseMap.size() > s;
   }
 
   public void merge(HLLRegister hllRegister) {
@@ -195,7 +199,7 @@
       byte lr = entry.getValue(); // this can be a max of 65, never > 127
       if (lr != 0) {
         // should be a no-op for sparse
-        dest.add((long) ((1 << (p + lr - 1)) | idx));
+        dest.add((1 << (p + lr - 1)) | idx);
       }
     }
   }
diff --git standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java
index 91a6865..edf587f 100644
--- standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java
+++ standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java
@@ -20,7 +20,6 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.Map;
 import java.util.TreeMap;
@@ -30,17 +29,19 @@
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.hive.common.util.Murmur3;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /**
  * 
  * This is an implementation of the following variants of hyperloglog (HLL)
- * algorithm 
+ * algorithm
  * Original  - Original HLL algorithm from Flajolet et. al from
  *             http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
  * HLLNoBias - Google's implementation of bias correction based on lookup table
  *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
  * HLL++     - Google's implementation of HLL++ algorithm that uses SPARSE registers
  *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
- * 
+ *
  * Following are the constructor parameters that determines which algorithm is
  * used
  * numRegisterIndexBits - number of LSB hashcode bits to be used as register index.
@@ -194,7 +195,7 @@
     } else if (hashBits <= 64) {
       alphaMM = 0.709f;
     } else {
-      alphaMM = 0.7213f / (float) (1 + 1.079f / m);
+      alphaMM = 0.7213f / (1 + 1.079f / m);
     }
 
     // For efficiency alpha is multiplied by m^2
@@ -258,7 +259,7 @@
 
       // if size of sparse map excess the threshold convert the sparse map to
       // dense register and switch to DENSE encoding
-      if (sparseRegister.getSize() > encodingSwitchThreshold) {
+      if (sparseRegister.isSizeGreaterThan(encodingSwitchThreshold)) {
         encoding = EncodingType.DENSE;
         denseRegister = sparseToDenseRegister(sparseRegister);
         sparseRegister = null;
@@ -386,7 +387,7 @@
   }
 
   private long linearCount(int mVal, long numZeros) {
-    return (long) (Math.round(mVal * Math.log(mVal / ((double) numZeros))));
+    return (Math.round(mVal * Math.log(mVal / ((double) numZeros)))); // Math.round(double) already returns long
   }
 
   // refer paper
@@ -459,7 +460,7 @@
       sparseRegister.merge(hll.getHLLSparseRegister());
       // if after merge the sparse switching threshold is exceeded then change
       // to dense encoding
-      if (sparseRegister.getSize() > encodingSwitchThreshold) {
+      if (sparseRegister.isSizeGreaterThan(encodingSwitchThreshold)) {
         encoding = EncodingType.DENSE;
         denseRegister = sparseToDenseRegister(sparseRegister);
         sparseRegister = null;
@@ -481,7 +482,7 @@
 
   /**
    * Reduces the accuracy of the HLL provided to a smaller size
-   * @param p0 
+   * @param p0
    *         - new p size for the new HyperLogLog (smaller or no change)
    * @return reduced (or same) HyperLogLog instance
    */
@@ -661,4 +662,9 @@
     return o instanceof HyperLogLog;
   }
 
+  @VisibleForTesting // accessor intended for tests, which size their inputs relative to this value
+  public int getEncodingSwitchThreshold() {
+    return encodingSwitchThreshold; // the limit the SPARSE-to-DENSE switch checks compare against
+  }
+
 }
diff --git standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
index e014fb5..e720ec8 100644
--- standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
+++ standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
@@ -18,6 +18,7 @@
 package org.apache.hadoop.hive.common.ndv.hll;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.EncodingType;
 import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
@@ -49,27 +50,27 @@
     double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
     double delta = threshold * size / 100;
     double delta4 = threshold * (4*size) / 100;
-    assertEquals((double) size, (double) hll.count(), delta);
-    assertEquals((double) size, (double) hll2.count(), delta);
+    assertEquals(size, hll.count(), delta);
+    assertEquals(size, hll2.count(), delta);
 
     // merge
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // merge should update registers and hence the count
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // new merge
     hll.merge(hll3);
-    assertEquals((double) 3 * size, (double) hll.count(), delta);
+    assertEquals((double) 3 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
-    // valid merge -- register set size gets bigger (also 4k items 
+    // valid merge -- register set size gets bigger (also 4k items)
     hll.merge(hll4);
-    assertEquals((double) 4 * size, (double) hll.count(), delta4);
+    assertEquals((double) 4 * size, hll.count(), delta4);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // invalid merge -- smaller register merge to bigger
@@ -95,27 +96,27 @@
     double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
     double delta = threshold * size / 100;
     double delta4 = threshold * (4*size) / 100;
-    assertEquals((double) size, (double) hll.count(), delta);
-    assertEquals((double) size, (double) hll2.count(), delta);
+    assertEquals(size, hll.count(), delta);
+    assertEquals(size, hll2.count(), delta);
 
     // merge
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.SPARSE, hll.getEncoding());
 
     // merge should update registers and hence the count
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.SPARSE, hll.getEncoding());
 
     // new merge
     hll.merge(hll3);
-    assertEquals((double) 3 * size, (double) hll.count(), delta);
+    assertEquals((double) 3 * size, hll.count(), delta);
     assertEquals(EncodingType.SPARSE, hll.getEncoding());
 
     // valid merge -- register set size gets bigger & dense automatically
     hll.merge(hll4);
-    assertEquals((double) 4 * size, (double) hll.count(), delta4);
+    assertEquals((double) 4 * size, hll.count(), delta4);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // invalid merge -- smaller register merge to bigger
@@ -140,27 +141,27 @@
     }
     double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
     double delta = threshold * size / 100;
-    assertEquals((double) size, (double) hll.count(), delta);
-    assertEquals((double) size, (double) hll2.count(), delta);
+    assertEquals(size, hll.count(), delta);
+    assertEquals(size, hll2.count(), delta);
 
     // sparse-sparse merge
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.SPARSE, hll.getEncoding());
 
     // merge should update registers and hence the count
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.SPARSE, hll.getEncoding());
 
     // sparse-dense merge
     hll.merge(hll3);
-    assertEquals((double) 3 * size, (double) hll.count(), delta);
+    assertEquals((double) 3 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // merge should convert hll2 to DENSE
     hll2.merge(hll4);
-    assertEquals((double) 2 * size, (double) hll2.count(), delta);
+    assertEquals((double) 2 * size, hll2.count(), delta);
     assertEquals(EncodingType.DENSE, hll2.getEncoding());
 
     // invalid merge -- smaller register merge to bigger
@@ -185,27 +186,27 @@
     }
     double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
     double delta = threshold * size / 100;
-    assertEquals((double) size, (double) hll.count(), delta);
-    assertEquals((double) size, (double) hll2.count(), delta);
+    assertEquals(size, hll.count(), delta);
+    assertEquals(size, hll2.count(), delta);
 
     // sparse-sparse merge
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // merge should update registers and hence the count
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // sparse-dense merge
     hll.merge(hll3);
-    assertEquals((double) 3 * size, (double) hll.count(), delta);
+    assertEquals((double) 3 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // merge should convert hll3 to DENSE
     hll3.merge(hll4);
-    assertEquals((double) 2 * size, (double) hll3.count(), delta);
+    assertEquals((double) 2 * size, hll3.count(), delta);
     assertEquals(EncodingType.DENSE, hll3.getEncoding());
 
     // invalid merge -- smaller register merge to bigger
@@ -231,27 +232,27 @@
     }
     double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
     double delta = threshold * size / 100;
-    assertEquals((double) size, (double) hll.count(), delta);
-    assertEquals((double) size, (double) hll2.count(), delta);
+    assertEquals(size, hll.count(), delta);
+    assertEquals(size, hll2.count(), delta);
 
     // sparse-sparse merge
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.SPARSE, hll.getEncoding());
 
     // merge should update registers and hence the count
     hll.merge(hll2);
-    assertEquals((double) 2 * size, (double) hll.count(), delta);
+    assertEquals((double) 2 * size, hll.count(), delta);
     assertEquals(EncodingType.SPARSE, hll.getEncoding());
 
     // sparse-sparse overload to dense
     hll.merge(hll3);
-    assertEquals((double) 3 * size, (double) hll.count(), delta);
+    assertEquals((double) 3 * size, hll.count(), delta);
     assertEquals(EncodingType.DENSE, hll.getEncoding());
 
     // merge should convert hll2 to DENSE
     hll2.merge(hll4);
-    assertEquals((double) 2 * size, (double) hll2.count(), delta);
+    assertEquals((double) 2 * size, hll2.count(), delta);
     assertEquals(EncodingType.DENSE, hll2.getEncoding());
 
     // invalid merge -- smaller register merge to bigger
@@ -268,7 +269,7 @@
     }
     double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
     double delta = threshold * size / 100;
-    assertEquals((double) size, (double) hll.count(), delta);
+    assertEquals(size, hll.count(), delta);
   }
 
   @Test
@@ -296,7 +297,7 @@
               .squash(small.getNumRegisterIndexBits());
           assertEquals(small.count(), mush.count(), 0);
           double delta = Math.ceil(small.getStandardError()*size);
-          assertEquals((double) size, (double) mush.count(), delta);
+          assertEquals(size, mush.count(), delta);
         }
       }
     }
@@ -316,7 +317,7 @@
     }
 
     p14HLL.squash(p10HLL.getNumRegisterIndexBits());
-    assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0);
+    assertEquals(size, p14HLL.count(), longRangeTolerance * size / 100.0);
   }
 
   @Test
@@ -333,6 +334,26 @@
     }
 
     p14HLL.squash(p10HLL.getNumRegisterIndexBits());
-    assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0);
+    assertEquals(size, p14HLL.count(), longRangeTolerance * size / 100.0);
   }
+
+  @Test
+  public void testAbletoRetainAccuracyUpToSwitchThreshold() {
+    final int limit = HyperLogLog.builder().setSizeOptimized().build().getEncodingSwitchThreshold();
+    for (int numElements : new int[] {70, limit / 2, limit}) { // fixed 70, half the threshold, the threshold
+      testRetainAccuracy(numElements);
+    }
+  }
+
+  private void testRetainAccuracy(int numElements) {
+    HyperLogLog h = HyperLogLog.builder().setSizeOptimized().build();
+    assertTrue(numElements <= h.getEncodingSwitchThreshold());
+    for (int ia = 0; ia <= 10; ia++) {
+      for (int i = 1; i <= numElements; i++) {
+        h.addLong(i);
+      }
+    }
+    assertEquals(numElements, h.estimateNumDistinctValues());
+  }
+
 }