diff --git ql/src/test/queries/clientpositive/bucket_map_join_tez2.q ql/src/test/queries/clientpositive/bucket_map_join_tez2.q index 1361e32..46fd20d 100644 --- ql/src/test/queries/clientpositive/bucket_map_join_tez2.q +++ ql/src/test/queries/clientpositive/bucket_map_join_tez2.q @@ -37,9 +37,11 @@ analyze table tab_part compute statistics for columns; set hive.auto.convert.join.noconditionaltask.size=1500; set hive.convert.join.bucket.mapjoin.tez = false; +set hive.cbo.enable=false; explain select a.key, b.key from tab_part a join tab_part c on a.key = c.key join tab_part b on a.value = b.value; set hive.convert.join.bucket.mapjoin.tez = true; explain select a.key, b.key from tab_part a join tab_part c on a.key = c.key join tab_part b on a.value = b.value; +reset hive.cbo.enable; CREATE TABLE tab1(key int, value string) CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; insert overwrite table tab1 @@ -119,4 +121,4 @@ insert into small values (1),(2),(3),(4),(5),(6); insert into big partition(k=1) values(1),(3),(5),(7),(9); insert into big partition(k=2) values(0),(2),(4),(6),(8); explain select small.i, big.i from small,big where small.i=big.i; -select small.i, big.i from small,big where small.i=big.i order by small.i, big.i; \ No newline at end of file +select small.i, big.i from small,big where small.i=big.i order by small.i, big.i; diff --git standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java index 4e4dfb7..b630fa3 100644 --- standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java +++ standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java @@ -25,6 +25,7 @@ import org.apache.hadoop.hive.common.ndv.fm.FMSketch; import org.apache.hadoop.hive.common.ndv.fm.FMSketchUtils; import 
org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; +import org.apache.hadoop.hive.common.ndv.hll.HyperLogLogUtils; public class NumDistinctValueEstimatorFactory { @@ -44,7 +45,7 @@ public static NumDistinctValueEstimator getNumDistinctValueEstimator(byte[] buf) if (isFMSketch(buf)) { return FMSketchUtils.deserializeFM(buf); } else { - return HyperLogLog.builder().build().deserialize(buf); + return HyperLogLogUtils.deserializeHLL(buf); } } catch (IOException e) { throw new RuntimeException(e); @@ -56,7 +57,7 @@ public static NumDistinctValueEstimator getEmptyNumDistinctValueEstimator( if (n instanceof FMSketch) { return new FMSketch(((FMSketch) n).getNumBitVectors()); } else { - return HyperLogLog.builder().build(); + return HyperLogLog.builder().setSizeOptimized().build(); } } @@ -65,7 +66,7 @@ public static NumDistinctValueEstimator getEmptyNumDistinctValueEstimator(String if ("fm".equals(func.toLowerCase())) { return new FMSketch(numBitVectors); } else if ("hll".equals(func.toLowerCase())) { - return HyperLogLog.builder().build(); + return HyperLogLog.builder().setSizeOptimized().build(); } else { throw new RuntimeException("Can not recognize " + func); } diff --git standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java index 12897fc..422bfbe 100644 --- standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java +++ standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java @@ -62,6 +62,31 @@ public boolean add(long hashcode) { return set(registerIdx, (byte) lr); } + // this is a lossy invert of the function above, which produces a hashcode + // which collides with the current winner of the register (we lose all higher + // bits, but we get all bits useful for lesser p-bit options) + + // +-------------|-------------+ + // |xxxx100000000|1000000000000| (lr=9 + 
idx=1024) + // +-------------|-------------+ + // \ + // +---------------|-----------+ + // |xxxx10000000010|00000000000| (lr=2 + idx=0) + // +---------------|-----------+ + + // This shows the relevant bits of the original hash value + // and how the conversion is moving bits from the index value + // over to the leading zero computation + + public void extractLowBitsTo(HLLRegister dest) { + for (int idx = 0; idx < register.length; idx++) { + byte lr = register[idx]; // this can be a max of 65, never > 127 + if (lr != 0) { + dest.add((long) ((1 << (p + lr - 1)) | idx)); + } + } + } + public boolean set(int idx, byte value) { boolean updated = false; if (idx < register.length && value > register[idx]) { diff --git standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java index 82085dd..deaca9d 100644 --- standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java +++ standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.common.ndv.hll; import java.util.Map; +import java.util.Map.Entry; import java.util.TreeMap; public class HLLSparseRegister implements HLLRegister { @@ -199,6 +200,18 @@ public boolean set(int key, byte value) { return sparseMap; } + // this is effectively the same as the dense register impl. 
+ public void extractLowBitsTo(HLLRegister dest) { + for (Entry<Integer, Byte> entry : sparseMap.entrySet()) { + int idx = entry.getKey(); + byte lr = entry.getValue(); // this can be a max of 65, never > 127 + if (lr != 0) { + // should be a no-op for sparse + dest.add((long) ((1 << (p + lr - 1)) | idx)); + } + } + } + public int getP() { return p; } diff --git standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java index 8bdb47b..ec33691 100644 --- standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java +++ standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java @@ -18,10 +18,8 @@ package org.apache.hadoop.hive.common.ndv.hll; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.Map; @@ -163,6 +161,13 @@ public HyperLogLogBuilder setNumRegisterIndexBits(int b) { return this; } + public HyperLogLogBuilder setSizeOptimized() { + // allowing this to be increased via config breaks the merge impl + // p=10 = ~1kb per vector or smaller + this.numRegisterIndexBits = 10; + return this; + } + public HyperLogLogBuilder setEncoding(EncodingType enc) { this.encoding = enc; return this; } @@ -440,12 +445,23 @@ public void setHLLDenseRegister(byte[] reg) { * @throws IllegalArgumentException */ public void merge(HyperLogLog hll) { - if (p != hll.p || chosenHashBits != hll.chosenHashBits) { + if (chosenHashBits != hll.chosenHashBits) { throw new IllegalArgumentException( "HyperLogLog cannot be merged as either p or hashbits are different. 
Current: " + toString() + " Provided: " + hll.toString()); } + if (p > hll.p) { + throw new IllegalArgumentException( + "HyperLogLog cannot merge a smaller p into a larger one : " + + toString() + " Provided: " + hll.toString()); + } + + if (p != hll.p) { + // invariant: p > hll.p + hll = hll.squash(p); + } + EncodingType otherEncoding = hll.getEncoding(); if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.SPARSE)) { @@ -473,7 +489,37 @@ public void merge(HyperLogLog hll) { } /** - * Converts sparse to dense hll register + * Reduces the accuracy of the HLL provided to a smaller size + * @param p0 + * - new p size for the new HyperLogLog (smaller or no change) + * @return reduced (or same) HyperLogLog instance + */ + public HyperLogLog squash(final int p0) { + if (p0 > p) { + throw new IllegalArgumentException( + "HyperLogLog cannot be be squashed to be bigger. Current: " + + toString() + " Provided: " + p0); + } + + if (p0 == p) { + return this; + } + + final HyperLogLog hll = new HyperLogLogBuilder() + .setNumRegisterIndexBits(p0).setEncoding(EncodingType.DENSE) + .enableNoBias(noBias).build(); + final HLLDenseRegister result = hll.denseRegister; + + if (encoding == EncodingType.SPARSE) { + sparseRegister.extractLowBitsTo(result); + } else if (encoding == EncodingType.DENSE) { + denseRegister.extractLowBitsTo(result); + } + return hll; + } + + /** + * Converts sparse to dense hll register. 
* @param sparseRegister * - sparse register to be converted * @return converted dense register @@ -585,14 +631,7 @@ public void reset() { @Override public NumDistinctValueEstimator deserialize(byte[] buf) { - InputStream is = new ByteArrayInputStream(buf); - try { - HyperLogLog result = HyperLogLogUtils.deserializeHLL(is); - is.close(); - return result; - } catch (IOException e) { - throw new RuntimeException(e); - } + return HyperLogLogUtils.deserializeHLL(buf); } @Override diff --git standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java index 4e6510b..aeba2e9 100644 --- standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java +++ standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.common.ndv.hll; +import java.io.ByteArrayInputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; @@ -126,7 +127,7 @@ public static void serializeHLL(OutputStream out, HyperLogLog hll) throws IOExce } /** - * Refer serializeHLL() for format of serialization. This funtions + * Refer serializeHLL() for format of serialization. This function * deserializes the serialized hyperloglogs * @param in * - input stream @@ -198,6 +199,22 @@ public static HyperLogLog deserializeHLL(InputStream in) throws IOException { return result; } + /** + * This function deserializes the serialized hyperloglogs from a byte array. 
+ * @param buf - to deserialize + * @return HyperLogLog + */ + public static HyperLogLog deserializeHLL(final byte[] buf) { + InputStream is = new ByteArrayInputStream(buf); // TODO: use faster non-sync inputstream + try { + HyperLogLog result = deserializeHLL(is); + is.close(); + return result; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + private static void bitpackHLLRegister(OutputStream out, byte[] register, int bitWidth) throws IOException { int bitsLeft = 8; diff --git standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java index 617d9c3..e014fb5 100644 --- standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java +++ standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java @@ -37,14 +37,18 @@ public void testHLLDenseMerge() { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.DENSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; + double delta4 = threshold * (4*size) / 100; assertEquals((double) size, (double) hll.count(), delta); assertEquals((double) size, (double) hll2.count(), delta); @@ -63,8 +67,13 @@ public void testHLLDenseMerge() { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match + // valid merge -- register set size gets bigger (also 4k items) hll.merge(hll4); + assertEquals((double) 4 * size, (double) hll.count(), delta4); + assertEquals(EncodingType.DENSE, hll.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test(expected = IllegalArgumentException.class) @@ -74,14 +83,18 @@ public void testHLLSparseMerge() { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.SPARSE).build(); int size = 500; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; + double delta4 = threshold * (4*size) / 100; assertEquals((double) size, (double) hll.count(), delta); assertEquals((double) size, (double) hll2.count(), delta); @@ -100,8 +113,13 @@ public void testHLLSparseMerge() { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.SPARSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match + // valid merge -- register set size gets bigger & dense automatically hll.merge(hll4); + assertEquals((double) 4 * size, (double) hll.count(), delta4); + assertEquals(EncodingType.DENSE, hll.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test(expected = IllegalArgumentException.class) @@ -111,11 +129,14 @@ public void testHLLSparseDenseMerge() { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.DENSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; @@ -137,8 +158,13 @@ public void testHLLSparseDenseMerge() { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match - hll.merge(hll4); + // merge should convert hll2 to DENSE + hll2.merge(hll4); + assertEquals((double) 2 * size, (double) hll2.count(), delta); + assertEquals(EncodingType.DENSE, hll2.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test(expected = IllegalArgumentException.class) @@ -148,11 +174,14 @@ public void testHLLDenseSparseMerge() { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.SPARSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; @@ -174,8 +203,14 @@ public void testHLLDenseSparseMerge() { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match - hll.merge(hll4); + // merge should convert hll3 to DENSE + hll3.merge(hll4); + assertEquals((double) 2 * size, (double) hll3.count(), delta); + assertEquals(EncodingType.DENSE, hll3.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); + } @Test(expected = IllegalArgumentException.class) @@ -185,11 +220,14 @@ public void testHLLSparseOverflowMerge() { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.SPARSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; @@ -211,8 +249,13 @@ public void testHLLSparseOverflowMerge() { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match - hll.merge(hll4); + // merge should convert hll2 to DENSE + hll2.merge(hll4); + assertEquals((double) 2 * size, (double) hll2.count(), delta); + assertEquals(EncodingType.DENSE, hll2.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test @@ -227,4 +270,69 @@ public void testHLLSparseMoreRegisterBits() { double delta = threshold * size / 100; assertEquals((double) size, (double) hll.count(), delta); } + + @Test + public void testHLLSquash() { + + int[] sizes = new int[] { 500, 1000, 2300, 4096}; + int minBits = 9; + for (final int size : sizes) { + + HyperLogLog hlls[] = new HyperLogLog[16]; + for (int k = minBits; k < hlls.length; k++) { + final HyperLogLog hll = HyperLogLog.builder() + .setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(k).build(); + for (int i = 0; i < size; i++) { + hll.addLong(i); + } + hlls[k] = hll; + } + + for (int k = minBits; k < hlls.length; k++) { + for (int j = k + 1; j < hlls.length; j++) { + final HyperLogLog large = hlls[j]; + final HyperLogLog small = hlls[k]; + final HyperLogLog mush = large + .squash(small.getNumRegisterIndexBits()); + assertEquals(small.count(), mush.count(), 0); + double delta = Math.ceil(small.getStandardError()*size); + assertEquals((double) size, (double) mush.count(), delta); + } + } + } + } + + @Test + public void testHLLDenseDenseSquash() { + HyperLogLog p14HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(14).build(); + HyperLogLog p10HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(10).build(); + int size = 1_000_000; + for (int i = 0; i < size; i++) { + p14HLL.addLong(i); + } + 
+ for (int i = 0; i < 10_000; i++) { + p10HLL.addLong(i); + } + + p14HLL.squash(p10HLL.getNumRegisterIndexBits()); + assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0); + } + + @Test + public void testHLLSparseDenseSquash() { + HyperLogLog p14HLL = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).setNumRegisterIndexBits(14).build(); + HyperLogLog p10HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(10).build(); + int size = 2000; + for (int i = 0; i < size; i++) { + p14HLL.addLong(i); + } + + for (int i = 0; i < 10_000; i++) { + p10HLL.addLong(i); + } + + p14HLL.squash(p10HLL.getNumRegisterIndexBits()); + assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0); + } } diff --git standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogMerge.java standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogMerge.java new file mode 100644 index 0000000..2007c6f --- /dev/null +++ standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogMerge.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.common.ndv.hll; + +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import java.util.Collection; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class TestHyperLogLogMerge { + // 5% tolerance for estimated count + private float longRangeTolerance = 5.0f; + private float shortRangeTolerance = 2.0f; + + int size; + + @Parameterized.Parameters + public static Collection data() { + return Arrays.asList(new Object[][] { + { 1_000 }, { 10_000 }, { 100_000 }, { 1_000_000 }, { 10_000_000 } + // { 100_000_000 }, { 1_000_000_000 } 1B passed but is super slow + }); + } + + public TestHyperLogLogMerge(int size) { + this.size = size; + } + + @Test + public void testHLLMergeDisjoint() { + HyperLogLog hll1 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + for (int i = 0; i < size; i++) { + hll1.addLong(i); + } + HyperLogLog hll2 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + for (int i = size; i < 2 * size; i++) { + hll2.addLong(i); + } + hll1.merge(hll2); + double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; + double delta = threshold * size / 100; + long expected = 2 * size; + long actual = hll1.count(); + assertEquals(expected, actual, delta); + } + + @Test + public void testHLLMerge25PercentOverlap() { + HyperLogLog hll1 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + for (int i = 0; i < size; i++) { + hll1.addLong(i); + } + HyperLogLog hll2 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + int start = (int) (0.75 * size); + int end = (int) (size * 1.75); + for (int i = start; i < end; i++) { + hll2.addLong(i); + } + hll1.merge(hll2); + double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; + double delta = threshold * size / 100; + long expected = (long) (1.75 * size); + long actual = hll1.count(); + assertEquals(expected, actual, delta); + } + + @Test + public void testHLLMerge50PercentOverlap() { + HyperLogLog hll1 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + for (int i = 0; i < size; i++) { + hll1.addLong(i); + } + HyperLogLog hll2 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + int start = (int) (0.5 * size); + int end = (int) (size * 1.5); + for (int i = start; i < end; i++) { + hll2.addLong(i); + } + hll1.merge(hll2); + double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; + double delta = threshold * size / 100; + long expected = (long) (1.5 * size); + long actual = hll1.count(); + assertEquals(expected, actual, delta); + } + + + @Test + public void testHLLMerge75PercentOverlap() { + HyperLogLog hll1 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + for (int i = 0; i < size; i++) { + hll1.addLong(i); + } + HyperLogLog hll2 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + int start = (int) (0.25 * size); + int end = (int) (size * 1.25); + for (int i = start; i < end; i++) { + hll2.addLong(i); + } + hll1.merge(hll2); + double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; + double delta = threshold * size / 100; + long expected = (long) (1.25 * size); + long actual = hll1.count(); + assertEquals(expected, actual, delta); + } + + + @Test + public void testHLLMerge100PercentOverlap() { + HyperLogLog hll1 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + for (int i = 0; i < size; i++) { + hll1.addLong(i); + } + HyperLogLog hll2 = HyperLogLog.builder().setNumRegisterIndexBits(16).build(); + for (int i = 0; i < size; i++) { + hll2.addLong(i); + } + hll1.merge(hll2); + double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; + double delta = threshold * size / 100; + long expected = size; + long actual = hll1.count(); + assertEquals(expected, actual, delta); + } + +}