From 0b100d9e1e3916bed2f387ac06688ec42b18e9c8 Mon Sep 17 00:00:00 2001
From: chenchang
Date: Thu, 1 Nov 2018 14:36:59 +0800
Subject: [PATCH] KYLIN-3656 Improve HLLCounter performance

---
Review notes (text between the "---" marker and the first diff is not part
of the commit message):

 * Layout: this patch had been flattened onto a few very long lines. Line
   breaks and indentation were rebuilt from the hunk headers, assuming
   4-space indentation; blank context lines were placed so that every hunk
   matches its declared line counts. Confirm against the target tree
   before applying.
 * HLLCounter.harmonicMean is filled with Math.scalb(1.0, -i) instead of
   1.0 / (1L << i). Both give the same value for i <= 62. A long shift only
   uses the low six bits of its distance, so the shift form produced a
   negative entry at i = 63 and 1.0 at i = 64. The array is now final.
 * HLLCSnapshot masks each register byte with 0xFF before using it as an
   index into the 256-slot histogram, so a negative byte cannot produce a
   negative index.
 * HLLCounterTest.generateTestCounter declares its collections with an
   Integer type argument instead of raw types.
 * Every edit keeps the number of added/removed lines per hunk, so the hunk
   headers and the diffstat below are unchanged. The post-image blob ids on
   the "index" lines were not recomputed.

 .../kylin/measure/hllc/DenseRegister.java     |  5 ++
 .../apache/kylin/measure/hllc/HLLCounter.java | 27 ++++++--
 .../kylin/measure/hllc/HLLCounterTest.java    | 68 +++++++++++++++++++
 3 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
index d34fef63a..5c192cc7f 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
@@ -35,6 +35,11 @@ public class DenseRegister implements Register, java.io.Serializable {
         this.register = new byte[m];
     }
 
+    public void copyFrom(DenseRegister r){
+        assert m == r.m; // both registers must hold the same number of slots
+        System.arraycopy(r.register, 0, register, 0 , register.length);
+    }
+
     public void set(int pos, byte value) {
         register[pos] = value;
     }
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
index 80bbb2a9c..2b187d2b0 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
@@ -32,6 +32,13 @@ import java.util.Map;
 @SuppressWarnings("serial")
 public class HLLCounter implements Serializable, Comparable {
 
+    static final double[] harmonicMean = new double[256]; // harmonicMean[i] == 2^-i; slot 0 stays 0
+    static {
+        for (int i = 1; i < 256; i++) {
+            harmonicMean[i] = Math.scalb(1.0, -i); // exact for all i; 1L << i is wrong from i = 63 on
+        }
+    }
+
     // not final for test purpose
     static double OVERFLOW_FACTOR = 0.01;
 
@@ -57,7 +64,11 @@ public class HLLCounter implements Serializable, Comparable {
 
     public HLLCounter(HLLCounter another) {
         this(another.p, another.getRegisterType(), another.hashFunc);
-        merge(another);
+        if (another.getRegisterType() == RegisterType.DENSE) {
+            ((DenseRegister) register).copyFrom((DenseRegister) another.register); // bulk array copy
+        } else {
+            merge(another);
+        }
     }
 
     public HLLCounter(int p, RegisterType type) {
@@ -202,6 +213,8 @@ public class HLLCounter implements Serializable, Comparable {
         int zeroBuckets;
 
         public HLLCSnapshot(HLLCounter hllc) {
+            int[] registerNums = new int[256]; // registerNums[v] = number of registers holding value v
+
             p = (byte) hllc.p;
             registerSum = 0;
             zeroBuckets = 0;
@@ -216,13 +229,13 @@ public class HLLCounter implements Serializable, Comparable {
             }
             byte[] registers = dr.getRawRegister();
             for (int i = 0; i < hllc.m; i++) {
-                if (registers[i] == 0) {
-                    registerSum++;
-                    zeroBuckets++;
-                } else {
-                    registerSum += 1.0 / (1L << registers[i]);
-                }
+                registerNums[registers[i] & 0xFF]++; // mask: Java bytes are signed
             }
+            zeroBuckets = registerNums[0];
+            for (int i = 1; i < 256; i++) {
+                registerSum += registerNums[i] * harmonicMean[i];
+            }
+            registerSum += zeroBuckets; // each empty register contributes 1, as before
         }
 
         public long getCountEstimate() {
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
index 92f2aab27..6affbbcbb 100644
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
@@ -68,6 +69,73 @@ public class HLLCounterTest {
         assertTrue(hllc.getCountEstimate() > 10 * 0.9);
     }
 
+    @Test
+    public void countPerformanceWithLargeCardinality(){
+        HLLCounter hllc = generateTestCounter(2009, 1000*1000);
+        final int expectedEstimate = 1000664; // with this input the estimate will be 1000664
+        final int testCount = 144*90; // 90(days) * 144(10 minute granularity)
+        countEstimatePerformance(hllc, expectedEstimate, testCount);
+    }
+    @Test
+    public void countPerformanceSmallCardinality(){
+        HLLCounter hllc = generateTestCounter(2009, 300*1000);
+        final int expectedEstimate = 300603; // with this input the estimate will be 300603
+        final int testCount = 144*90; // 90(days) * 144(10 minute granularity)
+        countEstimatePerformance(hllc, expectedEstimate, testCount);
+    }
+
+    private void countEstimatePerformance(HLLCounter hllc, int expectedEstimate, int testCount) {
+        long start = System.currentTimeMillis();
+        for (int i = 0; i < testCount; i++)
+            hllc.getCountEstimate();
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("count cost time : " + totalTime);
+
+        long estimate = hllc.getCountEstimate();
+        assertEquals(expectedEstimate, estimate);
+        System.out.println("estimate is " + estimate);
+    }
+
+    private HLLCounter generateTestCounter(int seed, int maxDistinctCounts) {
+        long start = System.currentTimeMillis();
+        Random rand1 = new Random(seed);
+        Set<Integer> rawData = new HashSet<>();
+        while (rawData.size() < maxDistinctCounts)
+            rawData.add(rand1.nextInt());
+        ArrayList<Integer> testData = new ArrayList<>(rawData);
+        assertEquals(maxDistinctCounts, testData.size());
+
+        HLLCounter hllc = new HLLCounter(16, RegisterType.DENSE);
+
+        for (int j = 0; j < testData.size(); j++) {
+            hllc.add(testData.get(j));
+        }
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("generate data cost time : " + totalTime);
+        return hllc;
+    }
+
+    @Test
+    public void createHLLCPerformance(){
+
+        HLLCounter hllc = generateTestCounter(2009, 1000*1000);
+        final int expectedEstimate = 1000664; // with this input the estimate will be 1000664
+        final int testCount = 144*90; // 90(days) * 144(10 minute granularity)
+
+        HLLCounter hllc2 = null;
+        long start = System.currentTimeMillis();
+        for (int i = 0; i