diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java index 82085dd..33e97ed 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java @@ -25,11 +25,6 @@ private TreeMap sparseMap; - // for a better insertion performance values are added to temporary unsorted - // list which will be merged to sparse map after a threshold - private int[] tempList; - private int tempListIdx; - // number of register bits private final int p; @@ -47,8 +42,6 @@ public HLLSparseRegister(int p, int pp, int qp) { this.p = p; this.sparseMap = new TreeMap<>(); - this.tempList = new int[HLLConstants.TEMP_LIST_DEFAULT_SIZE]; - this.tempListIdx = 0; this.pPrime = pp; this.qPrime = qp; this.mask = ((1 << pPrime) - 1) ^ ((1 << p) - 1); @@ -57,46 +50,23 @@ public HLLSparseRegister(int p, int pp, int qp) { } public boolean add(long hashcode) { - boolean updated = false; - - // fill the temp list before merging to sparse map - if (tempListIdx < tempList.length) { - int encodedHash = encodeHash(hashcode); - tempList[tempListIdx++] = encodedHash; - updated = true; + boolean updated; + + int encodedHash = encodeHash(hashcode); + int key = encodedHash & pPrimeMask; + byte value = (byte) (encodedHash >>> pPrime); + byte nr; + // if MSB is set to 1 then next qPrime MSB bits contains the value of + // number of zeroes. + // if MSB is set to 0 then number of zeroes is contained within pPrime - p + // bits. + if (encodedHash < 0) { + nr = (byte) (value & qPrimeMask); } else { - updated = mergeTempListToSparseMap(); - } - - return updated; - } - - /** - * Adds temp list to sparse map. The key for sparse map entry is the register - * index determined by pPrime and value is the number of trailing zeroes. - * @return - */ - private boolean mergeTempListToSparseMap() { - boolean updated = false; - for (int i = 0; i < tempListIdx; i++) { - int encodedHash = tempList[i]; - int key = encodedHash & pPrimeMask; - byte value = (byte) (encodedHash >>> pPrime); - byte nr = 0; - // if MSB is set to 1 then next qPrime MSB bits contains the value of - // number of zeroes. - // if MSB is set to 0 then number of zeroes is contained within pPrime - p - // bits. - if (encodedHash < 0) { - nr = (byte) (value & qPrimeMask); - } else { - nr = (byte) (Integer.numberOfTrailingZeros(encodedHash >>> p) + 1); - } - updated = set(key, nr); + nr = (byte) (Integer.numberOfTrailingZeros(encodedHash >>> p) + 1); } + updated = set(key, nr); - // reset temp list index - tempListIdx = 0; return updated; } @@ -148,11 +118,6 @@ public int encodeHash(long hashcode) { } public int getSize() { - - // merge temp list before getting the size of sparse map - if (tempListIdx != 0) { - mergeTempListToSparseMap(); - } return sparseMap.size(); } @@ -175,13 +140,11 @@ public boolean set(int key, byte value) { boolean updated = false; // retain only the largest value for a register index - if (sparseMap.containsKey(key)) { - byte containedVal = sparseMap.get(key); - if (value > containedVal) { - sparseMap.put(key, value); - updated = true; - } - } else { + Byte containedVal = sparseMap.get(key); + if (containedVal == null) { + sparseMap.put(key, value); + updated = true; + } else if (value > containedVal) { sparseMap.put(key, value); updated = true; } @@ -192,13 +155,6 @@ public boolean set(int key, byte value) { return sparseMap; } - public TreeMap getMergedSparseMap() { - if (tempListIdx != 0) { - mergeTempListToSparseMap(); - } - return sparseMap; - } - public int getP() { return p; } @@ -230,16 +186,9 @@ public boolean equals(Object obj) { return false; } HLLSparseRegister other = (HLLSparseRegister) obj; - boolean result = p == other.p && pPrime == other.pPrime && qPrime == other.qPrime - && tempListIdx == other.tempListIdx; + boolean result = p == other.p && pPrime == other.pPrime && qPrime == other.qPrime; if (result) { - for (int i = 0; i < tempListIdx; i++) { - if (tempList[i] != other.tempList[i]) { - return false; - } - } - - result = result && sparseMap.equals(other.sparseMap); + result = sparseMap.equals(other.sparseMap); } return result; } @@ -250,9 +199,6 @@ public int hashCode() { hashcode += 31 * p; hashcode += 31 * pPrime; hashcode += 31 * qPrime; - for (int i = 0; i < tempListIdx; i++) { - hashcode += 31 * tempList[tempListIdx]; - } hashcode += sparseMap.hashCode(); return hashcode; } diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java index 8bdb47b..e95e211 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java @@ -278,7 +278,8 @@ public void add(long hashcode) { public long estimateNumDistinctValues() { // FMSketch treats the ndv of all nulls as 1 but hll treates the ndv as 0. // In order to get rid of divide by 0 problem, we follow FMSketch - return count() > 0 ? count() : 1; + long count = count(); + return count > 0 ? count : 1; } public long count() {