entry : sparseRegister.getSparseMap().entrySet()) {
+ int key = entry.getKey();
+ int idx = key & pMask;
+ result.set(idx, entry.getValue());
+ }
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("Encoding: ");
+ sb.append(encoding);
+ sb.append(", p: ");
+ sb.append(p);
+ sb.append(", estimatedCardinality: ");
+ sb.append(estimateNumDistinctValues());
+ return sb.toString();
+ }
+
+ public String toStringExtended() {
+ if (encoding.equals(EncodingType.DENSE)) {
+ return toString() + ", " + denseRegister.toExtendedString();
+ } else if (encoding.equals(EncodingType.SPARSE)) {
+ return toString() + ", " + sparseRegister.toExtendedString();
+ }
+
+ return toString();
+ }
+
+ public int getNumRegisterIndexBits() {
+ return p;
+ }
+
+ public EncodingType getEncoding() {
+ return encoding;
+ }
+
+ public void setEncoding(EncodingType encoding) {
+ this.encoding = encoding;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (!(obj instanceof HyperLogLog)) {
+ return false;
+ }
+
+ HyperLogLog other = (HyperLogLog) obj;
+ long count = estimateNumDistinctValues();
+ long otherCount = other.estimateNumDistinctValues();
+ boolean result = p == other.p && chosenHashBits == other.chosenHashBits
+ && encoding.equals(other.encoding) && count == otherCount;
+ if (encoding.equals(EncodingType.DENSE)) {
+ result = result && denseRegister.equals(other.getHLLDenseRegister());
+ }
+
+ if (encoding.equals(EncodingType.SPARSE)) {
+ result = result && sparseRegister.equals(other.getHLLSparseRegister());
+ }
+ return result;
+ }
+
+ @Override
+ public int hashCode() {
+ int hashcode = 0;
+ hashcode += 31 * p;
+ hashcode += 31 * chosenHashBits;
+ hashcode += encoding.hashCode();
+ hashcode += 31 * estimateNumDistinctValues();
+ if (encoding.equals(EncodingType.DENSE)) {
+ hashcode += 31 * denseRegister.hashCode();
+ }
+
+ if (encoding.equals(EncodingType.SPARSE)) {
+ hashcode += 31 * sparseRegister.hashCode();
+ }
+ return hashcode;
+ }
+
+ @Override
+ public void reset() {
+ }
+
+ @Override
+ public String serialize() {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ // write bytes to bos ...
+ try {
+ HyperLogLogUtils.serializeHLL(bos, this);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ return Base64.encodeBase64String(bos.toByteArray());
+ }
+
+ @Override
+ public NumDistinctValueEstimator deserialize(String s) {
+ InputStream is = new ByteArrayInputStream(Base64.decodeBase64(s));
+ try {
+ return HyperLogLogUtils.deserializeHLL(is);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void addToEstimator(long v) {
+ addLong(v);
+ }
+
+ @Override
+ public void addToEstimator(String s) {
+ addString(s);
+ }
+
+ @Override
+ public void addToEstimator(double d) {
+ addDouble(d);
+ }
+
+ @Override
+ public void addToEstimator(HiveDecimal decimal) {
+ addDouble(decimal.doubleValue());
+ }
+
+ @Override
+ public void mergeEstimators(NumDistinctValueEstimator o) {
+ merge((HyperLogLog) o);
+ }
+
+ @Override
+ public int lengthFor(JavaDataModel model) {
+ // 5 is the head, 1<
+ * |-4 byte-|------varlong----|varint (optional)|----------|
+ * ---------------------------------------------------------
+ * | header | estimated-count | register-length | register |
+ * ---------------------------------------------------------
+ *
+ * 4 byte header is encoded like below
+ * 3 bytes - HLL magic string to identify serialized stream
+ * 4 bits - p (number of bits to be used as register index)
+ * 1 bit - spare bit (not used)
+ * 3 bits - encoding (000 - sparse, 001..110 - n bit packing, 111 - no bit packing)
+ *
+ * The header is followed by three fields that are needed to reconstruct
+ * the hyperloglog
+ * Estimated count - variable length long to store the last computed estimate.
+ * This is just for quick lookup without deserializing registers
+ * Register length - number of entries in the register (required only for
+ * the sparse representation; for bit packing, the register
+ * length can be derived from p)
+ *
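+ * For example, with p = 14 and a dense register packed at 6 bits per
+ * entry, the fourth header byte is (14 << 4) | 6 = 0xE6.
+ *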
+ * @param out
+ * - output stream to write to
+ * @param hll
+ * - hyperloglog that needs to be serialized
+ * @throws IOException
+ */
+ public static void serializeHLL(OutputStream out, HyperLogLog hll) throws IOException {
+
+ // write header
+ out.write(MAGIC);
+ int fourthByte = 0;
+ int p = hll.getNumRegisterIndexBits();
+ fourthByte = (p & 0xff) << 4;
+
+ int bitWidth = 0;
+ EncodingType enc = hll.getEncoding();
+
+ // determine bit width for bitpacking and encode it in header
+ if (enc.equals(EncodingType.DENSE)) {
+ int lzr = hll.getHLLDenseRegister().getMaxRegisterValue();
+ bitWidth = getBitWidth(lzr);
+
+ // the maximum run of zeroes in a 64 bit hash can be encoded using only
+ // 6 bits, so bit packing is disabled for any bit width greater than 6
+ if (bitWidth > 6) {
+ fourthByte |= 7;
+ bitWidth = 8;
+ } else {
+ fourthByte |= (bitWidth & 7);
+ }
+ }
+
+ // write fourth byte of header
+ out.write(fourthByte);
+
+ // write estimated count
+ long estCount = hll.estimateNumDistinctValues();
+ writeVulong(out, estCount);
+
+ // serialize dense/sparse registers. Dense registers are bitpacked whereas
+ // sparse registers are delta and variable length encoded
+ if (enc.equals(EncodingType.DENSE)) {
+ byte[] register = hll.getHLLDenseRegister().getRegister();
+ bitpackHLLRegister(out, register, bitWidth);
+ } else if (enc.equals(EncodingType.SPARSE)) {
+ TreeMap<Integer, Byte> sparseMap = hll.getHLLSparseRegister().getSparseMap();
+
+ // write the number of elements in sparse map (required for
+ // reconstruction)
+ writeVulong(out, sparseMap.size());
+
+ // compute deltas and write the values as varints
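+ // each entry is packed as (key << Q_PRIME_VALUE) | value; since the
+ // TreeMap iterates keys in ascending order the packed values only grow,
+ // so the deltas stay small and encode compactly as varints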
+ int prev = 0;
+ for (Map.Entry<Integer, Byte> entry : sparseMap.entrySet()) {
+ if (prev == 0) {
+ prev = (entry.getKey() << HLLConstants.Q_PRIME_VALUE) | entry.getValue();
+ writeVulong(out, prev);
+ } else {
+ int curr = (entry.getKey() << HLLConstants.Q_PRIME_VALUE) | entry.getValue();
+ int delta = curr - prev;
+ writeVulong(out, delta);
+ prev = curr;
+ }
+ }
+ }
+ }
+
+ /**
+ * Refer to serializeHLL() for the serialization format. This function
+ * deserializes a serialized hyperloglog.
+ * @param in
+ * - input stream
+ * @return deserialized hyperloglog
+ * @throws IOException
+ */
+ public static HyperLogLog deserializeHLL(InputStream in) throws IOException {
+ checkMagicString(in);
+ int fourthByte = in.read() & 0xff;
+ int p = fourthByte >>> 4;
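+ // the upper 4 bits of the fourth header byte carry p; the lower 3 bits
+ // carry the encoding/bit-width written by serializeHLL()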
+
+ // read type of encoding
+ int enc = fourthByte & 7;
+ EncodingType encoding = null;
+ int bitSize = 0;
+ if (enc == 0) {
+ encoding = EncodingType.SPARSE;
+ } else if (enc > 0 && enc < 7) {
+ bitSize = enc;
+ encoding = EncodingType.DENSE;
+ } else {
+ // bit packing disabled
+ bitSize = 8;
+ encoding = EncodingType.DENSE;
+ }
+
+ // estimated count
+ long estCount = readVulong(in);
+
+ HyperLogLog result = null;
+ if (encoding.equals(EncodingType.SPARSE)) {
+ result = HyperLogLog.builder().setNumRegisterIndexBits(p)
+ .setEncoding(EncodingType.SPARSE).build();
+ int numRegisterEntries = (int) readVulong(in);
+ int[] reg = new int[numRegisterEntries];
+ int prev = 0;
+
+ // reconstruct the sparse map from delta encoded and varint input stream
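+ // the first varint is the full packed (index, value) entry; every
+ // subsequent varint is a delta from the previous packed entry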
+ if (numRegisterEntries > 0) {
+ prev = (int) readVulong(in);
+ reg[0] = prev;
+ }
+ int delta = 0;
+ int curr = 0;
+ for (int i = 1; i < numRegisterEntries; i++) {
+ delta = (int) readVulong(in);
+ curr = prev + delta;
+ reg[i] = curr;
+ prev = curr;
+ }
+ result.setHLLSparseRegister(reg);
+ } else {
+
+ // explicitly disable bit packing
+ if (bitSize == 8) {
+ result = HyperLogLog.builder().setNumRegisterIndexBits(p)
+ .setEncoding(EncodingType.DENSE).enableBitPacking(false).build();
+ } else {
+ result = HyperLogLog.builder().setNumRegisterIndexBits(p)
+ .setEncoding(EncodingType.DENSE).enableBitPacking(true).build();
+ }
+ int m = 1 << p;
+ byte[] register = unpackHLLRegister(in, m, bitSize);
+ result.setHLLDenseRegister(register);
+ }
+
+ result.setCount(estCount);
+
+ return result;
+ }
+
+ private static void bitpackHLLRegister(OutputStream out, byte[] register, int bitWidth)
+ throws IOException {
+ int bitsLeft = 8;
+ byte current = 0;
+
+ if (bitWidth == 8) {
+ fastPathWrite(out, register);
+ return;
+ }
+
+ // write the blob
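+ // pack each register value into bitWidth bits, filling each output byte
+ // from its most significant end; e.g. with bitWidth = 6, four register
+ // values fit into three output bytes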
+ for (byte value : register) {
+ int bitsToWrite = bitWidth;
+ while (bitsToWrite > bitsLeft) {
+ // add the bits to the bottom of the current word
+ current |= value >>> (bitsToWrite - bitsLeft);
+ // subtract out the bits we just added
+ bitsToWrite -= bitsLeft;
+ // zero out the bits above bitsToWrite
+ value &= (1 << bitsToWrite) - 1;
+ out.write(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ bitsLeft -= bitsToWrite;
+ current |= value << bitsLeft;
+ if (bitsLeft == 0) {
+ out.write(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ }
+
+ out.flush();
+ }
+
+ private static void fastPathWrite(OutputStream out, byte[] register) throws IOException {
+ for (byte b : register) {
+ out.write(b);
+ }
+ }
+
+ /**
+ * Unpack the bitpacked HyperLogLog register.
+ * @param in
+ * - input stream
+ * @param length
+ * - number of register entries to unpack
+ * @param bitSize
+ * - bit width used when packing the register
+ * @return unpacked HLL register
+ * @throws IOException
+ */
+ private static byte[] unpackHLLRegister(InputStream in, int length, int bitSize)
+ throws IOException {
+ int mask = (1 << bitSize) - 1;
+ int bitsLeft = 8;
+
+ if (bitSize == 8) {
+ return fastPathRead(in, length);
+ }
+
+ byte current = (byte) (0xff & in.read());
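+ // reverse of bitpackHLLRegister: extract bitSize bits per register value,
+ // crossing byte boundaries whenever a value straddles two input bytes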
+
+ byte[] output = new byte[length];
+ for (int i = 0; i < output.length; i++) {
+ byte result = 0;
+ int bitsLeftToRead = bitSize;
+ while (bitsLeftToRead > bitsLeft) {
+ result <<= bitsLeft;
+ result |= current & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ current = (byte) (0xff & in.read());
+ bitsLeft = 8;
+ }
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= bitsLeftToRead;
+ result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ output[i] = (byte) (result & mask);
+ }
+ return output;
+ }
+
+ private static byte[] fastPathRead(InputStream in, int length) throws IOException {
+ byte[] result = new byte[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = (byte) in.read();
+ }
+ return result;
+ }
+
+ /**
+ * Get estimated cardinality without deserializing HLL
+ * @param in
+ * - serialized HLL
+ * @return - cardinality
+ * @throws IOException
+ */
+ public static long getEstimatedCountFromSerializedHLL(InputStream in) throws IOException {
+ checkMagicString(in);
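+ // skip the fourth header byte (p and encoding); the varlong that follows
+ // is the stored estimate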
+ in.read();
+ return readVulong(in);
+ }
+
+ /**
+ * Check if the specified input stream is actually a HLL stream
+ * @param in
+ * - input stream
+ * @throws IOException
+ */
+ private static void checkMagicString(InputStream in) throws IOException {
+ byte[] magic = new byte[3];
+ magic[0] = (byte) in.read();
+ magic[1] = (byte) in.read();
+ magic[2] = (byte) in.read();
+
+ if (!Arrays.equals(magic, MAGIC)) {
+ throw new IllegalArgumentException("The input stream is not a HyperLogLog stream.");
+ }
+ }
+
+ /**
+ * Minimum bits required to encode the specified value
+ * @param val
+ * - input value
+ * @return minimum number of bits needed to encode val
+ */
+ private static int getBitWidth(int val) {
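+ // counts the significant bits of val; e.g. getBitWidth(6) == 3 since 6 = 0b110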
+ int count = 0;
+ while (val != 0) {
+ count++;
+ val = (byte) (val >>> 1);
+ }
+ return count;
+ }
+
+ /**
+ * Return relative error between actual and estimated cardinality
+ * @param actualCount
+ * - actual count
+ * @param estimatedCount
+ * - estimated count
+ * @return relative error
+ */
+ public static float getRelativeError(long actualCount, long estimatedCount) {
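+ // e.g. actualCount = 1000, estimatedCount = 950 returns 5.0 (percent)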
+ float err = (1.0f - ((float) estimatedCount / (float) actualCount)) * 100.0f;
+ return err;
+ }
+
+ /**
+ * Write variable length encoded longs to output stream
+ * @param output
+ * - out stream
+ * @param value
+ * - long
+ * @throws IOException
+ */
+ private static void writeVulong(OutputStream output, long value) throws IOException {
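+ // LEB128-style encoding: 7 data bits per byte, high bit set on every byte
+ // except the last; e.g. 300 is written as the two bytes 0xAC 0x02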
+ while (true) {
+ if ((value & ~0x7f) == 0) {
+ output.write((byte) value);
+ return;
+ } else {
+ output.write((byte) (0x80 | (value & 0x7f)));
+ value >>>= 7;
+ }
+ }
+ }
+
+ /**
+ * Read variable length encoded longs from input stream
+ * @param in
+ * - input stream
+ * @return decoded long value
+ * @throws IOException
+ */
+ private static long readVulong(InputStream in) throws IOException {
+ long result = 0;
+ long b;
+ int offset = 0;
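+ // accumulate 7 bits per byte until a byte with the high bit clear marks
+ // the end of the varint written by writeVulong()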
+ do {
+ b = in.read();
+ if (b == -1) {
+ throw new EOFException("Reading Vulong past EOF");
+ }
+ result |= (0x7f & b) << offset;
+ offset += 7;
+ } while (b >= 0x80);
+ return result;
+ }
+
+}
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index c7afe2bc4a..9c954be978 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1724,7 +1724,9 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal
HIVE_STATS_COLLECT_SCANCOLS("hive.stats.collect.scancols", false,
"Whether column accesses are tracked in the QueryPlan.\n" +
"This is useful to identify how tables are accessed and to determine if there are wasted columns that can be trimmed."),
- // standard error allowed for ndv estimates. A lower value indicates higher accuracy and a
+ HIVE_STATS_NDV_ALGO("hive.stats.ndv.algo", "hll", new PatternSet("hll", "fm"),
+ "hll and fm stand for HyperLogLog and FM-sketch, respectively for computing ndv."),
+ // standard error allowed for ndv estimates for FM-sketch. A lower value indicates higher accuracy and a
// higher compute cost.
HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0,
"Standard error expressed in percentage. Provides a tradeoff between accuracy and compute cost. \n" +
diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHLLNoBias.java b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHLLNoBias.java
new file mode 100644
index 0000000000..30f5ca3e61
--- /dev/null
+++ b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHLLNoBias.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.common.ndv.hll;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class TestHLLNoBias {
+
+ // 2% tolerance for long range bias when no-bias correction is enabled,
+ // 5% when it is disabled, and 0.5% for short range bias
+ private float noBiaslongRangeTolerance = 2.0f;
+ private float biasedlongRangeTolerance = 5.0f;
+ private float shortRangeTolerance = 0.5f;
+
+ private int size;
+
+ public TestHLLNoBias(int n) {
+ this.size = n;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { 30000 }, { 41000 }, { 50000 }, { 60000 }, { 75000 },
+ { 80000 }, { 81920 } };
+ return Arrays.asList(data);
+ }
+
+ @Test
+ public void testHLLAdd() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().build();
+ int size = 100;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ double threshold = size > 40000 ? noBiaslongRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ }
+
+ @Test
+ public void testHLLAddHalfDistinct() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().build();
+ int unique = size / 2;
+ Set<Long> hashset = new HashSet<Long>();
+ for (int i = 0; i < size; i++) {
+ long val = rand.nextInt(unique);
+ hashset.add(val);
+ hll.addLong(val);
+ }
+ double threshold = size > 40000 ? noBiaslongRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ assertEquals((double) hashset.size(), (double) hll.count(), delta);
+ }
+
+ @Test
+ public void testHLLNoBiasDisabled() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().enableNoBias(false).build();
+ int size = 100;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ double threshold = size > 40000 ? biasedlongRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ }
+
+ @Test
+ public void testHLLNoBiasDisabledHalfDistinct() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().enableNoBias(false).build();
+ int unique = size / 2;
+ Set<Long> hashset = new HashSet<Long>();
+ for (int i = 0; i < size; i++) {
+ long val = rand.nextInt(unique);
+ hashset.add(val);
+ hll.addLong(val);
+ }
+ double threshold = size > 40000 ? biasedlongRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ assertEquals((double) hashset.size(), (double) hll.count(), delta);
+ }
+
+}
diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHLLSerialization.java b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHLLSerialization.java
new file mode 100644
index 0000000000..b4b8df1174
--- /dev/null
+++ b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHLLSerialization.java
@@ -0,0 +1,267 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.common.ndv.hll;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.EncodingType;
+import org.junit.After;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class TestHLLSerialization {
+
+ private int size;
+ private File testFile;
+ private static final String pathPrefix = ".";
+ private static final int SEED = 100;
+ // 5% tolerance for long range bias and 2.5% for short range bias
+ private float longRangeTolerance = 5.0f;
+ private float shortRangeTolerance = 2.5f;
+
+ public TestHLLSerialization(int n) {
+ this.size = n;
+ this.testFile = new File(pathPrefix + testCaseName.getMethodName() + "_" + size + ".hll");
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { 2 }, { 10 }, { 100 }, { 1000 }, { 2000 }, { 3000 },
+ { 5000 }, { 6000 }, { 10000 }, { 100000 }, { 1000000 } };
+ return Arrays.asList(data);
+ }
+
+ @After
+ public void close() {
+ if (testFile.exists()) {
+ testFile.delete();
+ }
+ }
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Test
+ public void testHLLSparseSerialization() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ Random rand = new Random(SEED);
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ }
+
+ @Test
+ public void testHLLSparseSerializationHalfDistinct() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ Random rand = new Random(SEED);
+ Set<Integer> hashset = new HashSet<Integer>();
+ for (int i = 0; i < size; i++) {
+ int val = rand.nextInt(size / 2);
+ hll.addLong(val);
+ hashset.add(val);
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ assertEquals(hashset.size(), hll.count(), delta);
+ assertEquals(hashset.size(), deserializedHLL.count(), delta);
+ }
+
+ @Test
+ public void testHLLSparseNoBitPacking() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE)
+ .enableBitPacking(false).build();
+ Random rand = new Random(SEED);
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ }
+
+ @Test
+ public void testHLLSparseNoBitPackingHalfDistinct() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE)
+ .enableBitPacking(false).build();
+ Random rand = new Random(SEED);
+ Set<Integer> hashset = new HashSet<Integer>();
+ for (int i = 0; i < size; i++) {
+ int val = rand.nextInt(size / 2);
+ hll.addLong(val);
+ hashset.add(val);
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ assertEquals(hashset.size(), hll.count(), delta);
+ assertEquals(hashset.size(), deserializedHLL.count(), delta);
+ }
+
+ @Test
+ public void testHLLDenseSerialization() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ Random rand = new Random(SEED);
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ }
+
+ @Test
+ public void testHLLDenseSerializationHalfDistinct() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ Random rand = new Random(SEED);
+ Set<Integer> hashset = new HashSet<Integer>();
+ for (int i = 0; i < size; i++) {
+ int val = rand.nextInt(size / 2);
+ hll.addLong(val);
+ hashset.add(val);
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ assertEquals(hashset.size(), hll.count(), delta);
+ assertEquals(hashset.size(), deserializedHLL.count(), delta);
+ }
+
+ @Test
+ public void testHLLDenseNoBitPacking() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false)
+ .build();
+ Random rand = new Random(SEED);
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ }
+
+ @Test
+ public void testHLLDenseNoBitPackingHalfDistinct() throws IOException {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false)
+ .build();
+ Random rand = new Random(SEED);
+ Set<Integer> hashset = new HashSet<Integer>();
+ for (int i = 0; i < size; i++) {
+ int val = rand.nextInt(size / 2);
+ hll.addLong(val);
+ hashset.add(val);
+ }
+ FileOutputStream fos = new FileOutputStream(testFile);
+ DataOutputStream out = new DataOutputStream(fos);
+ HyperLogLogUtils.serializeHLL(out, hll);
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ FileInputStream fis = new FileInputStream(testFile);
+ DataInputStream in = new DataInputStream(fis);
+ HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
+ assertEquals(hll, deserializedHLL);
+ assertEquals(hll.toString(), deserializedHLL.toString());
+ assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
+ assertEquals(hll.hashCode(), deserializedHLL.hashCode());
+ assertEquals(hll.count(), deserializedHLL.count());
+ assertEquals(hashset.size(), hll.count(), delta);
+ assertEquals(hashset.size(), deserializedHLL.count(), delta);
+ }
+}
diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
new file mode 100644
index 0000000000..635073fc26
--- /dev/null
+++ b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java
@@ -0,0 +1,227 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.common.ndv.hll;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.EncodingType;
+import org.junit.Test;
+
+public class TestHyperLogLog {
+ // 5% tolerance for estimated count
+ private float longRangeTolerance = 5.0f;
+ private float shortRangeTolerance = 2.0f;
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testHLLDenseMerge() {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16)
+ .setEncoding(EncodingType.DENSE).build();
+ int size = 1000;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(i);
+ hll2.addLong(size + i);
+ hll3.addLong(2 * size + i);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ assertEquals((double) size, (double) hll2.count(), delta);
+
+ // merge
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // merge should update registers and hence the count
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // new merge
+ hll.merge(hll3);
+ assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // invalid merge -- register set size doesn't match
+ hll.merge(hll4);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testHLLSparseMerge() {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16)
+ .setEncoding(EncodingType.SPARSE).build();
+ int size = 500;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(i);
+ hll2.addLong(size + i);
+ hll3.addLong(2 * size + i);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ assertEquals((double) size, (double) hll2.count(), delta);
+
+ // merge
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.SPARSE, hll.getEncoding());
+
+ // merge should update registers and hence the count
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.SPARSE, hll.getEncoding());
+
+ // new merge
+ hll.merge(hll3);
+ assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.SPARSE, hll.getEncoding());
+
+ // invalid merge -- register set size doesn't match
+ hll.merge(hll4);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testHLLSparseDenseMerge() {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16)
+ .setEncoding(EncodingType.DENSE).build();
+ int size = 1000;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(i);
+ hll2.addLong(size + i);
+ hll3.addLong(2 * size + i);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ assertEquals((double) size, (double) hll2.count(), delta);
+
+ // sparse-sparse merge
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.SPARSE, hll.getEncoding());
+
+ // merge should update registers and hence the count
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.SPARSE, hll.getEncoding());
+
+ // sparse-dense merge
+ hll.merge(hll3);
+ assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // invalid merge -- register set size doesn't match
+ hll.merge(hll4);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testHLLDenseSparseMerge() {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
+ HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16)
+ .setEncoding(EncodingType.SPARSE).build();
+ int size = 1000;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(i);
+ hll2.addLong(size + i);
+ hll3.addLong(2 * size + i);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ assertEquals((double) size, (double) hll2.count(), delta);
+
+ // dense-dense merge
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // merge should update registers and hence the count
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // dense-sparse merge
+ hll.merge(hll3);
+ assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // invalid merge -- register set size doesn't match
+ hll.merge(hll4);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testHLLSparseOverflowMerge() {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
+ HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16)
+ .setEncoding(EncodingType.SPARSE).build();
+ int size = 1000;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(i);
+ hll2.addLong(size + i);
+ hll3.addLong(2 * size + i);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ assertEquals((double) size, (double) hll2.count(), delta);
+
+ // sparse-sparse merge
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.SPARSE, hll.getEncoding());
+
+ // merge should update registers and hence the count
+ hll.merge(hll2);
+ assertEquals((double) 2 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.SPARSE, hll.getEncoding());
+
+ // sparse-sparse merge overflows to dense
+ hll.merge(hll3);
+ assertEquals((double) 3 * size, (double) hll.count(), delta);
+ assertEquals(EncodingType.DENSE, hll.getEncoding());
+
+ // invalid merge -- register set size doesn't match
+ hll.merge(hll4);
+ }
+
+ @Test
+ public void testHLLSparseMoreRegisterBits() {
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE)
+ .setNumRegisterIndexBits(16).build();
+ int size = 1000;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(i);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ }
+}
diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogDense.java b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogDense.java
new file mode 100644
index 0000000000..00fd785b6f
--- /dev/null
+++ b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogDense.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.common.ndv.hll;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class TestHyperLogLogDense {
+
+ // 5% tolerance for long range bias and 3% for short range bias
+ private float longRangeTolerance = 5.0f;
+ private float shortRangeTolerance = 3.0f;
+
+ private int size;
+
+ public TestHyperLogLogDense(int n) {
+ this.size = n;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { 2 }, { 10 }, { 100 }, { 1000 }, { 10000 }, { 100000 },
+ { 1000000 } };
+ return Arrays.asList(data);
+ }
+
+ @Test
+ public void testHLLAdd() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build();
+ int size = 100;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ }
+
+ @Test
+ public void testHLLAddHalfDistinct() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build();
+ int unique = size / 2;
+ Set<Long> hashset = new HashSet<Long>();
+ for (int i = 0; i < size; i++) {
+ long val = rand.nextInt(unique);
+ hashset.add(val);
+ hll.addLong(val);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ assertEquals((double) hashset.size(), (double) hll.count(), delta);
+ }
+
+}
diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogSparse.java b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogSparse.java
new file mode 100644
index 0000000000..cfa58868e5
--- /dev/null
+++ b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLogSparse.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.common.ndv.hll;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class TestHyperLogLogSparse {
+
+ // 5% tolerance for long range bias and 1% for short range bias
+ private float longRangeTolerance = 5.0f;
+ private float shortRangeTolerance = 1.0f;
+
+ private int size;
+
+ public TestHyperLogLogSparse(int n) {
+ this.size = n;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { 2 }, { 10 }, { 100 }, { 1000 }, { 10000 }, { 100000 },
+ { 1000000 } };
+ return Arrays.asList(data);
+ }
+
+ @Test
+ public void testHLLAdd() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().build();
+ int size = 100;
+ for (int i = 0; i < size; i++) {
+ hll.addLong(rand.nextLong());
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * size / 100;
+ assertEquals((double) size, (double) hll.count(), delta);
+ }
+
+ @Test
+ public void testHLLAddHalfDistinct() {
+ Random rand = new Random(size);
+ HyperLogLog hll = HyperLogLog.builder().build();
+ int unique = size / 2;
+ Set<Long> hashset = new HashSet<Long>();
+ for (int i = 0; i < size; i++) {
+ long val = rand.nextInt(unique);
+ hashset.add(val);
+ hll.addLong(val);
+ }
+ double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
+ double delta = threshold * hashset.size() / 100;
+ assertEquals((double) hashset.size(), (double) hll.count(), delta);
+ }
+}
diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestSparseEncodeHash.java b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestSparseEncodeHash.java
new file mode 100644
index 0000000000..2c7e89b5e6
--- /dev/null
+++ b/common/src/test/org/apache/hadoop/hive/common/ndv/hll/TestSparseEncodeHash.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.common.ndv.hll;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class TestSparseEncodeHash {
+
+ private long input;
+ private int expected;
+
+ public TestSparseEncodeHash(long i, int e) {
+ this.input = i;
+ this.expected = e;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { 11111111111L, 373692871 },
+ { 4314495982023L, -1711269433 }, { 4314529536455L, -1744823865 },
+ { 4314563074503L, 268425671 }, { 17257983908295L, -1644160569 }, { 536861127L, 536861127 },
+ { 536844743L, 536844743 }, { 144115188075862471L, -671082041 } };
+ return Arrays.asList(data);
+ }
+
+ @Test
+ public void testEncodeHash() {
+ HLLSparseRegister reg = new HLLSparseRegister(14, 25, 6);
+ int got = reg.encodeHash(input);
+ assertEquals(expected, got);
+ }
+}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/NumDistinctValueEstimator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/NumDistinctValueEstimator.java
deleted file mode 100644
index 92f9a845e3..0000000000
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/NumDistinctValueEstimator.java
+++ /dev/null
@@ -1,367 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.metastore;
-import java.util.Random;
-
-import javolution.util.FastBitSet;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.io.Text;
-
-/*
- * https://en.wikipedia.org/wiki/Flajolet%E2%80%93Martin_algorithm
- * We implement Flajolet–Martin algorithm in this class.
- * The Flajolet–Martin algorithm is an algorithm for approximating the number of distinct elements
- * in a stream with a single pass and space-consumption which is logarithmic in the maximum number
- * of possible distinct elements in the stream. The algorithm was introduced by Philippe Flajolet
- * and G. Nigel Martin in their 1984 paper "Probabilistic Counting Algorithms for Data Base Applications".
- * Later it has been refined in the papers "LogLog counting of large cardinalities" by Marianne Durand
- * and Philippe Flajolet, and "HyperLogLog: The analysis of a near-optimal cardinality estimation
- * algorithm" by Philippe Flajolet et al.
- */
-
-/*
- * The algorithm works like this.
- * (1) Set the number of bit vectors, i.e., numBitVectors, based on the precision.
- * (2) For each bit vector, generate hash value of the long value and mod it by 2^bitVectorSize-1. (addToEstimator)
- * (3) Set the index (addToEstimator)
- * (4) Take the average of the index for all the bit vectors and get the estimated NDV (estimateNumDistinctValues).
- */
-public class NumDistinctValueEstimator {
-
- static final Log LOG = LogFactory.getLog(NumDistinctValueEstimator.class.getName());
-
- /* We want a,b,x to come from a finite field of size 0 to k, where k is a prime number.
- * 2^p - 1 is prime for p = 31. Hence bitvectorSize has to be 31. Pick k to be 2^p -1.
- * If a,b,x didn't come from a finite field ax1 + b mod k and ax2 + b mod k will not be pair wise
- * independent. As a consequence, the hash values will not distribute uniformly from 0 to 2^p-1
- * thus introducing errors in the estimates.
- */
- private static final int BIT_VECTOR_SIZE = 31;
- private final int numBitVectors;
-
- // Refer to Flajolet-Martin'86 for the value of phi
- private static final double PHI = 0.77351;
-
- private final int[] a;
- private final int[] b;
- private final FastBitSet[] bitVector;
-
- private final Random aValue;
- private final Random bValue;
-
- /* Create a new distinctValueEstimator
- */
- public NumDistinctValueEstimator(int numBitVectors) {
- this.numBitVectors = numBitVectors;
- bitVector = new FastBitSet[numBitVectors];
- for (int i=0; i< numBitVectors; i++) {
- bitVector[i] = new FastBitSet(BIT_VECTOR_SIZE);
- }
-
- a = new int[numBitVectors];
- b = new int[numBitVectors];
-
- /* Use a large prime number as a seed to the random number generator.
- * Java's random number generator uses the Linear Congruential Generator to generate random
- * numbers using the following recurrence relation,
- *
- * X(n+1) = (a X(n) + c ) mod m
- *
- * where X0 is the seed. Java implementation uses m = 2^48. This is problematic because 2^48
- * is not a prime number and hence the set of numbers from 0 to m don't form a finite field.
- * If these numbers don't come from a finite field any give X(n) and X(n+1) may not be pair
- * wise independent.
- *
- * However, empirically passing in prime numbers as seeds seems to work better than when passing
- * composite numbers as seeds. Ideally Java's Random should pick m such that m is prime.
- *
- */
- aValue = new Random(99397);
- bValue = new Random(9876413);
-
- for (int i = 0; i < numBitVectors; i++) {
- int randVal;
- /* a and b shouldn't be even; If a and b are even, then none of the values
- * will set bit 0 thus introducing errors in the estimate. Both a and b can be even
- * 25% of the times and as a result 25% of the bit vectors could be inaccurate. To avoid this
- * always pick odd values for a and b.
- */
- do {
- randVal = aValue.nextInt();
- } while (randVal % 2 == 0);
-
- a[i] = randVal;
-
- do {
- randVal = bValue.nextInt();
- } while (randVal % 2 == 0);
-
- b[i] = randVal;
-
- if (a[i] < 0) {
- a[i] = a[i] + (1 << BIT_VECTOR_SIZE - 1);
- }
-
- if (b[i] < 0) {
- b[i] = b[i] + (1 << BIT_VECTOR_SIZE - 1);
- }
- }
- }
-
- public NumDistinctValueEstimator(String s, int numBitVectors) {
- this.numBitVectors = numBitVectors;
- FastBitSet bitVectorDeser[] = deserialize(s, numBitVectors);
- bitVector = new FastBitSet[numBitVectors];
- for(int i=0; i = '0' && c <= '9') {
- String t = new String();
- t = t + c;
- c = s.charAt(i);
- i = i + 1;
-
- while (c != ',' && c!= '}') {
- t = t + c;
- c = s.charAt(i);
- i = i + 1;
- }
-
- int bitIndex = Integer.parseInt(t);
- assert(bitIndex >= 0);
- assert(vectorIndex < numBitVectors);
- b[vectorIndex].set(bitIndex);
- if (c == '}') {
- vectorIndex = vectorIndex + 1;
- }
- }
- }
- return b;
- }
-
- private int generateHash(long v, int hashNum) {
- int mod = (1<> 1;
- }
-
- // Set bitvector[index] := 1
- bitVector[i].set(index);
- }
- }
-
- public void addToEstimatorPCSA(long v) {
- int hash = generateHashForPCSA(v);
- int rho = hash/numBitVectors;
- int index;
-
- // Find the index of the least significant bit that is 1
- for (index=0; index> 1;
- }
-
- // Set bitvector[index] := 1
- bitVector[hash%numBitVectors].set(index);
- }
-
- public void addToEstimator(double d) {
- int v = new Double(d).hashCode();
- addToEstimator(v);
- }
-
- public void addToEstimatorPCSA(double d) {
- int v = new Double(d).hashCode();
- addToEstimatorPCSA(v);
- }
-
- public void addToEstimator(HiveDecimal decimal) {
- int v = decimal.hashCode();
- addToEstimator(v);
- }
-
- public void addToEstimatorPCSA(HiveDecimal decimal) {
- int v = decimal.hashCode();
- addToEstimatorPCSA(v);
- }
-
- public void mergeEstimators(NumDistinctValueEstimator o) {
- // Bitwise OR the bitvector with the bitvector in the agg buffer
- for (int i=0; i() {
@Override
public AggrStats load(StatsCacheKey key) throws Exception {
- int numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION);
HBaseReadWrite hrw = HBaseReadWrite.getInstance();
AggrStats aggrStats = hrw.getAggregatedStats(key.hashed);
@@ -101,7 +100,7 @@ public AggrStats load(StatsCacheKey key) throws Exception {
if (aggregator == null) {
aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css.iterator()
.next().getStatsObj().iterator().next().getStatsData().getSetField(),
- numBitVectors, useDensityFunctionForNDVEstimation);
+ useDensityFunctionForNDVEstimation);
}
ColumnStatisticsObj statsObj = aggregator
.aggregate(key.colName, key.partNames, css);
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java
index 31955b4363..29a05390bf 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java
@@ -26,7 +26,6 @@
import org.apache.hadoop.hive.metastore.api.MetaException;
public abstract class ColumnStatsAggregator {
- public int numBitVectors;
public boolean useDensityFunctionForNDVEstimation;
public abstract ColumnStatisticsObj aggregate(String colName, List partNames,
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java
index daf85692eb..568bf0609b 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java
@@ -34,7 +34,7 @@
private ColumnStatsAggregatorFactory() {
}
- public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, int numBitVectors, boolean useDensityFunctionForNDVEstimation) {
+ public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, boolean useDensityFunctionForNDVEstimation) {
ColumnStatsAggregator agg;
switch (type) {
case BOOLEAN_STATS:
@@ -58,7 +58,6 @@ public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, int n
default:
throw new RuntimeException("Woh, bad. Unknown stats type " + type.toString());
}
- agg.numBitVectors = numBitVectors;
agg.useDensityFunctionForNDVEstimation = useDensityFunctionForNDVEstimation;
return agg;
}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java
index 36b2c9c56b..8eb64e0143 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java
@@ -26,7 +26,8 @@
import java.util.List;
import java.util.Map;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.StatObjectConverter;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
@@ -46,7 +47,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
// check if all the ColumnStatisticsObjs contain stats and all the ndv are
// bitvectors
boolean doAllPartitionContainStats = partNames.size() == css.size();
- boolean isNDVBitVectorSet = true;
+ NumDistinctValueEstimator ndvEstimator = null;
String colType = null;
for (ColumnStatistics cs : css) {
if (cs.getStatsObjSize() != 1) {
@@ -60,22 +61,36 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
.getStatsData().getSetField());
}
- if (numBitVectors <= 0 || !cso.getStatsData().getDecimalStats().isSetBitVectors()
+ if (!cso.getStatsData().getDecimalStats().isSetBitVectors()
|| cso.getStatsData().getDecimalStats().getBitVectors().length() == 0) {
- isNDVBitVectorSet = false;
+ ndvEstimator = null;
break;
+ } else {
+ // check if all of the bit vectors can merge
+ NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(cso.getStatsData().getDecimalStats().getBitVectors());
+ if (ndvEstimator == null) {
+ ndvEstimator = estimator;
+ } else {
+ if (ndvEstimator.canMerge(estimator)) {
+ continue;
+ } else {
+ ndvEstimator = null;
+ break;
+ }
+ }
}
}
+ if (ndvEstimator != null) {
+ ndvEstimator = NumDistinctValueEstimatorFactory
+ .getEmptyNumDistinctValueEstimator(ndvEstimator);
+ }
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
if (doAllPartitionContainStats || css.size() < 2) {
DecimalColumnStatsData aggregateData = null;
long lowerBound = 0;
long higherBound = 0;
double densityAvgSum = 0.0;
- NumDistinctValueEstimator ndvEstimator = null;
- if (isNDVBitVectorSet) {
- ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
- }
for (ColumnStatistics cs : css) {
ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
@@ -85,9 +100,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils
.getDoubleValue(newData.getLowValue())) / newData.getNumDVs();
}
- if (isNDVBitVectorSet) {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
+ if (ndvEstimator != null) {
+ ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors()));
}
if (aggregateData == null) {
aggregateData = newData.deepCopy();
@@ -108,7 +123,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
}
}
- if (isNDVBitVectorSet) {
+ if (ndvEstimator != null) {
// if all the ColumnStatisticsObjs contain bitvectors, we do not need to
// use uniform distribution assumption because we can merge bitvectors
// to get a good estimation.
@@ -145,7 +160,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
// while we scan the css, we also get the densityAvg, lowerbound and
// higerbound when useDensityFunctionForNDVEstimation is true.
double densityAvgSum = 0.0;
- if (!isNDVBitVectorSet) {
+ if (ndvEstimator == null) {
// if not every partition uses bitvector for ndv, we just fall back to
// the traditional extrapolation methods.
for (ColumnStatistics cs : css) {
@@ -162,7 +177,6 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
} else {
// we first merge all the adjacent bitvectors that we could merge and
// derive new partition names and index.
- NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
StringBuilder pseudoPartName = new StringBuilder();
double pseudoIndexSum = 0;
int length = 0;
@@ -191,6 +205,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
pseudoPartName = new StringBuilder();
pseudoIndexSum = 0;
length = 0;
+ ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
aggregateData = null;
}
@@ -216,8 +231,8 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
}
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
+ ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors()));
}
if (length > 0) {
// we have to set ndv
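
The Double, Long and String aggregators below get exactly the same treatment, so the shared idea is worth spelling out once: instead of trusting a pre-computed numBitVectors, each aggregator now probes every partition's serialized bitvector, keeps an estimator only if all of them are mutually mergeable, and then resets it to an empty estimator of the same type before the real merge pass. A condensed sketch, not part of the patch — the class name, helper name and loop input are invented for illustration; the factory and estimator calls are the ones the patch uses:

    import java.util.List;
    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;

    final class NdvProbeSketch {
      // Returns an empty estimator compatible with every partition's sketch, or null
      // when any partition lacks a sketch or the sketches cannot be merged (e.g. FM vs HLL).
      static NumDistinctValueEstimator probe(List<String> perPartitionBitVectors) {
        NumDistinctValueEstimator ndvEstimator = null;
        for (String bitVectors : perPartitionBitVectors) {
          if (bitVectors == null || bitVectors.isEmpty()) {
            return null;                              // a partition without a sketch: fall back
          }
          NumDistinctValueEstimator est =
              NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(bitVectors);
          if (ndvEstimator == null) {
            ndvEstimator = est;                       // remember the first sketch's type and parameters
          } else if (!ndvEstimator.canMerge(est)) {
            return null;                              // mixed estimator types or sizes: fall back
          }
        }
        return ndvEstimator == null ? null
            : NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
      }
    }
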
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java
index a88ef84e5c..b6b86123b2 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java
@@ -26,7 +26,8 @@
import java.util.List;
import java.util.Map;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
@@ -44,7 +45,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
// check if all the ColumnStatisticsObjs contain stats and all the ndv are
// bitvectors
boolean doAllPartitionContainStats = partNames.size() == css.size();
- boolean isNDVBitVectorSet = true;
+ NumDistinctValueEstimator ndvEstimator = null;
String colType = null;
for (ColumnStatistics cs : css) {
if (cs.getStatsObjSize() != 1) {
@@ -58,22 +59,36 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
.getStatsData().getSetField());
}
- if (numBitVectors <= 0 || !cso.getStatsData().getDoubleStats().isSetBitVectors()
+ if (!cso.getStatsData().getDoubleStats().isSetBitVectors()
|| cso.getStatsData().getDoubleStats().getBitVectors().length() == 0) {
- isNDVBitVectorSet = false;
+ ndvEstimator = null;
break;
+ } else {
+ // check if all of the bit vectors can merge
+ NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(cso.getStatsData().getDoubleStats().getBitVectors());
+ if (ndvEstimator == null) {
+ ndvEstimator = estimator;
+ } else {
+ if (ndvEstimator.canMerge(estimator)) {
+ continue;
+ } else {
+ ndvEstimator = null;
+ break;
+ }
+ }
}
}
+ if (ndvEstimator != null) {
+ ndvEstimator = NumDistinctValueEstimatorFactory
+ .getEmptyNumDistinctValueEstimator(ndvEstimator);
+ }
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
if (doAllPartitionContainStats || css.size() < 2) {
DoubleColumnStatsData aggregateData = null;
long lowerBound = 0;
long higherBound = 0;
double densityAvgSum = 0.0;
- NumDistinctValueEstimator ndvEstimator = null;
- if (isNDVBitVectorSet) {
- ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
- }
for (ColumnStatistics cs : css) {
ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
@@ -82,9 +97,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
higherBound += newData.getNumDVs();
densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
}
- if (isNDVBitVectorSet) {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
+ if (ndvEstimator != null) {
+ ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors()));
}
if (aggregateData == null) {
aggregateData = newData.deepCopy();
@@ -96,7 +111,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
}
}
- if (isNDVBitVectorSet) {
+ if (ndvEstimator != null) {
// if all the ColumnStatisticsObjs contain bitvectors, we do not need to
// use uniform distribution assumption because we can merge bitvectors
// to get a good estimation.
@@ -132,7 +147,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
// while we scan the css, we also get the densityAvg, lowerbound and
// higerbound when useDensityFunctionForNDVEstimation is true.
double densityAvgSum = 0.0;
- if (!isNDVBitVectorSet) {
+ if (ndvEstimator == null) {
// if not every partition uses bitvector for ndv, we just fall back to
// the traditional extrapolation methods.
for (ColumnStatistics cs : css) {
@@ -148,7 +163,6 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
} else {
// we first merge all the adjacent bitvectors that we could merge and
// derive new partition names and index.
- NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
StringBuilder pseudoPartName = new StringBuilder();
double pseudoIndexSum = 0;
int length = 0;
@@ -176,6 +190,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
pseudoPartName = new StringBuilder();
pseudoIndexSum = 0;
length = 0;
+ ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
aggregateData = null;
}
@@ -192,8 +207,8 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
newData.getHighValue()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
+ ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors()));
}
if (length > 0) {
// we have to set ndv
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java
index 8ac6561aec..2da6f60167 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java
@@ -26,7 +26,8 @@
import java.util.List;
import java.util.Map;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
@@ -44,7 +45,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
// check if all the ColumnStatisticsObjs contain stats and all the ndv are
// bitvectors
boolean doAllPartitionContainStats = partNames.size() == css.size();
- boolean isNDVBitVectorSet = true;
+ NumDistinctValueEstimator ndvEstimator = null;
String colType = null;
for (ColumnStatistics cs : css) {
if (cs.getStatsObjSize() != 1) {
@@ -58,22 +59,36 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
.getStatsData().getSetField());
}
- if (numBitVectors <= 0 || !cso.getStatsData().getLongStats().isSetBitVectors()
+ if (!cso.getStatsData().getLongStats().isSetBitVectors()
|| cso.getStatsData().getLongStats().getBitVectors().length() == 0) {
- isNDVBitVectorSet = false;
+ ndvEstimator = null;
break;
+ } else {
+ // check if all of the bit vectors can merge
+ NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(cso.getStatsData().getLongStats().getBitVectors());
+ if (ndvEstimator == null) {
+ ndvEstimator = estimator;
+ } else {
+ if (ndvEstimator.canMerge(estimator)) {
+ continue;
+ } else {
+ ndvEstimator = null;
+ break;
+ }
+ }
}
}
+ if (ndvEstimator != null) {
+ ndvEstimator = NumDistinctValueEstimatorFactory
+ .getEmptyNumDistinctValueEstimator(ndvEstimator);
+ }
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
if (doAllPartitionContainStats || css.size() < 2) {
LongColumnStatsData aggregateData = null;
long lowerBound = 0;
long higherBound = 0;
double densityAvgSum = 0.0;
- NumDistinctValueEstimator ndvEstimator = null;
- if (isNDVBitVectorSet) {
- ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
- }
for (ColumnStatistics cs : css) {
ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
LongColumnStatsData newData = cso.getStatsData().getLongStats();
@@ -82,9 +97,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
higherBound += newData.getNumDVs();
densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
}
- if (isNDVBitVectorSet) {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
+ if (ndvEstimator != null) {
+ ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors()));
}
if (aggregateData == null) {
aggregateData = newData.deepCopy();
@@ -96,7 +111,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
}
}
- if (isNDVBitVectorSet) {
+ if (ndvEstimator != null) {
// if all the ColumnStatisticsObjs contain bitvectors, we do not need to
// use uniform distribution assumption because we can merge bitvectors
// to get a good estimation.
@@ -132,7 +147,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
// while we scan the css, we also get the densityAvg, lowerbound and
// higerbound when useDensityFunctionForNDVEstimation is true.
double densityAvgSum = 0.0;
- if (!isNDVBitVectorSet) {
+ if (ndvEstimator == null) {
// if not every partition uses bitvector for ndv, we just fall back to
// the traditional extrapolation methods.
for (ColumnStatistics cs : css) {
@@ -148,7 +163,6 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
} else {
// we first merge all the adjacent bitvectors that we could merge and
// derive new partition names and index.
- NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
StringBuilder pseudoPartName = new StringBuilder();
double pseudoIndexSum = 0;
int length = 0;
@@ -176,6 +190,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
pseudoPartName = new StringBuilder();
pseudoIndexSum = 0;
length = 0;
+ ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
aggregateData = null;
}
@@ -192,8 +207,8 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
newData.getHighValue()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
+ ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors()));
}
if (length > 0) {
// we have to set ndv
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java
index 2aa4046a46..83c6c54fd2 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java
@@ -21,7 +21,8 @@
import java.util.List;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
@@ -39,7 +40,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
// bitvectors. Only when both of the conditions are true, we merge bit
// vectors. Otherwise, just use the maximum function.
boolean doAllPartitionContainStats = partNames.size() == css.size();
- boolean isNDVBitVectorSet = true;
+ NumDistinctValueEstimator ndvEstimator = null;
String colType = null;
for (ColumnStatistics cs : css) {
if (cs.getStatsObjSize() != 1) {
@@ -53,21 +54,37 @@ public ColumnStatisticsObj aggregate(String colName, List partNames,
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
.getStatsData().getSetField());
}
- if (numBitVectors <= 0 || !cso.getStatsData().getStringStats().isSetBitVectors()
+ if (!cso.getStatsData().getStringStats().isSetBitVectors()
|| cso.getStatsData().getStringStats().getBitVectors().length() == 0) {
- isNDVBitVectorSet = false;
+ ndvEstimator = null;
break;
+ } else {
+ // check if all of the bit vectors can merge
+ NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(cso.getStatsData().getStringStats().getBitVectors());
+ if (ndvEstimator == null) {
+ ndvEstimator = estimator;
+ } else {
+ if (ndvEstimator.canMerge(estimator)) {
+ continue;
+ } else {
+ ndvEstimator = null;
+ break;
+ }
+ }
}
}
+ if (ndvEstimator != null) {
+ ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
+ }
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
- if (doAllPartitionContainStats && isNDVBitVectorSet) {
+ if (doAllPartitionContainStats && ndvEstimator!=null) {
StringColumnStatsData aggregateData = null;
- NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
for (ColumnStatistics cs : css) {
ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
StringColumnStatsData newData = cso.getStatsData().getStringStats();
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
+ ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors()));
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java
index 33c7e3e52c..d3051a2b00 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java
@@ -19,7 +19,6 @@
package org.apache.hadoop.hive.metastore.hbase.stats.merge;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -27,8 +26,6 @@
public abstract class ColumnStatsMerger {
protected final Logger LOG = LoggerFactory.getLogger(ColumnStatsMerger.class.getName());
- NumDistinctValueEstimator ndvEstimator = null;
-
public abstract void merge(ColumnStatisticsObj aggregateColStats,
ColumnStatisticsObj newColStats);
}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java
index fe890e4e27..c013ba5c5d 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java
@@ -20,7 +20,8 @@
package org.apache.hadoop.hive.metastore.hbase.stats.merge;
import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
+import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog;
import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
@@ -37,15 +38,6 @@
private ColumnStatsMergerFactory() {
}
- // we depend on the toString() method for javolution.util.FastCollection.
- private static int countNumBitVectors(String s) {
- if (s != null) {
- return StringUtils.countMatches(s, "{");
- } else {
- return 0;
- }
- }
-
public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsObjNew,
ColumnStatisticsObj statsObjOld) {
ColumnStatsMerger agg;
@@ -53,30 +45,20 @@ public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsOb
_Fields typeOld = statsObjOld.getStatsData().getSetField();
// make sure that they have the same type
typeNew = typeNew == typeOld ? typeNew : null;
- int numBitVectors = 0;
switch (typeNew) {
case BOOLEAN_STATS:
agg = new BooleanColumnStatsMerger();
break;
case LONG_STATS: {
agg = new LongColumnStatsMerger();
- int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getLongStats().getBitVectors());
- int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getLongStats().getBitVectors());
- numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
break;
}
case DOUBLE_STATS: {
agg = new DoubleColumnStatsMerger();
- int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDoubleStats().getBitVectors());
- int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDoubleStats().getBitVectors());
- numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
break;
}
case STRING_STATS: {
agg = new StringColumnStatsMerger();
- int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getStringStats().getBitVectors());
- int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getStringStats().getBitVectors());
- numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
break;
}
case BINARY_STATS:
@@ -84,24 +66,15 @@ public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsOb
break;
case DECIMAL_STATS: {
agg = new DecimalColumnStatsMerger();
- int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDecimalStats().getBitVectors());
- int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDecimalStats().getBitVectors());
- numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
break;
}
case DATE_STATS: {
agg = new DateColumnStatsMerger();
- int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDateStats().getBitVectors());
- int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDateStats().getBitVectors());
- numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
break;
}
default:
throw new IllegalArgumentException("Unknown stats type " + typeNew.toString());
}
- if (numBitVectors > 0) {
- agg.ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
- }
return agg;
}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java
index 3179b23438..e899bfe85f 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java
@@ -19,7 +19,8 @@
package org.apache.hadoop.hive.metastore.hbase.stats.merge;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Date;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
@@ -29,27 +30,32 @@
public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
DateColumnStatsData aggregateData = aggregateColStats.getStatsData().getDateStats();
DateColumnStatsData newData = newColStats.getStatsData().getDateStats();
- Date lowValue =
- aggregateData.getLowValue().compareTo(newData.getLowValue()) < 0 ? aggregateData
- .getLowValue() : newData.getLowValue();
+ Date lowValue = aggregateData.getLowValue().compareTo(newData.getLowValue()) < 0 ? aggregateData
+ .getLowValue() : newData.getLowValue();
aggregateData.setLowValue(lowValue);
- Date highValue =
- aggregateData.getHighValue().compareTo(newData.getHighValue()) >= 0 ? aggregateData
- .getHighValue() : newData.getHighValue();
+ Date highValue = aggregateData.getHighValue().compareTo(newData.getHighValue()) >= 0 ? aggregateData
+ .getHighValue() : newData.getHighValue();
aggregateData.setHighValue(highValue);
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
- if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+ if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+ || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
} else {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- long ndv = ndvEstimator.estimateNumDistinctValues();
+ NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+ NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors());
+ long ndv = -1;
+ if (oldEst.canMerge(newEst)) {
+ oldEst.mergeEstimators(newEst);
+ ndv = oldEst.estimateNumDistinctValues();
+ aggregateData.setBitVectors(oldEst.serialize());
+ } else {
+ ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+ }
LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
+ aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
aggregateData.setNumDVs(ndv);
- aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
}
}
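
The Decimal, Double, Long and String mergers below repeat this same replacement, and the factory above no longer needs to count "{" tokens to pre-size an FM sketch, because the serialized bitvector string is enough for the factory to rebuild the right estimator. The shared merge path, condensed into one hedged sketch — the class and method names are invented; the estimator calls come straight from the patch:

    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;

    final class NdvMergeSketch {
      // Merges two serialized sketches when they are compatible; otherwise keeps the
      // conservative max of the two NDV counts. Returns the merged NDV estimate.
      static long mergeNdv(String aggBitVectors, String newBitVectors, long aggNdv, long newNdv) {
        NumDistinctValueEstimator oldEst =
            NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(aggBitVectors);
        NumDistinctValueEstimator newEst =
            NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(newBitVectors);
        if (oldEst.canMerge(newEst)) {
          oldEst.mergeEstimators(newEst);
          // the patch also writes oldEst.serialize() back into the aggregate column stats
          return oldEst.estimateNumDistinctValues();
        }
        return Math.max(aggNdv, newNdv);
      }
    }
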
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java
index c13add9d9c..4099ffcace 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java
@@ -19,7 +19,8 @@
package org.apache.hadoop.hive.metastore.hbase.stats.merge;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
@@ -38,18 +39,25 @@ public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj new
.getHighValue() : newData.getHighValue();
aggregateData.setHighValue(highValue);
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
- if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+ if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+ || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
} else {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- long ndv = ndvEstimator.estimateNumDistinctValues();
+ NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+ NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors());
+ long ndv = -1;
+ if (oldEst.canMerge(newEst)) {
+ oldEst.mergeEstimators(newEst);
+ ndv = oldEst.estimateNumDistinctValues();
+ aggregateData.setBitVectors(oldEst.serialize());
+ } else {
+ ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+ }
LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
+ aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
aggregateData.setNumDVs(ndv);
- aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
}
}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java
index fbdba24b0a..1691fc97df 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java
@@ -19,7 +19,8 @@
package org.apache.hadoop.hive.metastore.hbase.stats.merge;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
@@ -31,18 +32,25 @@ public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj new
aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
- if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+ if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+ || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
} else {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- long ndv = ndvEstimator.estimateNumDistinctValues();
+ NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+ NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors());
+ long ndv = -1;
+ if (oldEst.canMerge(newEst)) {
+ oldEst.mergeEstimators(newEst);
+ ndv = oldEst.estimateNumDistinctValues();
+ aggregateData.setBitVectors(oldEst.serialize());
+ } else {
+ ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+ }
LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
+ aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
aggregateData.setNumDVs(ndv);
- aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
}
}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java
index ac65590505..361af350fe 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java
@@ -19,7 +19,8 @@
package org.apache.hadoop.hive.metastore.hbase.stats.merge;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
@@ -31,18 +32,25 @@ public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj new
aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
- if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+ if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+ || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
} else {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- long ndv = ndvEstimator.estimateNumDistinctValues();
+ NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+ NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors());
+ long ndv = -1;
+ if (oldEst.canMerge(newEst)) {
+ oldEst.mergeEstimators(newEst);
+ ndv = oldEst.estimateNumDistinctValues();
+ aggregateData.setBitVectors(oldEst.serialize());
+ } else {
+ ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+ }
LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
+ aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
aggregateData.setNumDVs(ndv);
- aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
}
}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java
index 41587477d3..8e28f907ee 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java
@@ -19,10 +19,10 @@
package org.apache.hadoop.hive.metastore.hbase.stats.merge;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
-import org.apache.parquet.Log;
public class StringColumnStatsMerger extends ColumnStatsMerger {
@Override
@@ -32,18 +32,25 @@ public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj new
aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
- if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+ if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+ || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
} else {
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
- ndvEstimator.getnumBitVectors()));
- long ndv = ndvEstimator.estimateNumDistinctValues();
+ NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+ NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(newData.getBitVectors());
+ long ndv = -1;
+ if (oldEst.canMerge(newEst)) {
+ oldEst.mergeEstimators(newEst);
+ ndv = oldEst.estimateNumDistinctValues();
+ aggregateData.setBitVectors(oldEst.serialize());
+ } else {
+ ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+ }
LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
+ aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
aggregateData.setNumDVs(ndv);
- aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
}
}
diff --git a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java
index 87b1ac870d..74e16695a9 100644
--- a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java
+++ b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java
@@ -60,7 +60,7 @@
private HBaseStore store;
SortedMap<String, Cell> rows = new TreeMap<>();
- // NDV will be 3 for bitVectors[0] and 12 for bitVectors[1]
+ // NDV will be 3 for bitVectors[0] and 1 for bitVectors[1]
String bitVectors[] = {
"{0, 4, 5, 7}{0, 1}{0, 1, 2}{0, 1, 4}{0}{0, 2}{0, 3}{0, 2, 3, 4}{0, 1, 4}{0, 1}{0}{0, 1, 3, 8}{0, 2}{0, 2}{0, 9}{0, 1, 4}",
"{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}" };
@@ -278,7 +278,7 @@ public void checkStats(AggrStats aggrStats) throws Exception {
Assert.assertEquals(1010, lcsd.getHighValue(), 0.01);
Assert.assertEquals(-1010, lcsd.getLowValue(), 0.01);
Assert.assertEquals(45, lcsd.getNumNulls());
- Assert.assertEquals(12, lcsd.getNumDVs());
+ Assert.assertEquals(3, lcsd.getNumDVs());
}
};
List<String> partNames = new ArrayList<>();
@@ -422,7 +422,7 @@ public void checkStats(AggrStats aggrStats) throws Exception {
Assert.assertEquals(1010, lcsd.getHighValue(), 0.01);
Assert.assertEquals(-1010, lcsd.getLowValue(), 0.01);
Assert.assertEquals(40, lcsd.getNumNulls());
- Assert.assertEquals(12, lcsd.getNumDVs());
+ Assert.assertEquals(3, lcsd.getNumDVs());
}
};
List<String> partNames = new ArrayList<>();
@@ -494,7 +494,7 @@ public void checkStats(AggrStats aggrStats) throws Exception {
Assert.assertEquals(1010, HBaseUtils.getDoubleValue(lcsd.getHighValue()), 0.01);
Assert.assertEquals(-1010, HBaseUtils.getDoubleValue(lcsd.getLowValue()), 0.01);
Assert.assertEquals(40, lcsd.getNumNulls());
- Assert.assertEquals(12, lcsd.getNumDVs());
+ Assert.assertEquals(3, lcsd.getNumDVs());
}
};
List<String> partNames = new ArrayList<>();
@@ -566,7 +566,7 @@ public void checkStats(AggrStats aggrStats) throws Exception {
Assert.assertEquals(1010, lcsd.getHighValue(), 0.01);
Assert.assertEquals(-1010, lcsd.getLowValue(), 0.01);
Assert.assertEquals(40, lcsd.getNumNulls());
- Assert.assertEquals(12, lcsd.getNumDVs());
+ Assert.assertEquals(3, lcsd.getNumDVs());
}
};
List<String> partNames = new ArrayList<>();
diff --git a/ql/pom.xml b/ql/pom.xml
index 5732965e47..e17fe50b94 100644
--- a/ql/pom.xml
+++ b/ql/pom.xml
@@ -364,6 +364,11 @@
       <version>${datanucleus-core.version}</version>
     </dependency>
     <dependency>
+      <groupId>javolution</groupId>
+      <artifactId>javolution</artifactId>
+      <version>${javolution.version}</version>
+    </dependency>
+    <dependency>
       <groupId>org.apache.calcite</groupId>
       <artifactId>calcite-core</artifactId>
       <version>${calcite.version}</version>
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
index 0a5cf00c44..1923a9b516 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
@@ -27,6 +27,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.HiveStatsUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.conf.HiveVariableSource;
import org.apache.hadoop.hive.conf.VariableSubstitution;
@@ -37,8 +38,6 @@
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
-import org.apache.hadoop.hive.ql.session.OperationLog;
-import org.apache.hadoop.hive.ql.session.OperationLog.LoggingLevel;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.serde.serdeConstants;
@@ -246,7 +245,7 @@ private String escapeBackTicks(String colName) {
return colName.replaceAll("`", "``");
}
- private String genRewrittenQuery(List<String> colNames, int numBitVectors, Map<String, String> partSpec,
+ private String genRewrittenQuery(List<String> colNames, HiveConf conf, Map<String, String> partSpec,
boolean isPartitionStats) throws SemanticException{
StringBuilder rewrittenQueryBuilder = new StringBuilder("select ");
String rewrittenQuery;
@@ -255,11 +254,20 @@ private String genRewrittenQuery(List colNames, int numBitVectors, Map 0) {
rewrittenQueryBuilder.append(" , ");
}
+ String func = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ALGO).toLowerCase();
rewrittenQueryBuilder.append("compute_stats(`");
rewrittenQueryBuilder.append(escapeBackTicks(colNames.get(i)));
- rewrittenQueryBuilder.append("` , ");
- rewrittenQueryBuilder.append(numBitVectors);
- rewrittenQueryBuilder.append(" )");
+ rewrittenQueryBuilder.append("`, '" + func + "'");
+ if (func.equals("fm")) {
+ int numBitVectors = 0;
+ try {
+ numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
+ } catch (Exception e) {
+ throw new SemanticException(e.getMessage());
+ }
+ rewrittenQueryBuilder.append(", " + numBitVectors);
+ }
+ rewrittenQueryBuilder.append(")");
}
if (isPartitionStats) {
@@ -377,13 +385,7 @@ public void analyze(ASTNode ast, Context origCtx) throws SemanticException {
isTableLevel = true;
}
colType = getColumnTypes(colNames);
- int numBitVectors;
- try {
- numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
- } catch (Exception e) {
- throw new SemanticException(e.getMessage());
- }
- rewrittenQuery = genRewrittenQuery(colNames, numBitVectors, partSpec, isPartitionStats);
+ rewrittenQuery = genRewrittenQuery(colNames, conf, partSpec, isPartitionStats);
rewrittenTree = genRewrittenTree(rewrittenQuery);
} else {
// Not an analyze table column compute statistics statement - don't do any rewrites
@@ -447,13 +449,7 @@ public ASTNode rewriteAST(ASTNode ast, ColumnStatsAutoGatherContext context)
isTableLevel = true;
}
colType = getColumnTypes(colNames);
- int numBitVectors = 0;
- try {
- numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
- } catch (Exception e) {
- throw new SemanticException(e.getMessage());
- }
- rewrittenQuery = genRewrittenQuery(colNames, numBitVectors, partSpec, isPartitionStats);
+ rewrittenQuery = genRewrittenQuery(colNames, conf, partSpec, isPartitionStats);
rewrittenTree = genRewrittenTree(rewrittenQuery);
context.analyzeRewrite = new AnalyzeRewriteContext();
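
To make the rewrite concrete: with hive.stats.ndv.algo=hll the generated select list carries only the algorithm name, while the legacy fm setting still appends the bit-vector count obtained from HiveStatsUtils. The column name, table name and bit-vector count below are made up, shown only to illustrate the two shapes the builder produces:

    select compute_stats(`c1`, 'hll') from t;
    select compute_stats(`c1`, 'fm', 16) from t;
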
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 76f7daeb1b..3b9ab41bed 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -1640,60 +1640,6 @@ public static long safeMult(long a, long b) {
}
}
- public static int getNumBitVectorsForNDVEstimation(HiveConf conf) throws SemanticException {
- int numBitVectors;
- float percentageError = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ERROR);
-
- if (percentageError < 0.0) {
- throw new SemanticException("hive.stats.ndv.error can't be negative");
- } else if (percentageError <= 2.4) {
- numBitVectors = 1024;
- LOG.info("Lowest error achievable is 2.4% but error requested is " + percentageError + "%");
- LOG.info("Choosing 1024 bit vectors..");
- } else if (percentageError <= 3.4 ) {
- numBitVectors = 1024;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 1024 bit vectors..");
- } else if (percentageError <= 4.8) {
- numBitVectors = 512;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 512 bit vectors..");
- } else if (percentageError <= 6.8) {
- numBitVectors = 256;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 256 bit vectors..");
- } else if (percentageError <= 9.7) {
- numBitVectors = 128;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 128 bit vectors..");
- } else if (percentageError <= 13.8) {
- numBitVectors = 64;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 64 bit vectors..");
- } else if (percentageError <= 19.6) {
- numBitVectors = 32;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 32 bit vectors..");
- } else if (percentageError <= 28.2) {
- numBitVectors = 16;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 16 bit vectors..");
- } else if (percentageError <= 40.9) {
- numBitVectors = 8;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 8 bit vectors..");
- } else if (percentageError <= 61.0) {
- numBitVectors = 4;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 4 bit vectors..");
- } else {
- numBitVectors = 2;
- LOG.info("Error requested is " + percentageError + "%");
- LOG.info("Choosing 2 bit vectors..");
- }
- return numBitVectors;
- }
-
public static boolean hasDiscreteRange(ColStatistics colStat) {
if (colStat.getRange() != null) {
TypeInfo colType = TypeInfoUtils.getTypeInfoFromTypeString(colStat.getColumnType());
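
The removed sizing table does not disappear from the call sites: the paths that still need an FM bit-vector count (the stats cache loader at the top of this patch and the fm branch of ColumnStatsSemanticAnalyzer) go through HiveStatsUtils.getNumBitVectorsForNDVEstimation instead, and the HLL path skips the sizing entirely. A hedged sketch of that calling pattern — the class and method wrapper are illustrative, the conf var and utility call appear in the patch:

    import org.apache.hadoop.hive.common.HiveStatsUtils;
    import org.apache.hadoop.hive.conf.HiveConf;

    final class NdvSizingSketch {
      // Illustrative only: FM sketches are still sized via HiveStatsUtils (hive-common);
      // the HLL path never needs a bit-vector count.
      static int bitVectorsFor(HiveConf conf) throws Exception {
        String algo = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ALGO).toLowerCase();
        return "fm".equals(algo) ? HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf) : 0;
      }
    }
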
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java
deleted file mode 100644
index e76fc74dbc..0000000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.udf.generic;
-
-public class DoubleNumDistinctValueEstimator extends NumDistinctValueEstimator {
-
- public DoubleNumDistinctValueEstimator(int numBitVectors) {
- super(numBitVectors);
- }
-
- public DoubleNumDistinctValueEstimator(String s, int numVectors) {
- super(s, numVectors);
- }
-
- public void addToEstimator(double d) {
- int v = new Double(d).hashCode();
- super.addToEstimator(v);
- }
-
- public void addToEstimatorPCSA(double d) {
- int v = new Double(d).hashCode();
- super.addToEstimatorPCSA(v);
- }
-}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java
index 2ebfcb2360..2d56950cb1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java
@@ -22,6 +22,11 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.common.classification.InterfaceAudience;
+import org.apache.hadoop.hive.common.ndv.FMSketch;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
+import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
@@ -53,13 +58,13 @@
public class GenericUDAFComputeStats extends AbstractGenericUDAFResolver {
static final Logger LOG = LoggerFactory.getLogger(GenericUDAFComputeStats.class.getName());
-
+
@Override
public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
throws SemanticException {
- if (parameters.length != 2 ) {
+ if (parameters.length < 2 ) {
throw new UDFArgumentTypeException(parameters.length - 1,
- "Exactly two arguments are expected.");
+ "Exactly 2 (col + hll) or 3 (col + fm + #bitvectors) arguments are expected.");
}
if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
@@ -235,23 +240,12 @@ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveExcep
if (!emptyTable) {
if (p == null) {
myagg.countNulls++;
- }
- else {
- try {
- boolean v = PrimitiveObjectInspectorUtils.getBoolean(p, inputOI);
- if (v == false) {
- myagg.countFalses++;
- } else if (v == true){
- myagg.countTrues++;
- }
- } catch (NumberFormatException e) {
- if (!warned) {
- warned = true;
- LOG.warn(getClass().getSimpleName() + " "
- + StringUtils.stringifyException(e));
- LOG.warn(getClass().getSimpleName()
- + " ignoring similar exceptions.");
- }
+ } else {
+ boolean v = PrimitiveObjectInspectorUtils.getBoolean(p, inputOI);
+ if (v == false) {
+ myagg.countFalses++;
+ } else if (v == true) {
+ myagg.countTrues++;
}
}
}
@@ -302,6 +296,7 @@ public Object terminate(AggregationBuffer agg) throws HiveException {
/* Object Inspector corresponding to the input parameter.
*/
protected transient PrimitiveObjectInspector inputOI;
+ protected transient PrimitiveObjectInspector funcOI;
protected transient PrimitiveObjectInspector numVectorsOI;
@@ -322,9 +317,6 @@ public Object terminate(AggregationBuffer agg) throws HiveException {
protected transient StructField ndvField;
protected transient StringObjectInspector ndvFieldOI;
- protected transient StructField numBitVectorsField;
- protected transient IntObjectInspector numBitVectorsFieldOI;
-
/* Partial aggregation result returned by TerminatePartial. Partial result is a struct
* containing a long field named "count".
*/
@@ -334,8 +326,6 @@ public Object terminate(AggregationBuffer agg) throws HiveException {
*/
protected transient Object[] result;
- protected transient boolean warned;
-
protected abstract OI getValueObjectInspector();
protected abstract OI getValueObjectInspector(PrimitiveTypeInfo typeInfo);
@@ -347,7 +337,10 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
// initialize input
if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
inputOI = (PrimitiveObjectInspector) parameters[0];
- numVectorsOI = (PrimitiveObjectInspector) parameters[1];
+ funcOI = (PrimitiveObjectInspector) parameters[1];
+ if (parameters.length > 2) {
+ numVectorsOI = (PrimitiveObjectInspector) parameters[2];
+ }
} else {
soi = (StructObjectInspector) parameters[0];
@@ -363,9 +356,6 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
ndvField = soi.getStructFieldRef("bitvector");
ndvFieldOI = (StringObjectInspector) ndvField.getFieldObjectInspector();
- numBitVectorsField = soi.getStructFieldRef("numbitvectors");
- numBitVectorsFieldOI = (IntObjectInspector)
- numBitVectorsField.getFieldObjectInspector();
}
// initialize output
@@ -376,7 +366,6 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
foi.add(getValueObjectInspector(inputOI.getTypeInfo()));
foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
- foi.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector);
List<String> fname = new ArrayList<String>();
fname.add("columnType");
@@ -384,13 +373,11 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
fname.add("max");
fname.add("countnulls");
fname.add("bitvector");
- fname.add("numbitvectors");
partialResult = new Object[6];
partialResult[0] = new Text();
partialResult[3] = new LongWritable(0);
partialResult[4] = new Text();
- partialResult[5] = new IntWritable(0);
return ObjectInspectorFactory.getStandardStructObjectInspector(fname,
foi);
@@ -436,12 +423,13 @@ public int estimate() {
return (int) (model.lengthFor(columnType)
+ model.primitive1()
+ model.primitive2()
- + ((numDV == null) ? NumDistinctValueEstimator.lengthFor(model, null) :
+ + ((numDV == null) ? lengthFor(model, null) :
numDV.lengthFor(model)));
}
- protected void initNDVEstimator(int numBitVectors) {
- numDV = new NumDistinctValueEstimator(numBitVectors);
+ protected void initNDVEstimator(String func, int numBitVectors) {
+ numDV = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(func,
+ numBitVectors);
}
protected abstract void update(Object p, PrimitiveObjectInspector inputOI);
@@ -457,7 +445,6 @@ protected Object serialize(Object[] result) {
if (numDV != null) {
((Text) result[5]).set(numDV.serialize());
}
-
return result;
}
@@ -465,11 +452,10 @@ protected Object serializePartial(Object[] result) {
// Serialize the rest of the values in the AggBuffer
serializeCommon(result);
- // Serialize numDistinctValue Estimator
- Text t = numDV.serialize();
- ((Text) result[4]).set(t);
- ((IntWritable) result[5]).set(numDV.getnumBitVectors());
-
+ if (numDV != null) {
+ // Serialize numDistinctValue Estimator
+ ((Text) result[4]).set(numDV.serialize());
+ }
return result;
}
@@ -495,30 +481,29 @@ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveExcep
NumericStatsAgg myagg = (NumericStatsAgg) agg;
if (myagg.numDV == null) {
- int numVectors = parameters[1] == null ? 0 :
- PrimitiveObjectInspectorUtils.getInt(parameters[1], numVectorsOI);
- if (numVectors > MAX_BIT_VECTORS) {
- throw new HiveException("The maximum allowed value for number of bit vectors " +
- " is " + MAX_BIT_VECTORS + ", but was passed " + numVectors + " bit vectors");
+ String func = null;
+ int numVectors = 0;
+ // func may be null when GBY op is closing.
+ // see mvn test -Dtest=TestMiniTezCliDriver -Dqfile=explainuser_3.q
+ // original behavior is to create FMSketch
+ func = parameters[1] == null ? "fm" : PrimitiveObjectInspectorUtils.getString(
+ parameters[1], funcOI);
+ if (parameters.length == 3) {
+ numVectors = parameters[2] == null ? 0 : PrimitiveObjectInspectorUtils.getInt(
+ parameters[2], numVectorsOI);
+ if (numVectors > MAX_BIT_VECTORS) {
+ throw new HiveException("The maximum allowed value for number of bit vectors " + " is "
+ + MAX_BIT_VECTORS + ", but was passed " + numVectors + " bit vectors");
+ }
}
- myagg.initNDVEstimator(numVectors);
+ myagg.initNDVEstimator(func, numVectors);
}
- //Update null counter if a null value is seen
+ // Update null counter if a null value is seen
if (parameters[0] == null) {
myagg.countNulls++;
} else {
- try {
- myagg.update(parameters[0], inputOI);
- } catch (NumberFormatException e) {
- if (!warned) {
- warned = true;
- LOG.warn(getClass().getSimpleName() + " "
- + StringUtils.stringifyException(e));
- LOG.warn(getClass().getSimpleName()
- + " ignoring similar exceptions.");
- }
- }
+ myagg.update(parameters[0], inputOI);
}
}
@@ -537,15 +522,6 @@ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
if (partial != null) {
NumericStatsAgg myagg = (NumericStatsAgg) agg;
- if (myagg.numDV == null) {
- Object partialValue = soi.getStructFieldData(partial, numBitVectorsField);
- int numVectors = numBitVectorsFieldOI.get(partialValue);
- if (numVectors <= 0) {
- return;
- }
- myagg.initNDVEstimator(numVectors);
- }
-
// Update min if min is lesser than the smallest value seen so far
Object minValue = soi.getStructFieldData(partial, minField);
myagg.updateMin(minValue, minFieldOI);
@@ -561,9 +537,15 @@ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
// Merge numDistinctValue Estimators
Object numDistinct = soi.getStructFieldData(partial, ndvField);
String v = ndvFieldOI.getPrimitiveJavaObject(numDistinct);
- NumDistinctValueEstimator o =
- new NumDistinctValueEstimator(v, myagg.numDV.getnumBitVectors());
- myagg.numDV.mergeEstimators(o);
+
+ if (v != null && v.length() != 0) {
+ if (myagg.numDV == null) {
+ myagg.numDV = NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(v);
+ } else {
+ myagg.numDV.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(v));
+ }
+ }
}
}
}
@@ -713,6 +695,7 @@ public void reset(AggregationBuffer agg) throws HiveException {
/* Object Inspector corresponding to the input parameter.
*/
private transient PrimitiveObjectInspector inputOI;
+ private transient PrimitiveObjectInspector funcOI;
private transient PrimitiveObjectInspector numVectorsOI;
private final static int MAX_BIT_VECTORS = 1024;
@@ -741,9 +724,6 @@ public void reset(AggregationBuffer agg) throws HiveException {
private transient StructField ndvField;
private transient StringObjectInspector ndvFieldOI;
- private transient StructField numBitVectorsField;
- private transient IntObjectInspector numBitVectorsFieldOI;
-
/* Output of final result of the aggregation
*/
private transient Object[] result;
@@ -755,7 +735,10 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
// initialize input
if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
inputOI = (PrimitiveObjectInspector) parameters[0];
- numVectorsOI = (PrimitiveObjectInspector) parameters[1];
+ funcOI = (PrimitiveObjectInspector) parameters[1];
+ if (parameters.length > 2) {
+ numVectorsOI = (PrimitiveObjectInspector) parameters[2];
+ }
} else {
soi = (StructObjectInspector) parameters[0];
@@ -774,9 +757,6 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
ndvField = soi.getStructFieldRef("bitvector");
ndvFieldOI = (StringObjectInspector) ndvField.getFieldObjectInspector();
- numBitVectorsField = soi.getStructFieldRef("numbitvectors");
- numBitVectorsFieldOI = (IntObjectInspector)
- numBitVectorsField.getFieldObjectInspector();
}
// initialize output
@@ -788,7 +768,6 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
foi.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
- foi.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector);
List fname = new ArrayList();
fname.add("columntype");
@@ -797,7 +776,6 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
fname.add("count");
fname.add("countnulls");
fname.add("bitvector");
- fname.add("numbitvectors");
partialResult = new Object[7];
partialResult[0] = new Text();
@@ -806,7 +784,6 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
partialResult[3] = new LongWritable(0);
partialResult[4] = new LongWritable(0);
partialResult[5] = new Text();
- partialResult[6] = new IntWritable(0);
return ObjectInspectorFactory.getStandardStructObjectInspector(fname,
foi);
@@ -847,15 +824,14 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc
public long sumLength; /* Sum of lengths of all values seen so far */
public long count; /* Count of all values seen so far */
public long countNulls; /* Count of number of null values seen so far */
- public StringNumDistinctValueEstimator numDV; /* Distinct value estimator */
- public int numBitVectors;
+ public NumDistinctValueEstimator numDV; /* Distinct value estimator */
public boolean firstItem;
@Override
public int estimate() {
JavaDataModel model = JavaDataModel.get();
return (int) (model.primitive1() * 2 + model.primitive2() * 4 +
model.lengthFor(columnType) +
- ((numDV == null) ? NumDistinctValueEstimator.lengthFor(model, null) :
+ ((numDV == null) ? lengthFor(model, null) :
numDV.lengthFor(model)));
}
@@ -868,8 +844,9 @@ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
return result;
}
- public void initNDVEstimator(StringStatsAgg aggBuffer, int numBitVectors) {
- aggBuffer.numDV = new StringNumDistinctValueEstimator(numBitVectors);
+ public void initNDVEstimator(StringStatsAgg aggBuffer, String func, int numBitVectors) {
+ aggBuffer.numDV = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(func,
+ numBitVectors);
aggBuffer.numDV.reset();
}
@@ -890,83 +867,59 @@ public void reset(AggregationBuffer agg) throws HiveException {
public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
Object p = parameters[0];
StringStatsAgg myagg = (StringStatsAgg) agg;
- boolean emptyTable = false;
-
- if (parameters[1] == null) {
- emptyTable = true;
- }
if (myagg.firstItem) {
int numVectors = 0;
- if (!emptyTable) {
- numVectors = PrimitiveObjectInspectorUtils.getInt(parameters[1], numVectorsOI);
- }
-
- if (numVectors > MAX_BIT_VECTORS) {
- throw new HiveException("The maximum allowed value for number of bit vectors " +
- " is " + MAX_BIT_VECTORS + " , but was passed " + numVectors + " bit vectors");
+ String func = parameters[1] == null ? "fm" : PrimitiveObjectInspectorUtils.getString(
+ parameters[1], funcOI);
+ if (parameters.length > 2) {
+ numVectors = PrimitiveObjectInspectorUtils.getInt(parameters[2], numVectorsOI);
+ if (numVectors > MAX_BIT_VECTORS) {
+ throw new HiveException("The maximum allowed value for number of bit vectors " + " is "
+ + MAX_BIT_VECTORS + " , but was passed " + numVectors + " bit vectors");
+ }
}
- initNDVEstimator(myagg, numVectors);
+ initNDVEstimator(myagg, func, numVectors);
myagg.firstItem = false;
- myagg.numBitVectors = numVectors;
}
- if (!emptyTable) {
-
- // Update null counter if a null value is seen
- if (p == null) {
- myagg.countNulls++;
- }
- else {
- try {
-
- String v = PrimitiveObjectInspectorUtils.getString(p, inputOI);
-
- // Update max length if new length is greater than the ones seen so far
- int len = v.length();
- if (len > myagg.maxLength) {
- myagg.maxLength = len;
- }
-
- // Update sum length with the new length
- myagg.sumLength += len;
-
- // Increment count of values seen so far
- myagg.count++;
-
- // Add string value to NumDistinctValue Estimator
- myagg.numDV.addToEstimator(v);
-
- } catch (NumberFormatException e) {
- if (!warned) {
- warned = true;
- LOG.warn(getClass().getSimpleName() + " "
- + StringUtils.stringifyException(e));
- LOG.warn(getClass().getSimpleName()
- + " ignoring similar exceptions.");
- }
- }
+ // Update null counter if a null value is seen
+ String v = PrimitiveObjectInspectorUtils.getString(p, inputOI);
+ if (v == null) {
+ myagg.countNulls++;
+ } else {
+ // Update max length if new length is greater than the ones seen so
+ // far
+ int len = v.length();
+ if (len > myagg.maxLength) {
+ myagg.maxLength = len;
}
+
+ // Update sum length with the new length
+ myagg.sumLength += len;
+
+ // Increment count of values seen so far
+ myagg.count++;
+
+ // Add string value to NumDistinctValue Estimator
+ myagg.numDV.addToEstimator(v);
}
}
@Override
public Object terminatePartial(AggregationBuffer agg) throws HiveException {
StringStatsAgg myagg = (StringStatsAgg) agg;
-
- // Serialize numDistinctValue Estimator
- Text t = myagg.numDV.serialize();
-
// Serialize the rest of the values in the AggBuffer
((Text) partialResult[0]).set(myagg.columnType);
((LongWritable) partialResult[1]).set(myagg.maxLength);
((LongWritable) partialResult[2]).set(myagg.sumLength);
((LongWritable) partialResult[3]).set(myagg.count);
((LongWritable) partialResult[4]).set(myagg.countNulls);
- ((Text) partialResult[5]).set(t);
- ((IntWritable) partialResult[6]).set(myagg.numBitVectors);
-
+ // Serialize numDistinctValue Estimator
+ if (myagg.numDV != null) {
+ ((Text) partialResult[5]).set(myagg.numDV.serialize());
+ }
return partialResult;
}
@@ -975,17 +928,6 @@ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
if (partial != null) {
StringStatsAgg myagg = (StringStatsAgg) agg;
- if (myagg.firstItem) {
- Object partialValue = soi.getStructFieldData(partial, numBitVectorsField);
- int numVectors = numBitVectorsFieldOI.get(partialValue);
- if (numVectors <= 0) {
- return;
- }
- initNDVEstimator(myagg, numVectors);
- myagg.firstItem = false;
- myagg.numBitVectors = numVectors;
- }
-
// Update maxLength if length is greater than the largest value seen so far
Object partialValue = soi.getStructFieldData(partial, maxLengthField);
if (myagg.maxLength < maxLengthFieldOI.get(partialValue)) {
@@ -1007,8 +949,15 @@ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
// Merge numDistinctValue Estimators
partialValue = soi.getStructFieldData(partial, ndvField);
String v = ndvFieldOI.getPrimitiveJavaObject(partialValue);
- NumDistinctValueEstimator o = new NumDistinctValueEstimator(v, myagg.numBitVectors);
- myagg.numDV.mergeEstimators(o);
+
+ if (v != null && v.length() != 0) {
+ if (myagg.numDV == null) {
+ myagg.numDV = NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(v);
+ } else {
+ myagg.numDV.mergeEstimators(NumDistinctValueEstimatorFactory
+ .getNumDistinctValueEstimator(v));
+ }
+ }
}
}
@@ -1016,16 +965,12 @@ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
public Object terminate(AggregationBuffer agg) throws HiveException {
StringStatsAgg myagg = (StringStatsAgg) agg;
- long numDV = 0;
+ long numDV = myagg.numDV == null ? 0 : myagg.numDV.estimateNumDistinctValues();
double avgLength = 0.0;
long total = myagg.count + myagg.countNulls;
- if (myagg.numBitVectors != 0) {
- numDV = myagg.numDV.estimateNumDistinctValues();
- }
-
if (total != 0) {
- avgLength = myagg.sumLength / (1.0 * total);
+ avgLength = myagg.sumLength / (1.0 * total);
}
// Serialize the result struct
@@ -1034,7 +979,7 @@ public Object terminate(AggregationBuffer agg) throws HiveException {
((DoubleWritable) result[2]).set(avgLength);
((LongWritable) result[3]).set(myagg.countNulls);
((LongWritable) result[4]).set(numDV);
- if (myagg.numBitVectors != 0) {
+ if (myagg.numDV != null) {
((Text) result[5]).set(myagg.numDV.serialize());
}
return result;
@@ -1181,8 +1126,6 @@ public void reset(AggregationBuffer agg) throws HiveException {
myagg.countNulls = 0;
}
- boolean warned = false;
-
@Override
public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
Object p = parameters[0];
@@ -1197,32 +1140,21 @@ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveExcep
// Update null counter if a null value is seen
if (p == null) {
myagg.countNulls++;
- }
- else {
- try {
- BytesWritable v = PrimitiveObjectInspectorUtils.getBinary(p, inputOI);
-
- // Update max length if new length is greater than the ones seen so far
- int len = v.getLength();
- if (len > myagg.maxLength) {
- myagg.maxLength = len;
- }
-
- // Update sum length with the new length
- myagg.sumLength += len;
-
- // Increment count of values seen so far
- myagg.count++;
-
- } catch (NumberFormatException e) {
- if (!warned) {
- warned = true;
- LOG.warn(getClass().getSimpleName() + " "
- + StringUtils.stringifyException(e));
- LOG.warn(getClass().getSimpleName()
- + " ignoring similar exceptions.");
- }
+ } else {
+ BytesWritable v = PrimitiveObjectInspectorUtils.getBinary(p, inputOI);
+
+ // Update max length if new length is greater than the ones seen so
+ // far
+ int len = v.getLength();
+ if (len > myagg.maxLength) {
+ myagg.maxLength = len;
}
+
+ // Update sum length with the new length
+ myagg.sumLength += len;
+
+ // Increment count of values seen so far
+ myagg.count++;
}
}
}
@@ -1425,4 +1357,25 @@ public void reset(AggregationBuffer agg) throws HiveException {
((NumericStatsAgg)agg).reset("Date");
}
}
+
+ @InterfaceAudience.LimitedPrivate(value = { "Hive" })
+ static int lengthFor(JavaDataModel model, Integer numVector) {
+ int length = model.object();
+ length += model.primitive1() * 2; // two int
+ length += model.primitive2(); // one double
+ length += model.lengthForRandom() * 2; // two Random
+
+ if (numVector == null) {
+ numVector = 16; // HiveConf hive.stats.ndv.error default produces 16
+ // vectors
+ }
+
+ if (numVector > 0) {
+ length += model.array() * 3; // three array
+ length += model.primitive1() * numVector * 2; // two int array
+ length += (model.object() + model.array() + model.primitive1() + model.primitive2())
+ * numVector; // bitset array
+ }
+ return length;
+ }
}
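
Taken together, the hunks above route every estimator through NumDistinctValueEstimatorFactory: iterate() asks the factory for an empty estimator keyed by the algorithm name, terminatePartial() ships it between stages as a serialized string, and merge() rebuilds it from that string instead of reading a separate numbitvectors field. The following is a minimal sketch of that lifecycle, not code from the patch; the import locations and the standalone wrapper class are assumptions made purely for illustration.

// Sketch only: import locations are assumed for illustration.
import org.apache.hadoop.hive.ql.udf.generic.NumDistinctValueEstimator;
import org.apache.hadoop.hive.ql.udf.generic.NumDistinctValueEstimatorFactory;

public class NdvLifecycleSketch {
  public static void main(String[] args) {
    // Map side (iterate): build an empty estimator for the requested algorithm.
    // "fm" matches the updated test queries; "hll" is the other accepted name.
    NumDistinctValueEstimator partial =
        NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator("fm", 16);
    partial.addToEstimator("alice");
    partial.addToEstimator("bob");

    // terminatePartial: the estimator travels as a serialized string blob.
    String blob = partial.serialize();

    // Reduce side (merge): the serialized form is self-describing, so the
    // estimator can be reconstructed directly from the blob.
    NumDistinctValueEstimator merged =
        NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(blob);
    merged.mergeEstimators(
        NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(blob));

    // terminate: report the estimated number of distinct values.
    System.out.println(merged.estimateNumDistinctValues());
  }
}
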
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/LongNumDistinctValueEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/LongNumDistinctValueEstimator.java
deleted file mode 100644
index 1c197a028a..0000000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/LongNumDistinctValueEstimator.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.udf.generic;
-
-public class LongNumDistinctValueEstimator extends NumDistinctValueEstimator {
-
- public LongNumDistinctValueEstimator(int numBitVectors) {
- super(numBitVectors);
- }
-
- public LongNumDistinctValueEstimator(String s, int numVectors) {
- super(s, numVectors);
- }
-
- @Override
- public void addToEstimator(long v) {
- /* Update summary bitVector :
- * Generate hash value of the long value and mod it by 2^bitVectorSize-1.
- * In this implementation bitVectorSize is 31.
- */
- super.addToEstimator(v);
- }
-
- @Override
- public void addToEstimatorPCSA(long v) {
- super.addToEstimatorPCSA(v);
- }
-}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/StringNumDistinctValueEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/StringNumDistinctValueEstimator.java
deleted file mode 100644
index 601901c163..0000000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/StringNumDistinctValueEstimator.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.udf.generic;
-
-public class StringNumDistinctValueEstimator extends NumDistinctValueEstimator {
-
- public StringNumDistinctValueEstimator(int numVectors) {
- super(numVectors);
- }
-
- public StringNumDistinctValueEstimator(String s, int numVectors) {
- super(s, numVectors);
- }
-
- public void addToEstimator(String s) {
- int v = s.hashCode();
- super.addToEstimator(v);
- }
-
- public void addToEstimatorPCSA(String s) {
- int v = s.hashCode();
- super.addToEstimatorPCSA(v);
- }
-}
diff --git a/ql/src/test/queries/clientpositive/char_udf1.q b/ql/src/test/queries/clientpositive/char_udf1.q
index 39aa0e0e17..fa3a261c4b 100644
--- a/ql/src/test/queries/clientpositive/char_udf1.q
+++ b/ql/src/test/queries/clientpositive/char_udf1.q
@@ -142,8 +142,8 @@ from char_udf_1 limit 1;
-- Aggregate Functions
select
- compute_stats(c2, 16),
- compute_stats(c4, 16)
+ compute_stats(c2, 'fm', 16),
+ compute_stats(c4, 'fm', 16)
from char_udf_1;
select
diff --git a/ql/src/test/queries/clientpositive/compute_stats_date.q b/ql/src/test/queries/clientpositive/compute_stats_date.q
index 09128f6fb9..bf478526ba 100644
--- a/ql/src/test/queries/clientpositive/compute_stats_date.q
+++ b/ql/src/test/queries/clientpositive/compute_stats_date.q
@@ -13,7 +13,7 @@ load data local inpath '../../data/files/flights_join.txt' overwrite into table
select count(*) from tab_date;
-- compute statistical summary of data
-select compute_stats(fl_date, 16) from tab_date;
+select compute_stats(fl_date, 'hll') from tab_date;
explain
analyze table tab_date compute statistics for columns fl_date;
diff --git a/ql/src/test/queries/clientpositive/compute_stats_decimal.q b/ql/src/test/queries/clientpositive/compute_stats_decimal.q
index 76e1468ada..2beafaf219 100644
--- a/ql/src/test/queries/clientpositive/compute_stats_decimal.q
+++ b/ql/src/test/queries/clientpositive/compute_stats_decimal.q
@@ -8,4 +8,4 @@ LOAD DATA LOCAL INPATH "../../data/files/decimal.txt" INTO TABLE tab_decimal;
select count(*) from tab_decimal;
-- compute statistical summary of data
-select compute_stats(a, 18) from tab_decimal;
+select compute_stats(a, 'fm', 18) from tab_decimal;
diff --git a/ql/src/test/queries/clientpositive/compute_stats_double.q b/ql/src/test/queries/clientpositive/compute_stats_double.q
index 7a1e0f6295..6bae0643a8 100644
--- a/ql/src/test/queries/clientpositive/compute_stats_double.q
+++ b/ql/src/test/queries/clientpositive/compute_stats_double.q
@@ -6,4 +6,4 @@ LOAD DATA LOCAL INPATH "../../data/files/double.txt" INTO TABLE tab_double;
select count(*) from tab_double;
-- compute statistical summary of data
-select compute_stats(a, 16) from tab_double;
+select compute_stats(a, 'fm', 16) from tab_double;
diff --git a/ql/src/test/queries/clientpositive/compute_stats_long.q b/ql/src/test/queries/clientpositive/compute_stats_long.q
index 6a2070f780..48f4ebb979 100644
--- a/ql/src/test/queries/clientpositive/compute_stats_long.q
+++ b/ql/src/test/queries/clientpositive/compute_stats_long.q
@@ -6,4 +6,4 @@ LOAD DATA LOCAL INPATH "../../data/files/int.txt" INTO TABLE tab_int;
select count(*) from tab_int;
-- compute statistical summary of data
-select compute_stats(a, 16) from tab_int;
+select compute_stats(a, 'fm', 16) from tab_int;
diff --git a/ql/src/test/queries/clientpositive/compute_stats_string.q b/ql/src/test/queries/clientpositive/compute_stats_string.q
index 0023e7f6bd..79a531e8ec 100644
--- a/ql/src/test/queries/clientpositive/compute_stats_string.q
+++ b/ql/src/test/queries/clientpositive/compute_stats_string.q
@@ -6,4 +6,4 @@ LOAD DATA LOCAL INPATH "../../data/files/string.txt" INTO TABLE tab_string;
select count(*) from tab_string;
-- compute statistical summary of data
-select compute_stats(a, 16) from tab_string;
+select compute_stats(a, 'fm', 16) from tab_string;
diff --git a/ql/src/test/queries/clientpositive/hll.q b/ql/src/test/queries/clientpositive/hll.q
new file mode 100644
index 0000000000..edfdce8a29
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/hll.q
@@ -0,0 +1,46 @@
+set hive.mapred.mode=nonstrict;
+
+create table i(key int);
+
+insert overwrite table i select key from src;
+
+explain analyze table i compute statistics for columns;
+
+analyze table i compute statistics for columns;
+
+desc formatted i key;
+
+drop table i;
+
+create table i(key double);
+
+insert overwrite table i select key from src;
+
+analyze table i compute statistics for columns;
+
+desc formatted i key;
+
+drop table i;
+
+create table i(key decimal);
+
+insert overwrite table i select key from src;
+
+analyze table i compute statistics for columns;
+
+desc formatted i key;
+
+drop table i;
+
+create table i(key date);
+
+insert into i values ('2012-08-17');
+insert into i values ('2012-08-17');
+insert into i values ('2013-08-17');
+insert into i values ('2012-03-17');
+insert into i values ('2012-05-17');
+
+analyze table i compute statistics for columns;
+
+desc formatted i key;
+
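
The new hll.q above does not pass an estimator name at all; it appears to rely on whatever default the analyze ... compute statistics for columns path selects, while the other updated queries in this patch pin the algorithm explicitly through the new second argument of compute_stats (for example compute_stats(a, 'fm', 16) or compute_stats(fl_date, 'hll')).
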
diff --git a/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
index 8bbae3914d..d72fad5c0f 100644
--- a/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
+++ b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
@@ -7,7 +7,7 @@ set hive.groupby.skewindata=false;
set mapred.reduce.tasks=31;
-select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+select compute_stats(a,'fm',16),compute_stats(b,'fm',16),compute_stats(c,'fm',16),compute_stats(d,'fm',16)
from
(
select
@@ -17,7 +17,7 @@ select
var_samp(substr(src.value,5)) as d
from src)subq;
-explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+explain select compute_stats(a,'fm',16),compute_stats(b,'fm',16),compute_stats(c,'fm',16),compute_stats(d,'fm',16)
from
(
select
@@ -27,7 +27,7 @@ select
var_samp(substr(src.value,5)) as d
from src)subq;
-select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+select compute_stats(a,'fm',16),compute_stats(b,'fm',16),compute_stats(c,'fm',16),compute_stats(d,'fm',16)
from
(
select
@@ -39,7 +39,7 @@ select
set hive.optimize.reducededuplication=false;
-explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+explain select compute_stats(a,'fm',16),compute_stats(b,'fm',16),compute_stats(c,'fm',16),compute_stats(d,'fm',16)
from
(
select
@@ -49,7 +49,7 @@ select
var_samp(substr(src.value,5)) as d
from src)subq;
-select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+select compute_stats(a,'fm',16),compute_stats(b,'fm',16),compute_stats(c,'fm',16),compute_stats(d,'fm',16)
from
(
select
diff --git a/ql/src/test/queries/clientpositive/varchar_udf1.q b/ql/src/test/queries/clientpositive/varchar_udf1.q
index 4d1f884ea7..1039ed9848 100644
--- a/ql/src/test/queries/clientpositive/varchar_udf1.q
+++ b/ql/src/test/queries/clientpositive/varchar_udf1.q
@@ -139,8 +139,8 @@ from varchar_udf_1 limit 1;
-- Aggregate Functions
select
- compute_stats(c2, 16),
- compute_stats(c4, 16)
+ compute_stats(c2, 'fm', 16),
+ compute_stats(c4, 'fm', 16)
from varchar_udf_1;
select
diff --git a/ql/src/test/queries/clientpositive/vector_udf1.q b/ql/src/test/queries/clientpositive/vector_udf1.q
index 48d3e1ee4d..c1d43725d2 100644
--- a/ql/src/test/queries/clientpositive/vector_udf1.q
+++ b/ql/src/test/queries/clientpositive/vector_udf1.q
@@ -351,8 +351,8 @@ select
from varchar_udf_1;
select
- compute_stats(c2, 16),
- compute_stats(c4, 16)
+ compute_stats(c2, 'fm', 16),
+ compute_stats(c4, 'fm', 16)
from varchar_udf_1;
explain vectorization detail
diff --git a/ql/src/test/results/clientpositive/alter_partition_update_status.q.out b/ql/src/test/results/clientpositive/alter_partition_update_status.q.out
index 922822e6d2..c0d4eeefb4 100644
--- a/ql/src/test/results/clientpositive/alter_partition_update_status.q.out
+++ b/ql/src/test/results/clientpositive/alter_partition_update_status.q.out
@@ -36,7 +36,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_stat_part_one
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 14 1.72 3 from deserializer
+key string 0 16 1.72 3 from deserializer
PREHOOK: query: ALTER TABLE src_stat_part_one PARTITION(partitionId=1) UPDATE STATISTICS for column key SET ('numDVs'='11','avgColLen'='2.2')
PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS
POSTHOOK: query: ALTER TABLE src_stat_part_one PARTITION(partitionId=1) UPDATE STATISTICS for column key SET ('numDVs'='11','avgColLen'='2.2')
@@ -88,7 +88,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_stat_part_two
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 14 1.72 3 from deserializer
+key string 0 16 1.72 3 from deserializer
PREHOOK: query: ALTER TABLE src_stat_part_two PARTITION(px=1, py='a') UPDATE STATISTICS for column key SET ('numDVs'='30','maxColLen'='40')
PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS
POSTHOOK: query: ALTER TABLE src_stat_part_two PARTITION(px=1, py='a') UPDATE STATISTICS for column key SET ('numDVs'='30','maxColLen'='40')
diff --git a/ql/src/test/results/clientpositive/alter_table_column_stats.q.out b/ql/src/test/results/clientpositive/alter_table_column_stats.q.out
index 2cc7cbc7b6..96dce1e2c5 100644
--- a/ql/src/test/results/clientpositive/alter_table_column_stats.q.out
+++ b/ql/src/test/results/clientpositive/alter_table_column_stats.q.out
@@ -125,7 +125,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable0 col2
PREHOOK: type: DESCTABLE
@@ -135,7 +135,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable0 col3
PREHOOK: type: DESCTABLE
@@ -201,7 +201,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col2
PREHOOK: type: DESCTABLE
@@ -211,7 +211,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col3
PREHOOK: type: DESCTABLE
@@ -276,7 +276,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col2
PREHOOK: type: DESCTABLE
@@ -286,7 +286,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col4
PREHOOK: type: DESCTABLE
@@ -361,7 +361,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col4
PREHOOK: type: DESCTABLE
@@ -437,7 +437,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb2@testtable2
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb2.testtable2 col4
PREHOOK: type: DESCTABLE
@@ -551,7 +551,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part1') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -560,7 +560,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part1') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -618,7 +618,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -627,7 +627,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part2') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -737,7 +737,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part1') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -746,7 +746,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part1') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -804,7 +804,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -813,7 +813,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -924,7 +924,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -933,7 +933,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -991,7 +991,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1000,7 +1000,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1113,7 +1113,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1122,7 +1122,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1180,7 +1180,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1189,7 +1189,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1311,7 +1311,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1378,7 +1378,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -1457,7 +1457,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb2@testpart2
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb2.testpart2 partition (part = 'part11') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb2@testpart2
@@ -1484,7 +1484,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb2@testpart2
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb2.testpart2 partition (part = 'part2') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb2@testpart2
@@ -1665,7 +1665,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable0 col2
PREHOOK: type: DESCTABLE
@@ -1675,7 +1675,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable0 col3
PREHOOK: type: DESCTABLE
@@ -1741,7 +1741,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col2
PREHOOK: type: DESCTABLE
@@ -1751,7 +1751,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\",\"col3\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col3
PREHOOK: type: DESCTABLE
@@ -1816,7 +1816,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col2
PREHOOK: type: DESCTABLE
@@ -1826,7 +1826,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\",\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col4
PREHOOK: type: DESCTABLE
@@ -1901,7 +1901,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testtable1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb1.testtable1 col4
PREHOOK: type: DESCTABLE
@@ -1977,7 +1977,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb2@testtable2
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col2\":\"true\"}}
PREHOOK: query: describe formatted statsdb2.testtable2 col4
PREHOOK: type: DESCTABLE
@@ -2091,7 +2091,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part1') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -2100,7 +2100,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part1') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -2158,7 +2158,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -2167,7 +2167,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart0
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart0 partition (part = 'part2') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart0
@@ -2277,7 +2277,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part1') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2286,7 +2286,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part1') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2344,7 +2344,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2353,7 +2353,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2464,7 +2464,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2473,7 +2473,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2531,7 +2531,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2540,7 +2540,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col3
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2653,7 +2653,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 8 from deserializer
+col1 int 27 484 0 10 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2662,7 +2662,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2720,7 +2720,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col1 int 27 484 0 18 from deserializer
+col1 int 27 484 0 20 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col2
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2729,7 +2729,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2851,7 +2851,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part11') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2918,7 +2918,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb1@testpart1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb1.testpart1 partition (part = 'part2') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb1@testpart1
@@ -2997,7 +2997,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb2@testpart2
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 12 6.7 7 from deserializer
+col2 string 0 10 6.7 7 from deserializer
PREHOOK: query: describe formatted statsdb2.testpart2 partition (part = 'part11') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb2@testpart2
@@ -3024,7 +3024,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: statsdb2@testpart2
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-col2 string 0 18 6.8 7 from deserializer
+col2 string 0 20 6.8 7 from deserializer
PREHOOK: query: describe formatted statsdb2.testpart2 partition (part = 'part2') col4
PREHOOK: type: DESCTABLE
PREHOOK: Input: statsdb2@testpart2
diff --git a/ql/src/test/results/clientpositive/alter_table_update_status.q.out b/ql/src/test/results/clientpositive/alter_table_update_status.q.out
index e26e8cba1c..9cd9a8dbe0 100644
--- a/ql/src/test/results/clientpositive/alter_table_update_status.q.out
+++ b/ql/src/test/results/clientpositive/alter_table_update_status.q.out
@@ -48,7 +48,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_stat
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 14 1.72 3 from deserializer
+key string 0 16 1.72 3 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}}
PREHOOK: query: ALTER TABLE src_stat UPDATE STATISTICS for column key SET ('numDVs'='1111','avgColLen'='1.111')
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
@@ -94,7 +94,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_stat_int
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key double 66.0 406.0 10 14 from deserializer
+key double 66.0 406.0 10 15 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"key\":\"true\"}}
PREHOOK: query: ALTER TABLE src_stat_int UPDATE STATISTICS for column key SET ('numDVs'='2222','lowValue'='333.22','highValue'='22.22')
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
diff --git a/ql/src/test/results/clientpositive/analyze_tbl_part.q.out b/ql/src/test/results/clientpositive/analyze_tbl_part.q.out
index ed90b6fc92..6a3fbc0cc7 100644
--- a/ql/src/test/results/clientpositive/analyze_tbl_part.q.out
+++ b/ql/src/test/results/clientpositive/analyze_tbl_part.q.out
@@ -50,7 +50,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_stat_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 14 1.72 3 from deserializer
+key string 0 16 1.72 3 from deserializer
PREHOOK: query: ANALYZE TABLE src_stat_part partition (partitionId) COMPUTE STATISTICS for columns key, value
PREHOOK: type: QUERY
PREHOOK: Input: default@src_stat_part
@@ -71,7 +71,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_stat_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 14 1.72 3 from deserializer
+key string 0 16 1.72 3 from deserializer
PREHOOK: query: describe formatted src_stat_part PARTITION(partitionId=2) value
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@src_stat_part
@@ -80,7 +80,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_stat_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value string 0 14 4.92 7 from deserializer
+value string 0 19 4.92 7 from deserializer
PREHOOK: query: create table src_stat_string_part(key string, value string) partitioned by (partitionName string)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
diff --git a/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out b/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
index 95dd6abaec..6e2975e671 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
@@ -121,9 +121,9 @@ STAGE PLANS:
Statistics: Num rows: 2098 Data size: 16744 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (((t = 1) and (si = 2)) or ((t = 2) and (si = 3)) or ((t = 3) and (si = 4)) or ((t = 4) and (si = 5)) or ((t = 5) and (si = 6)) or ((t = 6) and (si = 7)) or ((t = 7) and (si = 8)) or ((t = 9) and (si = 10)) or ((t = 10) and (si = 11)) or ((t = 11) and (si = 12)) or ((t = 12) and (si = 13)) or ((t = 13) and (si = 14)) or ((t = 14) and (si = 15)) or ((t = 15) and (si = 16)) or ((t = 16) and (si = 17)) or ((t = 17) and (si = 18)) or ((t = 27) and (si = 28)) or ((t = 37) and (si = 38)) or ((t = 47) and (si = 48)) or ((t = 52) and (si = 53))) (type: boolean)
- Statistics: Num rows: 300 Data size: 2400 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 160 Data size: 1280 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
- Statistics: Num rows: 300 Data size: 2400 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 160 Data size: 1280 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
diff --git a/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out b/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
index a8e4854a00..fccfabd5d1 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
@@ -129,13 +129,13 @@ STAGE PLANS:
keys: KEY._col0 (type: string), KEY._col1 (type: int)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 7 Data size: 658 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 564 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: min(_col1)
keys: _col0 (type: string), _col2 (type: bigint)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
File Output Operator
compressed: false
table:
@@ -151,7 +151,7 @@ STAGE PLANS:
key expressions: _col0 (type: string), _col1 (type: bigint)
sort order: ++
Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
value expressions: _col2 (type: int)
Reduce Operator Tree:
Group By Operator
@@ -159,10 +159,10 @@ STAGE PLANS:
keys: KEY._col0 (type: string), KEY._col1 (type: bigint)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
File Output Operator
compressed: false
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/annotate_stats_join.q.out b/ql/src/test/results/clientpositive/annotate_stats_join.q.out
index c1a140b558..736016f538 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_join.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_join.q.out
@@ -236,10 +236,10 @@ STAGE PLANS:
0 _col0 (type: string), _col1 (type: int)
1 _col1 (type: string), _col0 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -302,10 +302,10 @@ STAGE PLANS:
0 _col0 (type: string), _col1 (type: int)
1 _col1 (type: string), _col0 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -368,10 +368,10 @@ STAGE PLANS:
0 _col0 (type: string), _col1 (type: int)
1 _col1 (type: string), _col0 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -538,10 +538,10 @@ STAGE PLANS:
1 _col0 (type: int)
2 _col1 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
- Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 64 Data size: 18944 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 64 Data size: 18944 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -682,10 +682,10 @@ STAGE PLANS:
0 _col0 (type: string), _col1 (type: int)
1 _col1 (type: string), _col0 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 48 Data size: 5417 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 48 Data size: 5607 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 48 Data size: 5417 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 48 Data size: 5607 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -753,10 +753,10 @@ STAGE PLANS:
0 _col0 (type: string), _col1 (type: int)
1 _col0 (type: string), _col1 (type: int)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 3 Data size: 297 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 396 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 3 Data size: 297 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 396 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -813,10 +813,10 @@ STAGE PLANS:
0 _col0 (type: string), _col1 (type: int)
1 _col1 (type: string), _col0 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 1552 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -873,10 +873,10 @@ STAGE PLANS:
0 _col0 (type: string), _col1 (type: int)
1 _col1 (type: string), _col0 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 54 Data size: 1358 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 54 Data size: 1746 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 54 Data size: 1358 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 54 Data size: 1746 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out b/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
index f7d73c9ddf..e04c1c6bc5 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
@@ -807,14 +807,14 @@ STAGE PLANS:
1 _col0 (type: int)
2 _col0 (type: int)
outputColumnNames: _col1
- Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col1 (type: int)
outputColumnNames: _col0
- Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -979,14 +979,14 @@ STAGE PLANS:
1 _col0 (type: int)
2 _col0 (type: int)
outputColumnNames: _col1
- Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col1 (type: int)
outputColumnNames: _col0
- Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -1065,14 +1065,14 @@ STAGE PLANS:
1 _col0 (type: int)
2 _col0 (type: int)
outputColumnNames: _col2
- Statistics: Num rows: 273 Data size: 1092 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 321 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col2 (type: int)
outputColumnNames: _col0
- Statistics: Num rows: 273 Data size: 1092 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 321 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 273 Data size: 1092 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 321 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -1177,14 +1177,14 @@ STAGE PLANS:
0 _col0 (type: int)
1 _col0 (type: int)
outputColumnNames: _col2
- Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 241 Data size: 964 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col2 (type: int)
outputColumnNames: _col0
- Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 241 Data size: 964 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 241 Data size: 964 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/autoColumnStats_4.q.out b/ql/src/test/results/clientpositive/autoColumnStats_4.q.out
index fe3b9e53ef..e84499995b 100644
--- a/ql/src/test/results/clientpositive/autoColumnStats_4.q.out
+++ b/ql/src/test/results/clientpositive/autoColumnStats_4.q.out
@@ -116,10 +116,10 @@ STAGE PLANS:
outputColumnNames: a, b
Statistics: Num rows: 10 Data size: 2150 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(a, 16), compute_stats(b, 16)
+ aggregations: compute_stats(a, 'hll'), compute_stats(b, 'hll')
mode: hash
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
@@ -153,17 +153,17 @@ STAGE PLANS:
TableScan
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct)
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
mode: mergepartial
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/autoColumnStats_5.q.out b/ql/src/test/results/clientpositive/autoColumnStats_5.q.out
index e19fb5f504..e3abba5bd0 100644
--- a/ql/src/test/results/clientpositive/autoColumnStats_5.q.out
+++ b/ql/src/test/results/clientpositive/autoColumnStats_5.q.out
@@ -46,7 +46,7 @@ STAGE PLANS:
outputColumnNames: a, b, part
Statistics: Num rows: 1 Data size: 44 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(a, 16), compute_stats(b, 16)
+ aggregations: compute_stats(a, 'hll'), compute_stats(b, 'hll')
keys: part (type: int)
mode: hash
outputColumnNames: _col0, _col1, _col2
@@ -56,7 +56,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 1 Data size: 44 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: struct), _col2 (type: struct)
+ value expressions: _col1 (type: struct), _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
@@ -195,7 +195,7 @@ POSTHOOK: Input: default@partitioned1
col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-a int 1 4 0 5 from deserializer
+a int 1 4 0 4 from deserializer
PREHOOK: query: alter table partitioned1 add columns(c int, d string)
PREHOOK: type: ALTERTABLE_ADDCOLS
PREHOOK: Input: default@partitioned1
@@ -284,7 +284,7 @@ STAGE PLANS:
outputColumnNames: a, b, c, d, part
Statistics: Num rows: 1 Data size: 60 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(a, 16), compute_stats(b, 16), compute_stats(c, 16), compute_stats(d, 16)
+ aggregations: compute_stats(a, 'hll'), compute_stats(b, 'hll'), compute_stats(c, 'hll'), compute_stats(d, 'hll')
keys: part (type: int)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4
@@ -294,7 +294,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 1 Data size: 60 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct)
+ value expressions: _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2), compute_stats(VALUE._col3)
@@ -478,7 +478,7 @@ STAGE PLANS:
outputColumnNames: a, b, c, d, part
Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(a, 16), compute_stats(b, 16), compute_stats(c, 16), compute_stats(d, 16)
+ aggregations: compute_stats(a, 'hll'), compute_stats(b, 'hll'), compute_stats(c, 'hll'), compute_stats(d, 'hll')
keys: part (type: int)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4
@@ -488,7 +488,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct)
+ value expressions: _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2), compute_stats(VALUE._col3)
@@ -629,7 +629,7 @@ POSTHOOK: Input: default@partitioned1
col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-a int 1 6 0 5 from deserializer
+a int 1 6 0 4 from deserializer
PREHOOK: query: desc formatted partitioned1 partition(part=1) c
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@partitioned1
@@ -639,4 +639,4 @@ POSTHOOK: Input: default@partitioned1
col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-c int 100 200 0 3 from deserializer
+c int 100 200 0 2 from deserializer
diff --git a/ql/src/test/results/clientpositive/autoColumnStats_6.q.out b/ql/src/test/results/clientpositive/autoColumnStats_6.q.out
index 29b3373e10..1b125701d7 100644
--- a/ql/src/test/results/clientpositive/autoColumnStats_6.q.out
+++ b/ql/src/test/results/clientpositive/autoColumnStats_6.q.out
@@ -59,7 +59,7 @@ STAGE PLANS:
outputColumnNames: key, value, one, two, three
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(key, 16), compute_stats(value, 16)
+ aggregations: compute_stats(key, 'hll'), compute_stats(value, 'hll')
keys: one (type: string), two (type: string), three (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4
@@ -69,7 +69,7 @@ STAGE PLANS:
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string), _col2 (type: string)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col3 (type: struct), _col4 (type: struct)
+ value expressions: _col3 (type: struct), _col4 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
diff --git a/ql/src/test/results/clientpositive/autoColumnStats_7.q.out b/ql/src/test/results/clientpositive/autoColumnStats_7.q.out
index 9d24bc53ab..9e2121e0de 100644
--- a/ql/src/test/results/clientpositive/autoColumnStats_7.q.out
+++ b/ql/src/test/results/clientpositive/autoColumnStats_7.q.out
@@ -132,10 +132,10 @@ STAGE PLANS:
value expressions: key (type: string), c1 (type: int), c2 (type: string)
Reduce Operator Tree:
Group By Operator
- aggregations: compute_stats(VALUE._col0, 16), compute_stats(VALUE._col2, 16), compute_stats(VALUE._col3, 16)
+ aggregations: compute_stats(VALUE._col0, 'hll'), compute_stats(VALUE._col2, 'hll'), compute_stats(VALUE._col3, 'hll')
mode: partial1
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1460 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1424 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
@@ -149,17 +149,17 @@ STAGE PLANS:
TableScan
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 1460 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
+ Statistics: Num rows: 1 Data size: 1424 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2)
mode: final
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1464 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 1464 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/autoColumnStats_8.q.out b/ql/src/test/results/clientpositive/autoColumnStats_8.q.out
index 681d962ed0..cdf2082d53 100644
--- a/ql/src/test/results/clientpositive/autoColumnStats_8.q.out
+++ b/ql/src/test/results/clientpositive/autoColumnStats_8.q.out
@@ -104,7 +104,7 @@ STAGE PLANS:
outputColumnNames: key, value, ds, hr
Statistics: Num rows: 666 Data size: 7075 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(key, 16), compute_stats(value, 16)
+ aggregations: compute_stats(key, 'hll'), compute_stats(value, 'hll')
keys: ds (type: string), hr (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
@@ -116,7 +116,7 @@ STAGE PLANS:
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
Statistics: Num rows: 666 Data size: 7075 Basic stats: COMPLETE Column stats: NONE
tag: -1
- value expressions: _col2 (type: struct), _col3 (type: struct)
+ value expressions: _col2 (type: struct), _col3 (type: struct)
auto parallelism: false
Filter Operator
isSamplingPred: false
@@ -161,7 +161,7 @@ STAGE PLANS:
outputColumnNames: key, value, hr
Statistics: Num rows: 666 Data size: 7075 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(key, 16), compute_stats(value, 16)
+ aggregations: compute_stats(key, 'hll'), compute_stats(value, 'hll')
keys: '2008-12-31' (type: string), hr (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
@@ -177,7 +177,7 @@ STAGE PLANS:
properties:
column.name.delimiter ,
columns _col0,_col1,_col2,_col3
- columns.types string,string,struct,struct
+ columns.types string,string,struct,struct
escape.delim \
serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
@@ -511,7 +511,7 @@ STAGE PLANS:
Map-reduce partition columns: '2008-12-31' (type: string), _col1 (type: string)
Statistics: Num rows: 666 Data size: 7075 Basic stats: COMPLETE Column stats: NONE
tag: -1
- value expressions: _col2 (type: struct), _col3 (type: struct)
+ value expressions: _col2 (type: struct), _col3 (type: struct)
auto parallelism: false
Path -> Alias:
#### A masked pattern was here ####
@@ -524,7 +524,7 @@ STAGE PLANS:
properties:
column.name.delimiter ,
columns _col0,_col1,_col2,_col3
- columns.types string,string,struct,struct
+ columns.types string,string,struct,struct
escape.delim \
serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
@@ -534,7 +534,7 @@ STAGE PLANS:
properties:
column.name.delimiter ,
columns _col0,_col1,_col2,_col3
- columns.types string,string,struct,struct
+ columns.types string,string,struct,struct
escape.delim \
serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
diff --git a/ql/src/test/results/clientpositive/autoColumnStats_9.q.out b/ql/src/test/results/clientpositive/autoColumnStats_9.q.out
index d26e2c02b7..06f23b1e7c 100644
--- a/ql/src/test/results/clientpositive/autoColumnStats_9.q.out
+++ b/ql/src/test/results/clientpositive/autoColumnStats_9.q.out
@@ -86,10 +86,10 @@ STAGE PLANS:
outputColumnNames: key, value
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(key, 16), compute_stats(value, 16)
+ aggregations: compute_stats(key, 'hll'), compute_stats(value, 'hll')
mode: hash
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
@@ -142,10 +142,10 @@ STAGE PLANS:
outputColumnNames: key, value
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(key, 16), compute_stats(value, 16)
+ aggregations: compute_stats(key, 'hll'), compute_stats(value, 'hll')
mode: hash
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
@@ -181,17 +181,17 @@ STAGE PLANS:
TableScan
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct)
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
mode: mergepartial
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -252,7 +252,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@dest_j1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key int 0 498 0 196 from deserializer
+key int 0 498 0 309 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
PREHOOK: query: desc formatted dest_j1 value
PREHOOK: type: DESCTABLE
@@ -262,5 +262,5 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@dest_j1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value string 0 214 6.834630350194552 7 from deserializer
+value string 0 309 6.834630350194552 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
diff --git a/ql/src/test/results/clientpositive/auto_join_without_localtask.q.out b/ql/src/test/results/clientpositive/auto_join_without_localtask.q.out
index 17a912ec13..57f00674de 100644
--- a/ql/src/test/results/clientpositive/auto_join_without_localtask.q.out
+++ b/ql/src/test/results/clientpositive/auto_join_without_localtask.q.out
@@ -285,24 +285,24 @@ STAGE PLANS:
Stage: Stage-14
Map Reduce Local Work
Alias -> Map Local Tables:
- $hdt$_1:c
+ $hdt$_1:b
Fetch Operator
limit: -1
Alias -> Map Local Operator Tree:
- $hdt$_1:c
+ $hdt$_1:b
TableScan
- alias: c
+ alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: value is not null (type: boolean)
+ predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: value (type: string)
+ expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
HashTable Sink Operator
keys:
- 0 _col1 (type: string)
+ 0 _col0 (type: string)
1 _col0 (type: string)
Stage: Stage-9
@@ -322,7 +322,7 @@ STAGE PLANS:
condition map:
Inner Join 0 to 1
keys:
- 0 _col1 (type: string)
+ 0 _col0 (type: string)
1 _col0 (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
@@ -341,24 +341,24 @@ STAGE PLANS:
Stage: Stage-12
Map Reduce Local Work
Alias -> Map Local Tables:
- $hdt$_2:b
+ $hdt$_2:c
Fetch Operator
limit: -1
Alias -> Map Local Operator Tree:
- $hdt$_2:b
+ $hdt$_2:c
TableScan
- alias: b
+ alias: c
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: key is not null (type: boolean)
+ predicate: value is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: key (type: string)
+ expressions: value (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
HashTable Sink Operator
keys:
- 0 _col0 (type: string)
+ 0 _col1 (type: string)
1 _col0 (type: string)
Stage: Stage-6
@@ -369,7 +369,7 @@ STAGE PLANS:
condition map:
Inner Join 0 to 1
keys:
- 0 _col0 (type: string)
+ 0 _col1 (type: string)
1 _col0 (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE
@@ -411,20 +411,20 @@ STAGE PLANS:
Map Reduce
Map Operator Tree:
TableScan
- alias: b
+ alias: c
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: key is not null (type: boolean)
+ predicate: value is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: key (type: string)
+ expressions: value (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Map Join Operator
condition map:
Inner Join 0 to 1
keys:
- 0 _col0 (type: string)
+ 0 _col1 (type: string)
1 _col0 (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE
@@ -449,19 +449,19 @@ STAGE PLANS:
Map Operator Tree:
TableScan
Reduce Output Operator
- key expressions: _col0 (type: string)
+ key expressions: _col1 (type: string)
sort order: +
- Map-reduce partition columns: _col0 (type: string)
+ Map-reduce partition columns: _col1 (type: string)
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: string)
+ value expressions: _col0 (type: string)
TableScan
- alias: b
+ alias: c
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: key is not null (type: boolean)
+ predicate: value is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: key (type: string)
+ expressions: value (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
@@ -474,7 +474,7 @@ STAGE PLANS:
condition map:
Inner Join 0 to 1
keys:
- 0 _col0 (type: string)
+ 0 _col1 (type: string)
1 _col0 (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE
@@ -505,27 +505,27 @@ STAGE PLANS:
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
HashTable Sink Operator
keys:
- 0 _col1 (type: string)
+ 0 _col0 (type: string)
1 _col0 (type: string)
Stage: Stage-10
Map Reduce
Map Operator Tree:
TableScan
- alias: c
+ alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: value is not null (type: boolean)
+ predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: value (type: string)
+ expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Map Join Operator
condition map:
Inner Join 0 to 1
keys:
- 0 _col1 (type: string)
+ 0 _col0 (type: string)
1 _col0 (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
@@ -552,19 +552,19 @@ STAGE PLANS:
outputColumnNames: _col0, _col1
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
- key expressions: _col1 (type: string)
+ key expressions: _col0 (type: string)
sort order: +
- Map-reduce partition columns: _col1 (type: string)
+ Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: string)
+ value expressions: _col1 (type: string)
TableScan
- alias: c
+ alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: value is not null (type: boolean)
+ predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: value (type: string)
+ expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
@@ -577,7 +577,7 @@ STAGE PLANS:
condition map:
Inner Join 0 to 1
keys:
- 0 _col1 (type: string)
+ 0 _col0 (type: string)
1 _col0 (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
diff --git a/ql/src/test/results/clientpositive/avro_decimal.q.out b/ql/src/test/results/clientpositive/avro_decimal.q.out
index 5a3b72defe..e1045ebea1 100644
--- a/ql/src/test/results/clientpositive/avro_decimal.q.out
+++ b/ql/src/test/results/clientpositive/avro_decimal.q.out
@@ -34,7 +34,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@dec
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value decimal(8,4) -12.25 234.79 0 6 from deserializer
+value decimal(8,4) -12.25 234.79 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"value\":\"true\"}}
PREHOOK: query: DROP TABLE IF EXISTS avro_dec
PREHOOK: type: DROPTABLE
diff --git a/ql/src/test/results/clientpositive/avro_decimal_native.q.out b/ql/src/test/results/clientpositive/avro_decimal_native.q.out
index fe77512191..b73b5f5679 100644
--- a/ql/src/test/results/clientpositive/avro_decimal_native.q.out
+++ b/ql/src/test/results/clientpositive/avro_decimal_native.q.out
@@ -38,7 +38,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@dec
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value decimal(8,4) -12.25 234.79 0 6 from deserializer
+value decimal(8,4) -12.25 234.79 0 10 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"value\":\"true\"}}
PREHOOK: query: DROP TABLE IF EXISTS avro_dec
PREHOOK: type: DROPTABLE
diff --git a/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out b/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out
index f260f034b6..23f5fcfc76 100644
--- a/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out
+++ b/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out
@@ -129,13 +129,13 @@ STAGE PLANS:
keys: KEY._col0 (type: string), KEY._col1 (type: int)
mode: mergepartial
outputColumnNames: state, locid, $f2
- Statistics: Num rows: 7 Data size: 658 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 564 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: min(locid)
keys: state (type: string), $f2 (type: bigint)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
File Output Operator
compressed: false
table:
@@ -151,7 +151,7 @@ STAGE PLANS:
key expressions: _col0 (type: string), _col1 (type: bigint)
sort order: ++
Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
value expressions: _col2 (type: int)
Reduce Operator Tree:
Group By Operator
@@ -159,10 +159,10 @@ STAGE PLANS:
keys: KEY._col0 (type: string), KEY._col1 (type: bigint)
mode: mergepartial
outputColumnNames: state, $f2, $f2_0
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
File Output Operator
compressed: false
- Statistics: Num rows: 7 Data size: 686 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 588 Basic stats: COMPLETE Column stats: PARTIAL
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/cbo_rp_join0.q.out b/ql/src/test/results/clientpositive/cbo_rp_join0.q.out
index b9cf3ceab4..29499a1f54 100644
--- a/ql/src/test/results/clientpositive/cbo_rp_join0.q.out
+++ b/ql/src/test/results/clientpositive/cbo_rp_join0.q.out
@@ -68,14 +68,14 @@ STAGE PLANS:
1 key (type: string)
2 key (type: string)
outputColumnNames: key, c_int, key0, c_int0
- Statistics: Num rows: 324 Data size: 57494 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 216 Data size: 38270 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: string), c_int (type: int), key0 (type: string), c_int0 (type: int)
outputColumnNames: key, c_int, p, q
- Statistics: Num rows: 324 Data size: 57494 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 216 Data size: 38270 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 324 Data size: 57494 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 216 Data size: 38270 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -730,14 +730,14 @@ STAGE PLANS:
2 key (type: string)
3 key (type: string)
outputColumnNames: key, c_int, key0, c_int0, key1, c_int2
- Statistics: Num rows: 1620 Data size: 432273 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1080 Data size: 288093 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: string), c_int (type: int), key0 (type: string), c_int0 (type: int), key1 (type: string), c_int2 (type: int)
outputColumnNames: key, c_int, p, q, x, b
- Statistics: Num rows: 1620 Data size: 432273 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1080 Data size: 288093 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 1620 Data size: 432273 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1080 Data size: 288093 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/char_udf1.q.out b/ql/src/test/results/clientpositive/char_udf1.q.out
index 07ce108a75..fefc7407e0 100644
--- a/ql/src/test/results/clientpositive/char_udf1.q.out
+++ b/ql/src/test/results/clientpositive/char_udf1.q.out
@@ -393,15 +393,15 @@ POSTHOOK: Input: default@char_udf_1
#### A masked pattern was here ####
val_238 val_238 true
PREHOOK: query: select
- compute_stats(c2, 16),
- compute_stats(c4, 16)
+ compute_stats(c2, 'fm', 16),
+ compute_stats(c4, 'fm', 16)
from char_udf_1
PREHOOK: type: QUERY
PREHOOK: Input: default@char_udf_1
#### A masked pattern was here ####
POSTHOOK: query: select
- compute_stats(c2, 16),
- compute_stats(c4, 16)
+ compute_stats(c2, 'fm', 16),
+ compute_stats(c4, 'fm', 16)
from char_udf_1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@char_udf_1
diff --git a/ql/src/test/results/clientpositive/colstats_all_nulls.q.out b/ql/src/test/results/clientpositive/colstats_all_nulls.q.out
index 14c5d5b59b..0f2822504f 100644
--- a/ql/src/test/results/clientpositive/colstats_all_nulls.q.out
+++ b/ql/src/test/results/clientpositive/colstats_all_nulls.q.out
@@ -43,7 +43,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@all_nulls
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-a bigint 0 0 5 1 from deserializer
+a bigint 0 0 5 0 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
PREHOOK: query: describe formatted all_nulls b
PREHOOK: type: DESCTABLE
@@ -53,7 +53,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@all_nulls
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-b double 0.0 0.0 5 1 from deserializer
+b double 0.0 0.0 5 0 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
PREHOOK: query: drop table all_nulls
PREHOOK: type: DROPTABLE
diff --git a/ql/src/test/results/clientpositive/column_pruner_multiple_children.q.out b/ql/src/test/results/clientpositive/column_pruner_multiple_children.q.out
index 96feeed49c..9925928da7 100644
--- a/ql/src/test/results/clientpositive/column_pruner_multiple_children.q.out
+++ b/ql/src/test/results/clientpositive/column_pruner_multiple_children.q.out
@@ -108,7 +108,7 @@ STAGE PLANS:
value expressions: key (type: int), value (type: string)
Reduce Operator Tree:
Group By Operator
- aggregations: compute_stats(VALUE._col0, 16), compute_stats(VALUE._col2, 16)
+ aggregations: compute_stats(VALUE._col0, 'hll'), compute_stats(VALUE._col2, 'hll')
mode: complete
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
diff --git a/ql/src/test/results/clientpositive/columnstats_partlvl.q.out b/ql/src/test/results/clientpositive/columnstats_partlvl.q.out
index 07d26e92bb..5ecb20501b 100644
--- a/ql/src/test/results/clientpositive/columnstats_partlvl.q.out
+++ b/ql/src/test/results/clientpositive/columnstats_partlvl.q.out
@@ -52,7 +52,7 @@ STAGE PLANS:
outputColumnNames: employeeid
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16)
+ aggregations: compute_stats(employeeid, 'hll')
keys: 2000.0 (type: double)
mode: hash
outputColumnNames: _col0, _col1
@@ -62,7 +62,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: 2000.0 (type: double)
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: struct)
+ value expressions: _col1 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0)
@@ -112,7 +112,7 @@ STAGE PLANS:
outputColumnNames: employeeid
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16)
+ aggregations: compute_stats(employeeid, 'hll')
keys: 2000.0 (type: double)
mode: hash
outputColumnNames: _col0, _col1
@@ -124,7 +124,7 @@ STAGE PLANS:
Map-reduce partition columns: 2000.0 (type: double)
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
tag: -1
- value expressions: _col1 (type: struct)
+ value expressions: _col1 (type: struct)
auto parallelism: false
Path -> Alias:
#### A masked pattern was here ####
@@ -254,7 +254,7 @@ STAGE PLANS:
outputColumnNames: employeeid
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16)
+ aggregations: compute_stats(employeeid, 'hll')
keys: 4000.0 (type: double)
mode: hash
outputColumnNames: _col0, _col1
@@ -264,7 +264,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: 4000.0 (type: double)
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: struct)
+ value expressions: _col1 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0)
@@ -314,7 +314,7 @@ STAGE PLANS:
outputColumnNames: employeeid
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16)
+ aggregations: compute_stats(employeeid, 'hll')
keys: 4000.0 (type: double)
mode: hash
outputColumnNames: _col0, _col1
@@ -326,7 +326,7 @@ STAGE PLANS:
Map-reduce partition columns: 4000.0 (type: double)
Statistics: Num rows: 26 Data size: 105 Basic stats: COMPLETE Column stats: NONE
tag: -1
- value expressions: _col1 (type: struct)
+ value expressions: _col1 (type: struct)
auto parallelism: false
Path -> Alias:
#### A masked pattern was here ####
@@ -456,7 +456,7 @@ STAGE PLANS:
outputColumnNames: employeeid, employeename
Statistics: Num rows: 1 Data size: 105 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16), compute_stats(employeename, 16)
+ aggregations: compute_stats(employeeid, 'hll'), compute_stats(employeename, 'hll')
keys: 2000.0 (type: double)
mode: hash
outputColumnNames: _col0, _col1, _col2
@@ -466,7 +466,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: 2000.0 (type: double)
Statistics: Num rows: 1 Data size: 105 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: struct), _col2 (type: struct)
+ value expressions: _col1 (type: struct), _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
@@ -511,7 +511,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 34 1 14 from deserializer
+employeeID int 16 34 1 12 from deserializer
PREHOOK: query: describe formatted Employee_Part partition (employeeSalary=2000.0) employeeName
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@employee_part
@@ -520,7 +520,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeName string 1 9 4.3076923076923075 6 from deserializer
+employeeName string 1 12 4.3076923076923075 6 from deserializer
PREHOOK: query: explain
analyze table Employee_Part compute statistics for columns
PREHOOK: type: QUERY
@@ -543,7 +543,7 @@ STAGE PLANS:
outputColumnNames: employeeid, employeename, employeesalary
Statistics: Num rows: 2 Data size: 210 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16), compute_stats(employeename, 16)
+ aggregations: compute_stats(employeeid, 'hll'), compute_stats(employeename, 'hll')
keys: employeesalary (type: double)
mode: hash
outputColumnNames: _col0, _col1, _col2
@@ -553,7 +553,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: _col0 (type: double)
Statistics: Num rows: 2 Data size: 210 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: struct), _col2 (type: struct)
+ value expressions: _col1 (type: struct), _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
@@ -600,7 +600,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 34 1 14 from deserializer
+employeeID int 16 34 1 12 from deserializer
PREHOOK: query: describe formatted Employee_Part partition(employeeSalary=4000.0) employeeID
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@employee_part
@@ -609,7 +609,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 34 1 14 from deserializer
+employeeID int 16 34 1 12 from deserializer
PREHOOK: query: explain
analyze table Employee_Part compute statistics for columns
PREHOOK: type: QUERY
@@ -632,23 +632,23 @@ STAGE PLANS:
outputColumnNames: employeeid, employeename
Statistics: Num rows: 2 Data size: 210 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16), compute_stats(employeename, 16)
+ aggregations: compute_stats(employeeid, 'hll'), compute_stats(employeename, 'hll')
mode: hash
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct)
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
mode: mergepartial
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -681,7 +681,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 34 2 14 from deserializer
+employeeID int 16 34 2 12 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"employeeid\":\"true\",\"employeename\":\"true\"}}
PREHOOK: query: create database if not exists dummydb
PREHOOK: type: CREATEDATABASE
@@ -713,7 +713,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 34 1 14 from deserializer
+employeeID int 16 34 1 12 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"employeeid\":\"true\",\"employeename\":\"true\"}}
PREHOOK: query: analyze table default.Employee_Part compute statistics for columns
PREHOOK: type: QUERY
diff --git a/ql/src/test/results/clientpositive/columnstats_partlvl_dp.q.out b/ql/src/test/results/clientpositive/columnstats_partlvl_dp.q.out
index 468d2e797b..a64c76badf 100644
--- a/ql/src/test/results/clientpositive/columnstats_partlvl_dp.q.out
+++ b/ql/src/test/results/clientpositive/columnstats_partlvl_dp.q.out
@@ -88,7 +88,7 @@ STAGE PLANS:
outputColumnNames: employeeid, employeename, country
Statistics: Num rows: 1 Data size: 64 Basic stats: PARTIAL Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeename, 16), compute_stats(employeeid, 16)
+ aggregations: compute_stats(employeename, 'hll'), compute_stats(employeeid, 'hll')
keys: 4000.0 (type: double), country (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
@@ -98,7 +98,7 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: 4000.0 (type: double), _col1 (type: string)
Statistics: Num rows: 1 Data size: 64 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col2 (type: struct), _col3 (type: struct)
+ value expressions: _col2 (type: struct), _col3 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
@@ -143,7 +143,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeName string 0 6 5.142857142857143 6 from deserializer
+employeeName string 0 7 5.142857142857143 6 from deserializer
PREHOOK: query: explain
analyze table Employee_Part partition (employeeSalary='2000.0') compute statistics for columns employeeID
PREHOOK: type: QUERY
@@ -166,7 +166,7 @@ STAGE PLANS:
outputColumnNames: employeeid, country
Statistics: Num rows: 42 Data size: 169 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16)
+ aggregations: compute_stats(employeeid, 'hll')
keys: 2000.0 (type: double), country (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
@@ -176,7 +176,7 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: 2000.0 (type: double), _col1 (type: string)
Statistics: Num rows: 42 Data size: 169 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col2 (type: struct)
+ value expressions: _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0)
@@ -223,7 +223,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 34 1 14 from deserializer
+employeeID int 16 34 1 12 from deserializer
PREHOOK: query: describe formatted Employee_Part partition (employeeSalary='2000.0', country='UK') employeeID
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@employee_part
@@ -232,7 +232,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 31 0 9 from deserializer
+employeeID int 16 31 0 7 from deserializer
PREHOOK: query: explain
analyze table Employee_Part partition (employeeSalary) compute statistics for columns employeeID
PREHOOK: type: QUERY
@@ -255,7 +255,7 @@ STAGE PLANS:
outputColumnNames: employeeid, employeesalary, country
Statistics: Num rows: 116 Data size: 466 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16)
+ aggregations: compute_stats(employeeid, 'hll')
keys: employeesalary (type: double), country (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
@@ -265,7 +265,7 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col0 (type: double), _col1 (type: string)
Statistics: Num rows: 116 Data size: 466 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col2 (type: struct)
+ value expressions: _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0)
@@ -320,7 +320,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeID int 16 34 1 14 from deserializer
+employeeID int 16 34 1 12 from deserializer
PREHOOK: query: explain
analyze table Employee_Part partition (employeeSalary,country) compute statistics for columns
PREHOOK: type: QUERY
@@ -343,7 +343,7 @@ STAGE PLANS:
outputColumnNames: employeeid, employeename, employeesalary, country
Statistics: Num rows: 2 Data size: 466 Basic stats: PARTIAL Column stats: NONE
Group By Operator
- aggregations: compute_stats(employeeid, 16), compute_stats(employeename, 16)
+ aggregations: compute_stats(employeeid, 'hll'), compute_stats(employeename, 'hll')
keys: employeesalary (type: double), country (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
@@ -353,7 +353,7 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col0 (type: double), _col1 (type: string)
Statistics: Num rows: 2 Data size: 466 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col2 (type: struct), _col3 (type: struct)
+ value expressions: _col2 (type: struct), _col3 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
@@ -408,7 +408,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee_part
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeName string 0 6 5.142857142857143 6 from deserializer
+employeeName string 0 12 5.142857142857143 6 from deserializer
PREHOOK: query: drop table Employee
PREHOOK: type: DROPTABLE
POSTHOOK: query: drop table Employee
@@ -483,7 +483,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeName string 0 6 5.142857142857143 6 from deserializer
+employeeName string 0 12 5.142857142857143 6 from deserializer
PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/employee2.dat" INTO TABLE Employee partition(employeeSalary='3000.0', country='USA')
PREHOOK: type: LOAD
#### A masked pattern was here ####
@@ -530,7 +530,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-employeeName string 0 6 5.142857142857143 6 from deserializer
+employeeName string 0 12 5.142857142857143 6 from deserializer
PREHOOK: query: alter table Employee add columns (c int ,d string)
PREHOOK: type: ALTERTABLE_ADDCOLS
PREHOOK: Input: default@employee
@@ -575,7 +575,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@employee
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-c int 2000 4000 0 4 from deserializer
+c int 2000 4000 0 3 from deserializer
PREHOOK: query: describe formatted Employee partition (employeeSalary='6000.0', country='UK') d
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@employee
diff --git a/ql/src/test/results/clientpositive/columnstats_quoting.q.out b/ql/src/test/results/clientpositive/columnstats_quoting.q.out
index 52e35385a1..7e080fec9b 100644
--- a/ql/src/test/results/clientpositive/columnstats_quoting.q.out
+++ b/ql/src/test/results/clientpositive/columnstats_quoting.q.out
@@ -30,23 +30,23 @@ STAGE PLANS:
outputColumnNames: user id, user name
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
Group By Operator
- aggregations: compute_stats(user id, 16), compute_stats(user name, 16)
+ aggregations: compute_stats(user id, 'hll'), compute_stats(user name, 'hll')
mode: hash
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 968 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct)
+ Statistics: Num rows: 1 Data size: 944 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
mode: mergepartial
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 972 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -87,14 +87,14 @@ STAGE PLANS:
outputColumnNames: user id
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
Group By Operator
- aggregations: compute_stats(user id, 16)
+ aggregations: compute_stats(user id, 'hll')
mode: hash
outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 476 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 476 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct)
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0)
diff --git a/ql/src/test/results/clientpositive/columnstats_tbllvl.q.out b/ql/src/test/results/clientpositive/columnstats_tbllvl.q.out
index 462d4c1771..91c8f150a2 100644
--- a/ql/src/test/results/clientpositive/columnstats_tbllvl.q.out
+++ b/ql/src/test/results/clientpositive/columnstats_tbllvl.q.out
@@ -60,23 +60,23 @@ STAGE PLANS:
outputColumnNames: sourceip, adrevenue, avgtimeonsite
Statistics: Num rows: 65 Data size: 7060 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(sourceip, 16), compute_stats(avgtimeonsite, 16), compute_stats(adrevenue, 16)
+ aggregations: compute_stats(sourceip, 'hll'), compute_stats(avgtimeonsite, 'hll'), compute_stats(adrevenue, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -112,16 +112,16 @@ STAGE PLANS:
outputColumnNames: sourceip, adrevenue, avgtimeonsite
Statistics: Num rows: 65 Data size: 7060 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(sourceip, 16), compute_stats(avgtimeonsite, 16), compute_stats(adrevenue, 16)
+ aggregations: compute_stats(sourceip, 'hll'), compute_stats(avgtimeonsite, 'hll'), compute_stats(adrevenue, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
null sort order:
sort order:
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
tag: -1
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
auto parallelism: false
Path -> Alias:
#### A masked pattern was here ####
@@ -180,13 +180,13 @@ STAGE PLANS:
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
@@ -242,23 +242,23 @@ STAGE PLANS:
outputColumnNames: sourceip, desturl, visitdate, adrevenue, useragent, ccode, lcode, skeyword, avgtimeonsite
Statistics: Num rows: 9 Data size: 7060 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(sourceip, 16), compute_stats(desturl, 16), compute_stats(visitdate, 16), compute_stats(adrevenue, 16), compute_stats(useragent, 16), compute_stats(ccode, 16), compute_stats(lcode, 16), compute_stats(skeyword, 16), compute_stats(avgtimeonsite, 16)
+ aggregations: compute_stats(sourceip, 'hll'), compute_stats(desturl, 'hll'), compute_stats(visitdate, 'hll'), compute_stats(adrevenue, 'hll'), compute_stats(useragent, 'hll'), compute_stats(ccode, 'hll'), compute_stats(lcode, 'hll'), compute_stats(skeyword, 'hll'), compute_stats(avgtimeonsite, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
- Statistics: Num rows: 1 Data size: 4396 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 4288 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 4396 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct)
+ Statistics: Num rows: 1 Data size: 4288 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2), compute_stats(VALUE._col3), compute_stats(VALUE._col4), compute_stats(VALUE._col5), compute_stats(VALUE._col6), compute_stats(VALUE._col7), compute_stats(VALUE._col8)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
- Statistics: Num rows: 1 Data size: 4404 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 4320 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 4404 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 4320 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -287,7 +287,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@uservisits_web_text_none
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-destURL string 0 56 48.945454545454545 96 from deserializer
+destURL string 0 55 48.945454545454545 96 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"adrevenue\":\"true\",\"avgtimeonsite\":\"true\",\"ccode\":\"true\",\"desturl\":\"true\",\"lcode\":\"true\",\"skeyword\":\"true\",\"sourceip\":\"true\",\"useragent\":\"true\",\"visitdate\":\"true\"}}
PREHOOK: query: describe formatted UserVisits_web_text_none adRevenue
PREHOOK: type: DESCTABLE
@@ -297,7 +297,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@uservisits_web_text_none
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-adRevenue float 13.099044799804688 492.98870849609375 0 58 from deserializer
+adRevenue float 13.099044799804688 492.98870849609375 0 55 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"adrevenue\":\"true\",\"avgtimeonsite\":\"true\",\"ccode\":\"true\",\"desturl\":\"true\",\"lcode\":\"true\",\"skeyword\":\"true\",\"sourceip\":\"true\",\"useragent\":\"true\",\"visitdate\":\"true\"}}
PREHOOK: query: describe formatted UserVisits_web_text_none avgTimeOnSite
PREHOOK: type: DESCTABLE
@@ -307,7 +307,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@uservisits_web_text_none
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-avgTimeOnSite int 1 9 0 11 from deserializer
+avgTimeOnSite int 1 9 0 9 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"adrevenue\":\"true\",\"avgtimeonsite\":\"true\",\"ccode\":\"true\",\"desturl\":\"true\",\"lcode\":\"true\",\"skeyword\":\"true\",\"sourceip\":\"true\",\"useragent\":\"true\",\"visitdate\":\"true\"}}
PREHOOK: query: CREATE TABLE empty_tab(
a int,
@@ -351,23 +351,23 @@ STAGE PLANS:
outputColumnNames: a, b, c, d, e
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
Group By Operator
- aggregations: compute_stats(a, 16), compute_stats(b, 16), compute_stats(c, 16), compute_stats(d, 16), compute_stats(e, 16)
+ aggregations: compute_stats(a, 'hll'), compute_stats(b, 'hll'), compute_stats(c, 'hll'), compute_stats(d, 'hll'), compute_stats(e, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 2004 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1968 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 2004 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct)
+ Statistics: Num rows: 1 Data size: 1968 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2), compute_stats(VALUE._col3), compute_stats(VALUE._col4)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 2012 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 2000 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 2012 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 2000 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -416,7 +416,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@uservisits_web_text_none
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-destURL string 0 56 48.945454545454545 96 from deserializer
+destURL string 0 55 48.945454545454545 96 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"adrevenue\":\"true\",\"avgtimeonsite\":\"true\",\"ccode\":\"true\",\"desturl\":\"true\",\"lcode\":\"true\",\"skeyword\":\"true\",\"sourceip\":\"true\",\"useragent\":\"true\",\"visitdate\":\"true\"}}
PREHOOK: query: CREATE TABLE UserVisits_in_dummy_db (
sourceIP string,
@@ -482,23 +482,23 @@ STAGE PLANS:
outputColumnNames: sourceip, adrevenue, avgtimeonsite
Statistics: Num rows: 65 Data size: 7060 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(sourceip, 16), compute_stats(avgtimeonsite, 16), compute_stats(adrevenue, 16)
+ aggregations: compute_stats(sourceip, 'hll'), compute_stats(avgtimeonsite, 'hll'), compute_stats(adrevenue, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -534,16 +534,16 @@ STAGE PLANS:
outputColumnNames: sourceip, adrevenue, avgtimeonsite
Statistics: Num rows: 65 Data size: 7060 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(sourceip, 16), compute_stats(avgtimeonsite, 16), compute_stats(adrevenue, 16)
+ aggregations: compute_stats(sourceip, 'hll'), compute_stats(avgtimeonsite, 'hll'), compute_stats(adrevenue, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
null sort order:
sort order:
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
tag: -1
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
auto parallelism: false
Path -> Alias:
#### A masked pattern was here ####
@@ -602,13 +602,13 @@ STAGE PLANS:
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
- Statistics: Num rows: 1 Data size: 1452 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1440 Basic stats: COMPLETE Column stats: NONE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
@@ -664,23 +664,23 @@ STAGE PLANS:
outputColumnNames: sourceip, desturl, visitdate, adrevenue, useragent, ccode, lcode, skeyword, avgtimeonsite
Statistics: Num rows: 9 Data size: 7060 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(sourceip, 16), compute_stats(desturl, 16), compute_stats(visitdate, 16), compute_stats(adrevenue, 16), compute_stats(useragent, 16), compute_stats(ccode, 16), compute_stats(lcode, 16), compute_stats(skeyword, 16), compute_stats(avgtimeonsite, 16)
+ aggregations: compute_stats(sourceip, 'hll'), compute_stats(desturl, 'hll'), compute_stats(visitdate, 'hll'), compute_stats(adrevenue, 'hll'), compute_stats(useragent, 'hll'), compute_stats(ccode, 'hll'), compute_stats(lcode, 'hll'), compute_stats(skeyword, 'hll'), compute_stats(avgtimeonsite, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
- Statistics: Num rows: 1 Data size: 4396 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 4288 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 4396 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct)
+ Statistics: Num rows: 1 Data size: 4288 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2), compute_stats(VALUE._col3), compute_stats(VALUE._col4), compute_stats(VALUE._col5), compute_stats(VALUE._col6), compute_stats(VALUE._col7), compute_stats(VALUE._col8)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
- Statistics: Num rows: 1 Data size: 4404 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 4320 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 4404 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 4320 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -709,7 +709,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: dummydb@uservisits_in_dummy_db
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-destURL string 0 56 48.945454545454545 96 from deserializer
+destURL string 0 55 48.945454545454545 96 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"adrevenue\":\"true\",\"avgtimeonsite\":\"true\",\"ccode\":\"true\",\"desturl\":\"true\",\"lcode\":\"true\",\"skeyword\":\"true\",\"sourceip\":\"true\",\"useragent\":\"true\",\"visitdate\":\"true\"}}
PREHOOK: query: describe formatted dummydb.UserVisits_in_dummy_db adRevenue
PREHOOK: type: DESCTABLE
@@ -719,7 +719,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: dummydb@uservisits_in_dummy_db
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-adRevenue float 13.099044799804688 492.98870849609375 0 58 from deserializer
+adRevenue float 13.099044799804688 492.98870849609375 0 55 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"adrevenue\":\"true\",\"avgtimeonsite\":\"true\",\"ccode\":\"true\",\"desturl\":\"true\",\"lcode\":\"true\",\"skeyword\":\"true\",\"sourceip\":\"true\",\"useragent\":\"true\",\"visitdate\":\"true\"}}
PREHOOK: query: describe formatted dummydb.UserVisits_in_dummy_db avgTimeOnSite
PREHOOK: type: DESCTABLE
@@ -729,7 +729,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: dummydb@uservisits_in_dummy_db
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-avgTimeOnSite int 1 9 0 11 from deserializer
+avgTimeOnSite int 1 9 0 9 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"adrevenue\":\"true\",\"avgtimeonsite\":\"true\",\"ccode\":\"true\",\"desturl\":\"true\",\"lcode\":\"true\",\"skeyword\":\"true\",\"sourceip\":\"true\",\"useragent\":\"true\",\"visitdate\":\"true\"}}
PREHOOK: query: drop table dummydb.UserVisits_in_dummy_db
PREHOOK: type: DROPTABLE
diff --git a/ql/src/test/results/clientpositive/compute_stats_date.q.out b/ql/src/test/results/clientpositive/compute_stats_date.q.out
index c2472377a8..5cd2180108 100644
--- a/ql/src/test/results/clientpositive/compute_stats_date.q.out
+++ b/ql/src/test/results/clientpositive/compute_stats_date.q.out
@@ -35,15 +35,15 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_date
#### A masked pattern was here ####
20
-PREHOOK: query: select compute_stats(fl_date, 16) from tab_date
+PREHOOK: query: select compute_stats(fl_date, 'hll') from tab_date
PREHOOK: type: QUERY
PREHOOK: Input: default@tab_date
#### A masked pattern was here ####
-POSTHOOK: query: select compute_stats(fl_date, 16) from tab_date
+POSTHOOK: query: select compute_stats(fl_date, 'hll') from tab_date
POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_date
#### A masked pattern was here ####
-{"columntype":"Date","min":"2000-11-20","max":"2010-10-29","countnulls":0,"numdistinctvalues":18,"ndvbitvector":"{0, 1, 2, 3, 4, 5}{0, 1, 2, 3}{0}{0, 1, 2, 6}{0, 1, 2, 3}{0, 1, 2, 3}{0, 1, 2}{0, 1, 2, 3}{0, 1, 2, 3}{0, 2}{0, 1, 2, 3, 4}{0, 1, 2, 4, 5}{0, 1, 2, 3}{0, 1, 2, 3, 5}{0, 1, 2, 3, 4, 5}{0, 1, 2, 3, 4}"}
+{"columntype":"Date","min":"2000-11-20","max":"2010-10-29","countnulls":0,"numdistinctvalues":19,"ndvbitvector":"SExM4BMTw6qAFv+ogCGC/7ZdgMDTH73K3+4Bgq+jE766tgWAh/xZgIqTVIDhgVDA655SwfXHA4Dy\r\n/Ve//Z0LwMSIToCZ6QOAhZ8Gg8jOEb38rBw=\r\n"}
PREHOOK: query: explain
analyze table tab_date compute statistics for columns fl_date
PREHOOK: type: QUERY
@@ -66,14 +66,14 @@ STAGE PLANS:
outputColumnNames: fl_date
Statistics: Num rows: 13 Data size: 778 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(fl_date, 16)
+ aggregations: compute_stats(fl_date, 'hll')
mode: hash
outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 572 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 560 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 572 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct)
+ Statistics: Num rows: 1 Data size: 560 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0)
@@ -111,7 +111,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@tab_date
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-fl_date date 2000-11-20 2010-10-29 0 18 from deserializer
+fl_date date 2000-11-20 2010-10-29 0 19 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"fl_date\":\"true\"}}
PREHOOK: query: alter table tab_date update statistics for column fl_date set ('numDVs'='19', 'highValue'='2015-01-01', 'lowValue'='0')
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
diff --git a/ql/src/test/results/clientpositive/compute_stats_decimal.q.out b/ql/src/test/results/clientpositive/compute_stats_decimal.q.out
index e0584c50a8..fcfce78b82 100644
--- a/ql/src/test/results/clientpositive/compute_stats_decimal.q.out
+++ b/ql/src/test/results/clientpositive/compute_stats_decimal.q.out
@@ -23,11 +23,11 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_decimal
#### A masked pattern was here ####
19
-PREHOOK: query: select compute_stats(a, 18) from tab_decimal
+PREHOOK: query: select compute_stats(a, 'fm', 18) from tab_decimal
PREHOOK: type: QUERY
PREHOOK: Input: default@tab_decimal
#### A masked pattern was here ####
-POSTHOOK: query: select compute_stats(a, 18) from tab_decimal
+POSTHOOK: query: select compute_stats(a, 'fm', 18) from tab_decimal
POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_decimal
#### A masked pattern was here ####
diff --git a/ql/src/test/results/clientpositive/compute_stats_double.q.out b/ql/src/test/results/clientpositive/compute_stats_double.q.out
index 5b921735f0..e6a087dd98 100644
--- a/ql/src/test/results/clientpositive/compute_stats_double.q.out
+++ b/ql/src/test/results/clientpositive/compute_stats_double.q.out
@@ -23,11 +23,11 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_double
#### A masked pattern was here ####
16
-PREHOOK: query: select compute_stats(a, 16) from tab_double
+PREHOOK: query: select compute_stats(a, 'fm', 16) from tab_double
PREHOOK: type: QUERY
PREHOOK: Input: default@tab_double
#### A masked pattern was here ####
-POSTHOOK: query: select compute_stats(a, 16) from tab_double
+POSTHOOK: query: select compute_stats(a, 'fm', 16) from tab_double
POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_double
#### A masked pattern was here ####
diff --git a/ql/src/test/results/clientpositive/compute_stats_long.q.out b/ql/src/test/results/clientpositive/compute_stats_long.q.out
index 119d1731cc..fb985d8266 100644
--- a/ql/src/test/results/clientpositive/compute_stats_long.q.out
+++ b/ql/src/test/results/clientpositive/compute_stats_long.q.out
@@ -23,11 +23,11 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_int
#### A masked pattern was here ####
12
-PREHOOK: query: select compute_stats(a, 16) from tab_int
+PREHOOK: query: select compute_stats(a, 'fm', 16) from tab_int
PREHOOK: type: QUERY
PREHOOK: Input: default@tab_int
#### A masked pattern was here ####
-POSTHOOK: query: select compute_stats(a, 16) from tab_int
+POSTHOOK: query: select compute_stats(a, 'fm', 16) from tab_int
POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_int
#### A masked pattern was here ####
diff --git a/ql/src/test/results/clientpositive/compute_stats_string.q.out b/ql/src/test/results/clientpositive/compute_stats_string.q.out
index 8c40490bc0..a5d66eba31 100644
--- a/ql/src/test/results/clientpositive/compute_stats_string.q.out
+++ b/ql/src/test/results/clientpositive/compute_stats_string.q.out
@@ -23,11 +23,11 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_string
#### A masked pattern was here ####
10
-PREHOOK: query: select compute_stats(a, 16) from tab_string
+PREHOOK: query: select compute_stats(a, 'fm', 16) from tab_string
PREHOOK: type: QUERY
PREHOOK: Input: default@tab_string
#### A masked pattern was here ####
-POSTHOOK: query: select compute_stats(a, 16) from tab_string
+POSTHOOK: query: select compute_stats(a, 'fm', 16) from tab_string
POSTHOOK: type: QUERY
POSTHOOK: Input: default@tab_string
#### A masked pattern was here ####
diff --git a/ql/src/test/results/clientpositive/confirm_initial_tbl_stats.q.out b/ql/src/test/results/clientpositive/confirm_initial_tbl_stats.q.out
index faa14ba9c5..5593e422b6 100644
--- a/ql/src/test/results/clientpositive/confirm_initial_tbl_stats.q.out
+++ b/ql/src/test/results/clientpositive/confirm_initial_tbl_stats.q.out
@@ -16,7 +16,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 205 2.812 3 from deserializer
+key string 0 309 2.812 3 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
PREHOOK: query: describe extended src1
PREHOOK: type: DESCTABLE
@@ -36,7 +36,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value string 0 14 4.92 7 from deserializer
+value string 0 19 4.92 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
PREHOOK: query: describe extended src_json
PREHOOK: type: DESCTABLE
@@ -75,7 +75,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@src_sequencefile
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value string 0 214 6.812 7 from deserializer
+value string 0 309 6.812 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
PREHOOK: query: describe extended srcbucket
PREHOOK: type: DESCTABLE
@@ -95,7 +95,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@srcbucket
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value string 0 234 6.802 7 from deserializer
+value string 0 430 6.802 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
PREHOOK: query: describe extended srcbucket2
PREHOOK: type: DESCTABLE
@@ -115,7 +115,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@srcbucket2
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-value string 0 214 6.812 7 from deserializer
+value string 0 309 6.812 7 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
PREHOOK: query: describe extended srcpart
PREHOOK: type: DESCTABLE
@@ -143,7 +143,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@srcpart
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 205 2.812 3 from deserializer
+key string 0 309 2.812 3 from deserializer
PREHOOK: query: describe extended alltypesorc
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@alltypesorc
@@ -172,7 +172,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@alltypesorc
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-ctinyint tinyint -64 62 3115 94 from deserializer
+ctinyint tinyint -64 62 3115 127 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"cbigint\":\"true\",\"cboolean1\":\"true\",\"cboolean2\":\"true\",\"cdouble\":\"true\",\"cfloat\":\"true\",\"cint\":\"true\",\"csmallint\":\"true\",\"cstring1\":\"true\",\"cstring2\":\"true\",\"ctimestamp1\":\"true\",\"ctimestamp2\":\"true\",\"ctinyint\":\"true\"}}
PREHOOK: query: describe formatted alltypesorc cfloat
PREHOOK: type: DESCTABLE
@@ -182,7 +182,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@alltypesorc
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-cfloat float -64.0 79.5530014038086 3115 117 from deserializer
+cfloat float -64.0 79.5530014038086 3115 131 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"cbigint\":\"true\",\"cboolean1\":\"true\",\"cboolean2\":\"true\",\"cdouble\":\"true\",\"cfloat\":\"true\",\"cint\":\"true\",\"csmallint\":\"true\",\"cstring1\":\"true\",\"cstring2\":\"true\",\"ctimestamp1\":\"true\",\"ctimestamp2\":\"true\",\"ctinyint\":\"true\"}}
PREHOOK: query: describe formatted alltypesorc ctimestamp1
PREHOOK: type: DESCTABLE
@@ -192,7 +192,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@alltypesorc
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-ctimestamp1 timestamp -30 31 3115 31 from deserializer
+ctimestamp1 timestamp -30 31 3115 35 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"cbigint\":\"true\",\"cboolean1\":\"true\",\"cboolean2\":\"true\",\"cdouble\":\"true\",\"cfloat\":\"true\",\"cint\":\"true\",\"csmallint\":\"true\",\"cstring1\":\"true\",\"cstring2\":\"true\",\"ctimestamp1\":\"true\",\"ctimestamp2\":\"true\",\"ctinyint\":\"true\"}}
PREHOOK: query: describe formatted alltypesorc cboolean2
PREHOOK: type: DESCTABLE
diff --git a/ql/src/test/results/clientpositive/constant_prop_2.q.out b/ql/src/test/results/clientpositive/constant_prop_2.q.out
index 24be5188e2..93050417c6 100644
--- a/ql/src/test/results/clientpositive/constant_prop_2.q.out
+++ b/ql/src/test/results/clientpositive/constant_prop_2.q.out
@@ -43,7 +43,7 @@ STAGE PLANS:
outputColumnNames: key, value
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(key, 16), compute_stats(value, 16)
+ aggregations: compute_stats(key, 'hll'), compute_stats(value, 'hll')
keys: '2008-04-08' (type: string), '11' (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
@@ -53,7 +53,7 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: '2008-04-08' (type: string), '11' (type: string)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col2 (type: struct), _col3 (type: struct)
+ value expressions: _col2 (type: struct), _col3 (type: struct)
Reduce Operator Tree:
Group By Operator
aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
diff --git a/ql/src/test/results/clientpositive/correlated_join_keys.q.out b/ql/src/test/results/clientpositive/correlated_join_keys.q.out
index ec5d008728..b81a5611b1 100644
--- a/ql/src/test/results/clientpositive/correlated_join_keys.q.out
+++ b/ql/src/test/results/clientpositive/correlated_join_keys.q.out
@@ -207,7 +207,7 @@ STAGE PLANS:
keys:
0 _col0 (type: string), _col1 (type: string)
1 _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 16 Data size: 128 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 20 Data size: 160 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
diff --git a/ql/src/test/results/clientpositive/cross_join_merge.q.out b/ql/src/test/results/clientpositive/cross_join_merge.q.out
index f4956ded22..7241dfe45f 100644
--- a/ql/src/test/results/clientpositive/cross_join_merge.q.out
+++ b/ql/src/test/results/clientpositive/cross_join_merge.q.out
@@ -233,7 +233,7 @@ STAGE PLANS:
Processor Tree:
ListSink
-Warning: Shuffle Join JOIN[14][tables = [$hdt$_0, $hdt$_1, $hdt$_2]] in Stage 'Stage-2:MAPRED' is a cross product
+Warning: Shuffle Join JOIN[11][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product
PREHOOK: query: explain
select src1.key from src src1 join src src2 on 5 = src2.key join src src3 on src1.key=src3.key
PREHOOK: type: QUERY
@@ -250,7 +250,7 @@ STAGE PLANS:
Map Reduce
Map Operator Tree:
TableScan
- alias: src1
+ alias: src3
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
@@ -260,34 +260,29 @@ STAGE PLANS:
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
+ sort order:
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: string)
TableScan
- alias: src3
+ alias: src2
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: key is not null (type: boolean)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ predicate: (5.0 = UDFToDouble(key)) (type: boolean)
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: key (type: string)
- outputColumnNames: _col0
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ sort order:
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
Reduce Operator Tree:
Join Operator
condition map:
Inner Join 0 to 1
keys:
- 0 _col0 (type: string)
- 1 _col0 (type: string)
+ 0
+ 1
outputColumnNames: _col0
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 125000 Data size: 2781000 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
@@ -300,36 +295,45 @@ STAGE PLANS:
Map Operator Tree:
TableScan
Reduce Output Operator
- sort order:
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: string)
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 125000 Data size: 2781000 Basic stats: COMPLETE Column stats: NONE
TableScan
- alias: src2
+ alias: src1
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: (5.0 = UDFToDouble(key)) (type: boolean)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Select Operator
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ expressions: key (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
- sort order:
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Reduce Operator Tree:
Join Operator
condition map:
Inner Join 0 to 1
keys:
- 0
- 1
- outputColumnNames: _col0
- Statistics: Num rows: 137500 Data size: 3059050 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 137500 Data size: 3059050 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.SequenceFileInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col2
+ Statistics: Num rows: 137500 Data size: 3059100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col2 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 137500 Data size: 3059100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 137500 Data size: 3059100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
diff --git a/ql/src/test/results/clientpositive/decimal_stats.q.out b/ql/src/test/results/clientpositive/decimal_stats.q.out
index 5d86866e2a..f58a7cc8e1 100644
--- a/ql/src/test/results/clientpositive/decimal_stats.q.out
+++ b/ql/src/test/results/clientpositive/decimal_stats.q.out
@@ -48,7 +48,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@decimal_1
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-v decimal(10,0) 500 1 from deserializer
+v decimal(10,0) 500 0 from deserializer
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"t\":\"true\",\"u\":\"true\",\"v\":\"true\"}}
PREHOOK: query: explain select * from decimal_1 order by t limit 100
PREHOOK: type: QUERY
diff --git a/ql/src/test/results/clientpositive/describe_table.q.out b/ql/src/test/results/clientpositive/describe_table.q.out
index 7869494252..3ba9a7b942 100644
--- a/ql/src/test/results/clientpositive/describe_table.q.out
+++ b/ql/src/test/results/clientpositive/describe_table.q.out
@@ -212,7 +212,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@srcpart
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 205 2.812 3 from deserializer
+key string 0 309 2.812 3 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"key\":\"true\"}}
PREHOOK: query: describe formatted srcpart PARTITION(ds='2008-04-08', hr='12')
PREHOOK: type: DESCTABLE
@@ -304,7 +304,7 @@ POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@srcpart
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
-key string 0 205 2.812 3 from deserializer
+key string 0 309 2.812 3 from deserializer
COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"key\":\"true\"}}
PREHOOK: query: describe formatted `srcpart` PARTITION(ds='2008-04-08', hr='12')
PREHOOK: type: DESCTABLE
diff --git a/ql/src/test/results/clientpositive/display_colstats_tbllvl.q.out b/ql/src/test/results/clientpositive/display_colstats_tbllvl.q.out
index a4b18d7cec..73d4cd7660 100644
--- a/ql/src/test/results/clientpositive/display_colstats_tbllvl.q.out
+++ b/ql/src/test/results/clientpositive/display_colstats_tbllvl.q.out
@@ -76,23 +76,23 @@ STAGE PLANS:
outputColumnNames: sourceip, adrevenue, avgtimeonsite
Statistics: Num rows: 65 Data size: 7060 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: compute_stats(sourceip, 16), compute_stats(avgtimeonsite, 16), compute_stats(adrevenue, 16)
+ aggregations: compute_stats(sourceip, 'hll'), compute_stats(avgtimeonsite, 'hll'), compute_stats(adrevenue, 'hll')
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 1408 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
sort order:
- Statistics: Num rows: 1 Data size: 1444 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: struct