diff --git a/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java b/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java index e810ac5487..457d2f5059 100644 --- a/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java +++ b/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java @@ -19,6 +19,14 @@ package org.apache.hadoop.hive.common.ndv; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +import org.apache.commons.codec.binary.Base64; +import org.apache.hadoop.hive.common.ndv.fm.FMSketch; +import org.apache.hadoop.hive.common.ndv.fm.FMSketchUtils; import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; public class NumDistinctValueEstimatorFactory { @@ -26,11 +34,24 @@ private NumDistinctValueEstimatorFactory() { } + private static boolean isFMSketch(String s) throws IOException { + InputStream in = new ByteArrayInputStream(Base64.decodeBase64(s)); + byte[] magic = new byte[2]; + magic[0] = (byte) in.read(); + magic[1] = (byte) in.read(); + return Arrays.equals(magic, FMSketchUtils.MAGIC); + } + public static NumDistinctValueEstimator getNumDistinctValueEstimator(String s) { - if (s.startsWith("{")) { - return new FMSketch(s); - } else { - return HyperLogLog.builder().build().deserialize(s); + // Right now we assume only FM and HLL are available. + try { + if (isFMSketch(s)) { + return FMSketchUtils.deserializeFM(s); + } else { + return HyperLogLog.builder().build().deserialize(s); + } + } catch (IOException e) { + throw new RuntimeException(e); } } diff --git a/common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java b/common/src/java/org/apache/hadoop/hive/common/ndv/fm/FMSketch.java similarity index 81% rename from common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java rename to common/src/java/org/apache/hadoop/hive/common/ndv/fm/FMSketch.java index e20d29954a..7f48a8aaaa 100644 --- a/common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java +++ b/common/src/java/org/apache/hadoop/hive/common/ndv/fm/FMSketch.java @@ -15,22 +15,36 @@ * See the License for the specific language governing permissions and * limitations under the License. 
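A minimal round-trip sketch of the new factory dispatch (the wrapper class and variable names are illustrative, not part of the patch; the behaviour it relies on is exercised by the new TestFMSketchSerialization further down):

    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
    import org.apache.hadoop.hive.common.ndv.fm.FMSketch;

    class FactoryDispatchSketch {
      static NumDistinctValueEstimator roundTrip() {
        FMSketch fm = new FMSketch(16);    // 16 bit vectors, as in the new test
        String encoded = fm.serialize();   // Base64 stream starting with the 'F','M' magic bytes
        // The factory now sniffs those two decoded bytes; any other payload is handed to
        // HyperLogLog.builder().build().deserialize(encoded).
        return NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(encoded);
      }
    }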
*/ -package org.apache.hadoop.hive.common.ndv; - +package org.apache.hadoop.hive.common.ndv.fm; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Arrays; import java.util.Random; +import javolution.text.Text; import javolution.util.FastBitSet; +import javolution.util.FastCollection.Record; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.codec.binary.Base64; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.common.classification.InterfaceAudience; +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; +import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; +import org.apache.hadoop.hive.common.ndv.hll.HyperLogLogUtils; +import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.EncodingType; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.util.JavaDataModel; public class FMSketch implements NumDistinctValueEstimator{ static final Logger LOG = LoggerFactory.getLogger(FMSketch.class.getName()); + public static final byte[] MAGIC = new byte[] { 'F', 'M' }; /* We want a,b,x to come from a finite field of size 0 to k, where k is a prime number. * 2^p - 1 is prime for p = 31. Hence bitvectorSize has to be 31. Pick k to be 2^p -1. @@ -38,7 +52,7 @@ * independent. As a consequence, the hash values will not distribute uniformly from 0 to 2^p-1 * thus introducing errors in the estimates. */ - private static final int BIT_VECTOR_SIZE = 31; + public static final int BIT_VECTOR_SIZE = 31; // Refer to Flajolet-Martin'86 for the value of phi private static final double PHI = 0.77351; @@ -111,27 +125,6 @@ public FMSketch(int numBitVectors) { } } - public FMSketch(String s, int numBitVectors) { - this.numBitVectors = numBitVectors; - FastBitSet bitVectorDeser[] = genBitSet(s, numBitVectors); - bitVector = new FastBitSet[numBitVectors]; - for(int i=0; i = '0' && c <= '9') { - String t = new String(); - t = t + c; - c = s.charAt(i); - i = i + 1; - - while (c != ',' && c!= '}') { - t = t + c; - c = s.charAt(i); - i = i + 1; - } - - int bitIndex = Integer.parseInt(t); - assert(bitIndex >= 0); - assert(vectorIndex < numBitVectors); - b[vectorIndex].set(bitIndex); - if (c == '}') { - vectorIndex = vectorIndex + 1; - } - } + + @Override + public NumDistinctValueEstimator deserialize(String s) { + InputStream is = new ByteArrayInputStream(Base64.decodeBase64(s)); + try { + return FMSketchUtils.deserializeFM(is); + } catch (IOException e) { + throw new RuntimeException(e); } - return b; } - + private int generateHash(long v, int hashNum) { int mod = (1<4 byte header is encoded like below 2 bytes - FM magic string to + * identify serialized stream 2 bytes - numbitvectors because + * BIT_VECTOR_SIZE=31, 4 bytes are enough to hold positions of 0-31 + */ + public static void serializeFM(OutputStream out, FMSketch fm) throws IOException { + out.write(MAGIC); + + // max of numBitVectors = 1024, 2 bytes is enough. 
+ byte[] nbv = new byte[2]; + nbv[0] = (byte) fm.getnumBitVectors(); + nbv[1] = (byte) (fm.getnumBitVectors() >>> 8); + + out.write(nbv); + + // original toString takes too much space + // we compress a fastbitset to 4 bytes + for (int i = 0; i < fm.getnumBitVectors(); i++) { + writeBitVector(out, fm.getBitVector(i)); + } + } + + private static void writeBitVector(OutputStream out, FastBitSet bit) throws IOException { + int num = 0; + for (int pos = 0; pos < FMSketch.BIT_VECTOR_SIZE; pos++) { + if (bit.get(pos)) { + num |= 1 << pos; + } + } + byte[] i = new byte[4]; + for (int j = 0; j < 4; j++) { + i[j] = (byte) ((num >>> (8 * j)) & 0xff); + } + out.write(i); + } + + /* + * Deserializes from string to FastBitSet; Creates a NumDistinctValueEstimator + * object and returns it. + */ + public static FMSketch deserializeFM(String s) throws IOException { + InputStream is = new ByteArrayInputStream(Base64.decodeBase64(s)); + try { + return deserializeFM(is); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static FMSketch deserializeFM(InputStream in) throws IOException { + checkMagicString(in); + + byte[] nbv = new byte[2]; + nbv[0] = (byte) in.read(); + nbv[1] = (byte) in.read(); + + int numBitVectors = 0; + numBitVectors |= (nbv[0] & 0xff); + numBitVectors |= ((nbv[1] & 0xff) << 8); + + FMSketch sketch = new FMSketch(numBitVectors); + for (int n = 0; n < numBitVectors; n++) { + sketch.setBitVector(readBitVector(in), n); + } + return sketch; + } + + private static FastBitSet readBitVector(InputStream in) throws IOException { + FastBitSet fastBitSet = new FastBitSet(); + fastBitSet.clear(); + for (int i = 0; i < 4; i++) { + byte b = (byte) in.read(); + for (int j = 0; j < 8; j++) { + if ((b & (1 << j)) != 0) { + fastBitSet.set(j + 8 * i); + } + } + } + return fastBitSet; + } + + private static void checkMagicString(InputStream in) throws IOException { + byte[] magic = new byte[2]; + magic[0] = (byte) in.read(); + magic[1] = (byte) in.read(); + + if (!Arrays.equals(magic, MAGIC)) { + throw new IllegalArgumentException("The input stream is not a FMSketch stream."); + } + } +} diff --git a/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java b/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java index d1955468a6..a700e846fc 100644 --- a/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java +++ b/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java @@ -277,7 +277,9 @@ public void add(long hashcode) { } public long estimateNumDistinctValues() { - return count(); + // FMSketch treats the ndv of all nulls as 1 but hll treates the ndv as 0. + // In order to get rid of divide by 1 problem, we follow FMSketch + return count() > 0 ? count() : 1; } public long count() { diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/fm/TestFMSketchSerialization.java b/common/src/test/org/apache/hadoop/hive/common/ndv/fm/TestFMSketchSerialization.java new file mode 100644 index 0000000000..74fdf58d2d --- /dev/null +++ b/common/src/test/org/apache/hadoop/hive/common/ndv/fm/TestFMSketchSerialization.java @@ -0,0 +1,97 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
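A small illustration of the estimateNumDistinctValues() change above (a sketch of the observable effect only; the main class is illustrative):

    import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog;

    class EmptyHllNdv {
      public static void main(String[] args) {
        HyperLogLog hll = HyperLogLog.builder().build();
        // Nothing added: count() stays 0, but estimateNumDistinctValues() now reports 1,
        // matching FMSketch's convention for all-null columns and keeping NDV safe to divide by.
        System.out.println(hll.count());                     // 0
        System.out.println(hll.estimateNumDistinctValues()); // 1
      }
    }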
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.ndv.fm; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; + +import javolution.util.FastBitSet; + +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; +import org.junit.Test; + +public class TestFMSketchSerialization { + + private FastBitSet[] deserialize(String s, int numBitVectors) { + FastBitSet[] b = new FastBitSet[numBitVectors]; + for (int j = 0; j < numBitVectors; j++) { + b[j] = new FastBitSet(FMSketch.BIT_VECTOR_SIZE); + b[j].clear(); + } + + int vectorIndex = 0; + + /* + * Parse input string to obtain the indexes that are set in the bitvector. + * When a toString() is called on a FastBitSet object to serialize it, the + * serialization adds { and } to the beginning and end of the return String. + * Skip "{", "}", ",", " " in the input string. + */ + for (int i = 1; i < s.length() - 1;) { + char c = s.charAt(i); + i = i + 1; + + // Move on to the next bit vector + if (c == '}') { + vectorIndex = vectorIndex + 1; + } + + // Encountered a numeric value; Extract out the entire number + if (c >= '0' && c <= '9') { + String t = new String(); + t = t + c; + c = s.charAt(i); + i = i + 1; + + while (c != ',' && c != '}') { + t = t + c; + c = s.charAt(i); + i = i + 1; + } + + int bitIndex = Integer.parseInt(t); + assert (bitIndex >= 0); + assert (vectorIndex < numBitVectors); + b[vectorIndex].set(bitIndex); + if (c == '}') { + vectorIndex = vectorIndex + 1; + } + } + } + return b; + } + + @Test + public void testSerDe() throws IOException { + String bitVectors = "{0, 4, 5, 7}{0, 1}{0, 1, 2}{0, 1, 4}{0}{0, 2}{0, 3}{0, 2, 3, 4}{0, 1, 4}{0, 1}{0}{0, 1, 3, 8}{0, 2}{0, 2}{0, 9}{0, 1, 4}"; + FastBitSet[] fastBitSet = deserialize(bitVectors, 16); + FMSketch sketch = new FMSketch(16); + for (int i = 0; i < 16; i++) { + sketch.setBitVector(fastBitSet[i], i); + } + assertEquals(sketch.estimateNumDistinctValues(), 3); + String s = sketch.serialize(); + FMSketch newSketch = (FMSketch) NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(s); + sketch.equals(newSketch); + assertEquals(newSketch.estimateNumDistinctValues(), 3); + assertEquals(newSketch.serialize(), s); + } + +} \ No newline at end of file diff --git a/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql b/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql index a9a532906f..edd9decd90 100644 --- a/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql +++ b/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql @@ -94,7 +94,7 @@ CREATE TABLE "APP"."MASTER_KEYS" ("KEY_ID" INTEGER NOT NULL generated always as CREATE TABLE "APP"."DELEGATION_TOKENS" ( "TOKEN_IDENT" VARCHAR(767) NOT NULL, "TOKEN" VARCHAR(767)); -CREATE TABLE "APP"."PART_COL_STATS"("DB_NAME" VARCHAR(128) NOT NULL,"TABLE_NAME" VARCHAR(256) NOT NULL, "PARTITION_NAME" VARCHAR(767) NOT NULL, "COLUMN_NAME" VARCHAR(767) NOT NULL, "COLUMN_TYPE" VARCHAR(128) NOT NULL, "LONG_LOW_VALUE" BIGINT, "LONG_HIGH_VALUE" BIGINT, "DOUBLE_LOW_VALUE" DOUBLE, "DOUBLE_HIGH_VALUE" DOUBLE, "BIG_DECIMAL_LOW_VALUE" VARCHAR(4000), "BIG_DECIMAL_HIGH_VALUE" 
VARCHAR(4000),"NUM_DISTINCTS" BIGINT, "NUM_NULLS" BIGINT NOT NULL, "AVG_COL_LEN" DOUBLE, "MAX_COL_LEN" BIGINT, "NUM_TRUES" BIGINT, "NUM_FALSES" BIGINT, "LAST_ANALYZED" BIGINT, "CS_ID" BIGINT NOT NULL, "PART_ID" BIGINT NOT NULL); +CREATE TABLE "APP"."PART_COL_STATS"("DB_NAME" VARCHAR(128) NOT NULL,"TABLE_NAME" VARCHAR(256) NOT NULL, "PARTITION_NAME" VARCHAR(767) NOT NULL, "COLUMN_NAME" VARCHAR(767) NOT NULL, "COLUMN_TYPE" VARCHAR(128) NOT NULL, "LONG_LOW_VALUE" BIGINT, "LONG_HIGH_VALUE" BIGINT, "DOUBLE_LOW_VALUE" DOUBLE, "DOUBLE_HIGH_VALUE" DOUBLE, "BIG_DECIMAL_LOW_VALUE" VARCHAR(4000), "BIG_DECIMAL_HIGH_VALUE" VARCHAR(4000),"NUM_DISTINCTS" BIGINT, "BIT_VECTOR" VARCHAR(16400), "NUM_NULLS" BIGINT NOT NULL, "AVG_COL_LEN" DOUBLE, "MAX_COL_LEN" BIGINT, "NUM_TRUES" BIGINT, "NUM_FALSES" BIGINT, "LAST_ANALYZED" BIGINT, "CS_ID" BIGINT NOT NULL, "PART_ID" BIGINT NOT NULL); CREATE TABLE "APP"."VERSION" ("VER_ID" BIGINT NOT NULL, "SCHEMA_VERSION" VARCHAR(127) NOT NULL, "VERSION_COMMENT" VARCHAR(255)); diff --git a/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql b/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql index 97d881f263..89991efdde 100644 --- a/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql +++ b/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql @@ -690,6 +690,7 @@ CREATE TABLE IF NOT EXISTS `PART_COL_STATS` ( `BIG_DECIMAL_HIGH_VALUE` varchar(4000) CHARACTER SET latin1 COLLATE latin1_bin, `NUM_NULLS` bigint(20) NOT NULL, `NUM_DISTINCTS` bigint(20), + `BIT_VECTOR` varchar(16400) CHARACTER SET latin1 COLLATE latin1_bin, `AVG_COL_LEN` double(53,4), `MAX_COL_LEN` bigint(20), `NUM_TRUES` bigint(20), diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java b/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java deleted file mode 100644 index d0569fb8d8..0000000000 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.metastore; - -import java.util.HashMap; -import java.util.Map; - -public interface IExtrapolatePartStatus { - /** - * The sequence of colStatNames. - */ - static String[] colStatNames = new String[] { "LONG_LOW_VALUE", "LONG_HIGH_VALUE", - "DOUBLE_LOW_VALUE", "DOUBLE_HIGH_VALUE", "BIG_DECIMAL_LOW_VALUE", "BIG_DECIMAL_HIGH_VALUE", - "NUM_NULLS", "NUM_DISTINCTS", "AVG_COL_LEN", "MAX_COL_LEN", "NUM_TRUES", "NUM_FALSES", - "AVG_NDV_LONG", "AVG_NDV_DOUBLE", "AVG_NDV_DECIMAL", "SUM_NUM_DISTINCTS" }; - - /** - * The indexes for colstats. 
- */ - static HashMap indexMaps = new HashMap() { - { - put("bigint", new Integer[] { 0, 1, 6, 7, 12, 15 }); - put("int", new Integer[] { 0, 1, 6, 7, 12, 15 }); - put("smallint", new Integer[] { 0, 1, 6, 7, 12, 15 }); - put("tinyint", new Integer[] { 0, 1, 6, 7, 12, 15 }); - put("date", new Integer[] { 0, 1, 6, 7, 12, 15 }); - put("timestamp", new Integer[] { 0, 1, 6, 7, 12, 15 }); - put("long", new Integer[] { 0, 1, 6, 7, 12, 15 }); - put("double", new Integer[] { 2, 3, 6, 7, 13, 15 }); - put("float", new Integer[] { 2, 3, 6, 7, 13, 15 }); - put("varchar", new Integer[] { 8, 9, 6, 7, 15 }); - put("char", new Integer[] { 8, 9, 6, 7, 15 }); - put("string", new Integer[] { 8, 9, 6, 7, 15 }); - put("boolean", new Integer[] { 10, 11, 6, 15 }); - put("binary", new Integer[] { 8, 9, 6, 15 }); - put("decimal", new Integer[] { 4, 5, 6, 7, 14, 15 }); - put("default", new Integer[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15 }); - } - }; - - /** - * The sequence of colStatTypes. - */ - static enum ColStatType { - Long, Double, Decimal - } - - static ColStatType[] colStatTypes = new ColStatType[] { ColStatType.Long, ColStatType.Long, - ColStatType.Double, ColStatType.Double, ColStatType.Decimal, ColStatType.Decimal, - ColStatType.Long, ColStatType.Long, ColStatType.Double, ColStatType.Long, ColStatType.Long, - ColStatType.Long, ColStatType.Double, ColStatType.Double, ColStatType.Double, - ColStatType.Long }; - - /** - * The sequence of aggregation function on colStats. - */ - static enum AggrType { - Min, Max, Sum, Avg - } - - static AggrType[] aggrTypes = new AggrType[] { AggrType.Min, AggrType.Max, AggrType.Min, - AggrType.Max, AggrType.Min, AggrType.Max, AggrType.Sum, AggrType.Max, AggrType.Max, - AggrType.Max, AggrType.Sum, AggrType.Sum, AggrType.Avg, AggrType.Avg, AggrType.Avg, - AggrType.Sum }; - - public Object extrapolate(Object[] min, Object[] max, int colStatIndex, - Map indexMap); - -} diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/LinearExtrapolatePartStatus.java b/metastore/src/java/org/apache/hadoop/hive/metastore/LinearExtrapolatePartStatus.java deleted file mode 100644 index f4e5ef7045..0000000000 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/LinearExtrapolatePartStatus.java +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
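These two extrapolation helpers go away together with the SQL-side extrapolation in MetaStoreDirectSql further down: per-partition column statistics are now grouped by column and handed to the relocated ColumnStatsAggregator implementations, which take over NDV estimation (including the density-function and ndvTuner options) from here on.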
- */ - -package org.apache.hadoop.hive.metastore; - -import java.math.BigDecimal; -import java.util.Map; - -public class LinearExtrapolatePartStatus implements IExtrapolatePartStatus { - - @Override - public Object extrapolate(Object[] min, Object[] max, int colStatIndex, - Map indexMap) { - int rightBorderInd = indexMap.size() - 1; - int minInd = indexMap.get((String) min[1]); - int maxInd = indexMap.get((String) max[1]); - if (minInd == maxInd) { - return min[0]; - } - //note that recent metastore stores decimal in string. - double decimalmin= 0; - double decimalmax = 0; - if (colStatTypes[colStatIndex] == ColStatType.Decimal) { - BigDecimal bdmin = new BigDecimal(min[0].toString()); - decimalmin = bdmin.doubleValue(); - BigDecimal bdmax = new BigDecimal(max[0].toString()); - decimalmax = bdmax.doubleValue(); - } - if (aggrTypes[colStatIndex] == AggrType.Max) { - if (minInd < maxInd) { - // right border is the max - if (colStatTypes[colStatIndex] == ColStatType.Long) { - return (Long) ((Long) min[0] + (((Long) max[0] - (Long) min[0]) - * (rightBorderInd - minInd) / (maxInd - minInd))); - } else if (colStatTypes[colStatIndex] == ColStatType.Double) { - return (Double) ((Double) min[0] + (((Double) max[0] - (Double) min[0]) - * (rightBorderInd - minInd) / (maxInd - minInd))); - } else { - double ret = decimalmin + (decimalmax - decimalmin) - * (rightBorderInd - minInd) / (maxInd - minInd); - return String.valueOf(ret); - } - } else { - // left border is the max - if (colStatTypes[colStatIndex] == ColStatType.Long) { - return (Long) ((Long) min[0] + ((Long) max[0] - (Long) min[0]) - * minInd / (minInd - maxInd)); - } else if (colStatTypes[colStatIndex] == ColStatType.Double) { - return (Double) ((Double) min[0] + ((Double) max[0] - (Double) min[0]) - * minInd / (minInd - maxInd)); - } else { - double ret = decimalmin + (decimalmax - decimalmin) * minInd - / (minInd - maxInd); - return String.valueOf(ret); - } - } - } else { - if (minInd < maxInd) { - // left border is the min - if (colStatTypes[colStatIndex] == ColStatType.Long) { - Long ret = (Long) max[0] - ((Long) max[0] - (Long) min[0]) * maxInd - / (maxInd - minInd); - return ret; - } else if (colStatTypes[colStatIndex] == ColStatType.Double) { - Double ret = (Double) max[0] - ((Double) max[0] - (Double) min[0]) - * maxInd / (maxInd - minInd); - return ret; - } else { - double ret = decimalmax - (decimalmax - decimalmin) * maxInd - / (maxInd - minInd); - return String.valueOf(ret); - } - } else { - // right border is the min - if (colStatTypes[colStatIndex] == ColStatType.Long) { - Long ret = (Long) max[0] - ((Long) max[0] - (Long) min[0]) - * (rightBorderInd - maxInd) / (minInd - maxInd); - return ret; - } else if (colStatTypes[colStatIndex] == ColStatType.Double) { - Double ret = (Double) max[0] - ((Double) max[0] - (Double) min[0]) - * (rightBorderInd - maxInd) / (minInd - maxInd); - return ret; - } else { - double ret = decimalmax - (decimalmax - decimalmin) - * (rightBorderInd - maxInd) / (minInd - maxInd); - return String.valueOf(ret); - } - } - } - } -} diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java index a960b2d26b..07f3cffa93 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java @@ -33,6 +33,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import 
java.util.Map.Entry; import java.util.TreeMap; import javax.jdo.PersistenceManager; @@ -64,6 +65,8 @@ import org.apache.hadoop.hive.metastore.api.SkewedInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregatorFactory; import org.apache.hadoop.hive.metastore.model.MConstraint; import org.apache.hadoop.hive.metastore.model.MDatabase; import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; @@ -1447,291 +1450,45 @@ private long partsFoundForPartitions(final String dbName, final String tableName private List columnStatisticsObjForPartitionsBatch(String dbName, String tableName, List partNames, List colNames, boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { - // TODO: all the extrapolation logic should be moved out of this class, - // only mechanical data retrieval should remain here. - String commonPrefix = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", " - + "min(\"LONG_LOW_VALUE\"), max(\"LONG_HIGH_VALUE\"), min(\"DOUBLE_LOW_VALUE\"), max(\"DOUBLE_HIGH_VALUE\"), " - + "min(cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal)), max(cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)), " - + "sum(\"NUM_NULLS\"), max(\"NUM_DISTINCTS\"), " - + "max(\"AVG_COL_LEN\"), max(\"MAX_COL_LEN\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), " - // The following data is used to compute a partitioned table's NDV based - // on partitions' NDV when useDensityFunctionForNDVEstimation = true. Global NDVs cannot be - // accurately derived from partition NDVs, because the domain of column value two partitions - // can overlap. If there is no overlap then global NDV is just the sum - // of partition NDVs (UpperBound). But if there is some overlay then - // global NDV can be anywhere between sum of partition NDVs (no overlap) - // and same as one of the partition NDV (domain of column value in all other - // partitions is subset of the domain value in one of the partition) - // (LowerBound).But under uniform distribution, we can roughly estimate the global - // NDV by leveraging the min/max values. - // And, we also guarantee that the estimation makes sense by comparing it to the - // UpperBound (calculated by "sum(\"NUM_DISTINCTS\")") - // and LowerBound (calculated by "max(\"NUM_DISTINCTS\")") - + "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal))," - + "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\")," - + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")," - + "sum(\"NUM_DISTINCTS\")" + " from " + PART_COL_STATS + "" - + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "; - String queryText = null; - long start = 0; - long end = 0; - Query query = null; - boolean doTrace = LOG.isDebugEnabled(); - Object qResult = null; - ForwardQueryResult fqr = null; - // Check if the status of all the columns of all the partitions exists - // Extrapolation is not needed. - if (areAllPartsFound) { - queryText = commonPrefix + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")" - + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" - + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\""; - start = doTrace ? 
System.nanoTime() : 0; - query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, prepareParams(dbName, tableName, partNames, colNames), - queryText); - if (qResult == null) { - query.closeAll(); - return Collections.emptyList(); - } - end = doTrace ? System.nanoTime() : 0; - timingTrace(doTrace, queryText, start, end); - List list = ensureList(qResult); - List colStats = new ArrayList(list.size()); - for (Object[] row : list) { - colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner)); - Deadline.checkTimeout(); - } - query.closeAll(); - return colStats; - } else { - // Extrapolation is needed for some columns. - // In this case, at least a column status for a partition is missing. - // We need to extrapolate this partition based on the other partitions - List colStats = new ArrayList(colNames.size()); - queryText = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", count(\"PARTITION_NAME\") " - + " from " + PART_COL_STATS - + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? " - + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")" - + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" - + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\""; - start = doTrace ? System.nanoTime() : 0; - query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, prepareParams(dbName, tableName, partNames, colNames), - queryText); - end = doTrace ? System.nanoTime() : 0; - timingTrace(doTrace, queryText, start, end); - if (qResult == null) { - query.closeAll(); - return Collections.emptyList(); - } - List noExtraColumnNames = new ArrayList(); - Map extraColumnNameTypeParts = new HashMap(); - List list = ensureList(qResult); - for (Object[] row : list) { - String colName = (String) row[0]; - String colType = (String) row[1]; - // Extrapolation is not needed for this column if - // count(\"PARTITION_NAME\")==partNames.size() - // Or, extrapolation is not possible for this column if - // count(\"PARTITION_NAME\")<2 - Long count = extractSqlLong(row[2]); - if (count == partNames.size() || count < 2) { - noExtraColumnNames.add(colName); - } else { - extraColumnNameTypeParts.put(colName, new String[] { colType, String.valueOf(count) }); - } - Deadline.checkTimeout(); - } - query.closeAll(); - // Extrapolation is not needed for columns noExtraColumnNames - if (noExtraColumnNames.size() != 0) { - queryText = commonPrefix + " and \"COLUMN_NAME\" in (" - + makeParams(noExtraColumnNames.size()) + ")" + " and \"PARTITION_NAME\" in (" - + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\""; - start = doTrace ? System.nanoTime() : 0; - query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, - prepareParams(dbName, tableName, partNames, noExtraColumnNames), queryText); - if (qResult == null) { - query.closeAll(); - return Collections.emptyList(); - } - list = ensureList(qResult); - for (Object[] row : list) { - colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner)); - Deadline.checkTimeout(); - } - end = doTrace ? System.nanoTime() : 0; - timingTrace(doTrace, queryText, start, end); - query.closeAll(); - } - // Extrapolation is needed for extraColumnNames. 
- // give a sequence number for all the partitions - if (extraColumnNameTypeParts.size() != 0) { - Map indexMap = new HashMap(); - for (int index = 0; index < partNames.size(); index++) { - indexMap.put(partNames.get(index), index); - } - // get sum for all columns to reduce the number of queries - Map> sumMap = new HashMap>(); - queryText = "select \"COLUMN_NAME\", sum(\"NUM_NULLS\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), sum(\"NUM_DISTINCTS\")" - + " from " + PART_COL_STATS + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? " - + " and \"COLUMN_NAME\" in (" + makeParams(extraColumnNameTypeParts.size()) - + ") and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) - + ") group by \"COLUMN_NAME\""; - start = doTrace ? System.nanoTime() : 0; - query = pm.newQuery("javax.jdo.query.SQL", queryText); - List extraColumnNames = new ArrayList(); - extraColumnNames.addAll(extraColumnNameTypeParts.keySet()); - qResult = executeWithArray(query, - prepareParams(dbName, tableName, partNames, extraColumnNames), queryText); - if (qResult == null) { - query.closeAll(); - return Collections.emptyList(); - } - list = ensureList(qResult); - // see the indexes for colstats in IExtrapolatePartStatus - Integer[] sumIndex = new Integer[] { 6, 10, 11, 15 }; - for (Object[] row : list) { - Map indexToObject = new HashMap(); - for (int ind = 1; ind < row.length; ind++) { - indexToObject.put(sumIndex[ind - 1], row[ind]); - } - // row[0] is the column name - sumMap.put((String) row[0], indexToObject); - Deadline.checkTimeout(); - } - end = doTrace ? System.nanoTime() : 0; - timingTrace(doTrace, queryText, start, end); - query.closeAll(); - for (Map.Entry entry : extraColumnNameTypeParts.entrySet()) { - Object[] row = new Object[IExtrapolatePartStatus.colStatNames.length + 2]; - String colName = entry.getKey(); - String colType = entry.getValue()[0]; - Long sumVal = Long.parseLong(entry.getValue()[1]); - // fill in colname - row[0] = colName; - // fill in coltype - row[1] = colType; - // use linear extrapolation. more complicated one can be added in the - // future. - IExtrapolatePartStatus extrapolateMethod = new LinearExtrapolatePartStatus(); - // fill in colstatus - Integer[] index = null; - boolean decimal = false; - if (colType.toLowerCase().startsWith("decimal")) { - index = IExtrapolatePartStatus.indexMaps.get("decimal"); - decimal = true; - } else { - index = IExtrapolatePartStatus.indexMaps.get(colType.toLowerCase()); - } - // if the colType is not the known type, long, double, etc, then get - // all index. - if (index == null) { - index = IExtrapolatePartStatus.indexMaps.get("default"); - } - for (int colStatIndex : index) { - String colStatName = IExtrapolatePartStatus.colStatNames[colStatIndex]; - // if the aggregation type is sum, we do a scale-up - if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Sum) { - Object o = sumMap.get(colName).get(colStatIndex); - if (o == null) { - row[2 + colStatIndex] = null; - } else { - Long val = extractSqlLong(o); - row[2 + colStatIndex] = (Long) (val / sumVal * (partNames.size())); - } - } else if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Min - || IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Max) { - // if the aggregation type is min/max, we extrapolate from the - // left/right borders - if (!decimal) { - queryText = "select \"" + colStatName - + "\",\"PARTITION_NAME\" from " + PART_COL_STATS - + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" 
+ " and \"COLUMN_NAME\" = ?" - + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" - + " order by \"" + colStatName + "\""; - } else { - queryText = "select \"" + colStatName - + "\",\"PARTITION_NAME\" from " + PART_COL_STATS - + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" + " and \"COLUMN_NAME\" = ?" - + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" - + " order by cast(\"" + colStatName + "\" as decimal)"; - } - start = doTrace ? System.nanoTime() : 0; - query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, - prepareParams(dbName, tableName, partNames, Arrays.asList(colName)), queryText); - if (qResult == null) { - query.closeAll(); - return Collections.emptyList(); - } - fqr = (ForwardQueryResult) qResult; - Object[] min = (Object[]) (fqr.get(0)); - Object[] max = (Object[]) (fqr.get(fqr.size() - 1)); - end = doTrace ? System.nanoTime() : 0; - timingTrace(doTrace, queryText, start, end); - query.closeAll(); - if (min[0] == null || max[0] == null) { - row[2 + colStatIndex] = null; - } else { - row[2 + colStatIndex] = extrapolateMethod.extrapolate(min, max, colStatIndex, - indexMap); - } - } else { - // if the aggregation type is avg, we use the average on the existing ones. - queryText = "select " - + "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal))," - + "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\")," - + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")" - + " from " + PART_COL_STATS + "" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" - + " and \"COLUMN_NAME\" = ?" + " and \"PARTITION_NAME\" in (" - + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\""; - start = doTrace ? System.nanoTime() : 0; - query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, - prepareParams(dbName, tableName, partNames, Arrays.asList(colName)), queryText); - if (qResult == null) { - query.closeAll(); - return Collections.emptyList(); - } - fqr = (ForwardQueryResult) qResult; - Object[] avg = (Object[]) (fqr.get(0)); - // colStatIndex=12,13,14 respond to "AVG_LONG", "AVG_DOUBLE", - // "AVG_DECIMAL" - row[2 + colStatIndex] = avg[colStatIndex - 12]; - end = doTrace ? System.nanoTime() : 0; - timingTrace(doTrace, queryText, start, end); - query.closeAll(); - } - } - colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner)); - Deadline.checkTimeout(); + List colStats = new ArrayList<>(); + // 1. get all the stats for colNames in partNames; + List list = getPartitionStats(dbName, tableName, partNames, colNames); + // 2. group by the stats by colNames + // map the colName to List + Map> map = new HashMap<>(); + for (ColumnStatistics css : list) { + List objs = css.getStatsObj(); + for (ColumnStatisticsObj obj : objs) { + List singleObj = new ArrayList<>(); + singleObj.add(obj); + ColumnStatistics singleCS = new ColumnStatistics(css.getStatsDesc(), singleObj); + if (!map.containsKey(obj.getColName())) { + map.put(obj.getColName(), new ArrayList()); } + map.get(obj.getColName()).add(singleCS); } - return colStats; } + // 3. 
aggr stats for each colName + // TODO: thread pool can be used to speed up the process + for (Entry> entry : map.entrySet()) { + List css = entry.getValue(); + ColumnStatsAggregator aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css + .iterator().next().getStatsObj().iterator().next().getStatsData().getSetField(), + useDensityFunctionForNDVEstimation, ndvTuner); + ColumnStatisticsObj statsObj = aggregator.aggregate(entry.getKey(), partNames, css); + colStats.add(statsObj); + } + return colStats; } private ColumnStatisticsObj prepareCSObj (Object[] row, int i) throws MetaException { ColumnStatisticsData data = new ColumnStatisticsData(); ColumnStatisticsObj cso = new ColumnStatisticsObj((String)row[i++], (String)row[i++], data); Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++], - declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++], + declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++], bitVector = row[i++], avglen = row[i++], maxlen = row[i++], trues = row[i++], falses = row[i++]; StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data, - llow, lhigh, dlow, dhigh, declow, dechigh, nulls, dist, avglen, maxlen, trues, falses); - return cso; - } - - private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i, - boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { - ColumnStatisticsData data = new ColumnStatisticsData(); - ColumnStatisticsObj cso = new ColumnStatisticsObj((String) row[i++], (String) row[i++], data); - Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++], declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++], avglen = row[i++], maxlen = row[i++], trues = row[i++], falses = row[i++], avgLong = row[i++], avgDouble = row[i++], avgDecimal = row[i++], sumDist = row[i++]; - StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data, llow, lhigh, dlow, dhigh, - declow, dechigh, nulls, dist, avglen, maxlen, trues, falses, avgLong, avgDouble, - avgDecimal, sumDist, useDensityFunctionForNDVEstimation, ndvTuner); + llow, lhigh, dlow, dhigh, declow, dechigh, nulls, dist, bitVector, avglen, maxlen, trues, falses); return cso; } @@ -1815,7 +1572,7 @@ private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i, private static final String STATS_COLLIST = "\"COLUMN_NAME\", \"COLUMN_TYPE\", \"LONG_LOW_VALUE\", \"LONG_HIGH_VALUE\", " + "\"DOUBLE_LOW_VALUE\", \"DOUBLE_HIGH_VALUE\", \"BIG_DECIMAL_LOW_VALUE\", " - + "\"BIG_DECIMAL_HIGH_VALUE\", \"NUM_NULLS\", \"NUM_DISTINCTS\", \"AVG_COL_LEN\", " + + "\"BIG_DECIMAL_HIGH_VALUE\", \"NUM_NULLS\", \"NUM_DISTINCTS\", \"BIT_VECTOR\", \"AVG_COL_LEN\", " + "\"MAX_COL_LEN\", \"NUM_TRUES\", \"NUM_FALSES\", \"LAST_ANALYZED\" "; private ColumnStatistics makeColumnStats( @@ -1826,7 +1583,7 @@ private ColumnStatistics makeColumnStats( for (Object[] row : list) { // LastAnalyzed is stored per column but thrift has it per several; // get the lowest for now as nobody actually uses this field. 
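With "BIT_VECTOR" spliced into STATS_COLLIST right after "NUM_DISTINCTS", "LAST_ANALYZED" becomes the sixteenth selected column, which is why the offset used to read it below moves from 14 to 15.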
- Object laObj = row[offset + 14]; + Object laObj = row[offset + 15]; if (laObj != null && (!csd.isSetLastAnalyzed() || csd.getLastAnalyzed() > extractSqlLong(laObj))) { csd.setLastAnalyzed(extractSqlLong(laObj)); } diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index b52c94c9fb..5870054008 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -71,8 +71,8 @@ import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; -import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMerger; -import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMergerFactory; +import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMerger; +import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMergerFactory; import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.Deserializer; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java index 2dc2804343..d2aba95282 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java @@ -76,6 +76,7 @@ public static MTableColumnStatistics convertToMTableColumnStatistics(MTable tabl mColStats.setLongStats( longStats.isSetNumNulls() ? longStats.getNumNulls() : null, longStats.isSetNumDVs() ? longStats.getNumDVs() : null, + longStats.isSetBitVectors() ? longStats.getBitVectors() : null, longStats.isSetLowValue() ? longStats.getLowValue() : null, longStats.isSetHighValue() ? longStats.getHighValue() : null); } else if (statsObj.getStatsData().isSetDoubleStats()) { @@ -83,6 +84,7 @@ public static MTableColumnStatistics convertToMTableColumnStatistics(MTable tabl mColStats.setDoubleStats( doubleStats.isSetNumNulls() ? doubleStats.getNumNulls() : null, doubleStats.isSetNumDVs() ? doubleStats.getNumDVs() : null, + doubleStats.isSetBitVectors() ? doubleStats.getBitVectors() : null, doubleStats.isSetLowValue() ? doubleStats.getLowValue() : null, doubleStats.isSetHighValue() ? doubleStats.getHighValue() : null); } else if (statsObj.getStatsData().isSetDecimalStats()) { @@ -92,12 +94,14 @@ public static MTableColumnStatistics convertToMTableColumnStatistics(MTable tabl mColStats.setDecimalStats( decimalStats.isSetNumNulls() ? decimalStats.getNumNulls() : null, decimalStats.isSetNumDVs() ? decimalStats.getNumDVs() : null, + decimalStats.isSetBitVectors() ? decimalStats.getBitVectors() : null, low, high); } else if (statsObj.getStatsData().isSetStringStats()) { StringColumnStatsData stringStats = statsObj.getStatsData().getStringStats(); mColStats.setStringStats( stringStats.isSetNumNulls() ? stringStats.getNumNulls() : null, stringStats.isSetNumDVs() ? stringStats.getNumDVs() : null, + stringStats.isSetBitVectors() ? stringStats.getBitVectors() : null, stringStats.isSetMaxColLen() ? stringStats.getMaxColLen() : null, stringStats.isSetAvgColLen() ? 
stringStats.getAvgColLen() : null); } else if (statsObj.getStatsData().isSetBinaryStats()) { @@ -111,6 +115,7 @@ public static MTableColumnStatistics convertToMTableColumnStatistics(MTable tabl mColStats.setDateStats( dateStats.isSetNumNulls() ? dateStats.getNumNulls() : null, dateStats.isSetNumDVs() ? dateStats.getNumDVs() : null, + dateStats.isSetBitVectors() ? dateStats.getBitVectors() : null, dateStats.isSetLowValue() ? dateStats.getLowValue().getDaysSinceEpoch() : null, dateStats.isSetHighValue() ? dateStats.getHighValue().getDaysSinceEpoch() : null); } @@ -146,6 +151,9 @@ public static void setFieldsIntoOldStats( if (mStatsObj.getNumDVs() != null) { oldStatsObj.setNumDVs(mStatsObj.getNumDVs()); } + if (mStatsObj.getBitVector() != null) { + oldStatsObj.setBitVector(mStatsObj.getBitVector()); + } if (mStatsObj.getNumFalses() != null) { oldStatsObj.setNumFalses(mStatsObj.getNumFalses()); } @@ -188,6 +196,9 @@ public static void setFieldsIntoOldStats( if (mStatsObj.getNumDVs() != null) { oldStatsObj.setNumDVs(mStatsObj.getNumDVs()); } + if (mStatsObj.getBitVector() != null) { + oldStatsObj.setBitVector(mStatsObj.getBitVector()); + } if (mStatsObj.getNumFalses() != null) { oldStatsObj.setNumFalses(mStatsObj.getNumFalses()); } @@ -220,6 +231,7 @@ public static ColumnStatisticsObj getTableColumnStatisticsObj( stringStats.setAvgColLen(mStatsObj.getAvgColLen()); stringStats.setMaxColLen(mStatsObj.getMaxColLen()); stringStats.setNumDVs(mStatsObj.getNumDVs()); + stringStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setStringStats(stringStats); } else if (colType.equals("binary")) { BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); @@ -241,6 +253,7 @@ public static ColumnStatisticsObj getTableColumnStatisticsObj( longStats.setLowValue(longLowValue); } longStats.setNumDVs(mStatsObj.getNumDVs()); + longStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setLongStats(longStats); } else if (colType.equals("double") || colType.equals("float")) { DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); @@ -254,6 +267,7 @@ public static ColumnStatisticsObj getTableColumnStatisticsObj( doubleStats.setLowValue(doubleLowValue); } doubleStats.setNumDVs(mStatsObj.getNumDVs()); + doubleStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setDoubleStats(doubleStats); } else if (colType.startsWith("decimal")) { DecimalColumnStatsData decimalStats = new DecimalColumnStatsData(); @@ -267,6 +281,7 @@ public static ColumnStatisticsObj getTableColumnStatisticsObj( decimalStats.setLowValue(createThriftDecimal(decimalLowValue)); } decimalStats.setNumDVs(mStatsObj.getNumDVs()); + decimalStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setDecimalStats(decimalStats); } else if (colType.equals("date")) { DateColumnStatsData dateStats = new DateColumnStatsData(); @@ -280,6 +295,7 @@ public static ColumnStatisticsObj getTableColumnStatisticsObj( dateStats.setLowValue(new Date(lowValue)); } dateStats.setNumDVs(mStatsObj.getNumDVs()); + dateStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setDateStats(dateStats); } statsObj.setStatsData(colStatsData); @@ -323,6 +339,7 @@ public static MPartitionColumnStatistics convertToMPartitionColumnStatistics( mColStats.setLongStats( longStats.isSetNumNulls() ? longStats.getNumNulls() : null, longStats.isSetNumDVs() ? longStats.getNumDVs() : null, + longStats.isSetBitVectors() ? longStats.getBitVectors() : null, longStats.isSetLowValue() ? longStats.getLowValue() : null, longStats.isSetHighValue() ? 
longStats.getHighValue() : null); } else if (statsObj.getStatsData().isSetDoubleStats()) { @@ -330,6 +347,7 @@ public static MPartitionColumnStatistics convertToMPartitionColumnStatistics( mColStats.setDoubleStats( doubleStats.isSetNumNulls() ? doubleStats.getNumNulls() : null, doubleStats.isSetNumDVs() ? doubleStats.getNumDVs() : null, + doubleStats.isSetBitVectors() ? doubleStats.getBitVectors() : null, doubleStats.isSetLowValue() ? doubleStats.getLowValue() : null, doubleStats.isSetHighValue() ? doubleStats.getHighValue() : null); } else if (statsObj.getStatsData().isSetDecimalStats()) { @@ -339,12 +357,14 @@ public static MPartitionColumnStatistics convertToMPartitionColumnStatistics( mColStats.setDecimalStats( decimalStats.isSetNumNulls() ? decimalStats.getNumNulls() : null, decimalStats.isSetNumDVs() ? decimalStats.getNumDVs() : null, + decimalStats.isSetBitVectors() ? decimalStats.getBitVectors() : null, low, high); } else if (statsObj.getStatsData().isSetStringStats()) { StringColumnStatsData stringStats = statsObj.getStatsData().getStringStats(); mColStats.setStringStats( stringStats.isSetNumNulls() ? stringStats.getNumNulls() : null, stringStats.isSetNumDVs() ? stringStats.getNumDVs() : null, + stringStats.isSetBitVectors() ? stringStats.getBitVectors() : null, stringStats.isSetMaxColLen() ? stringStats.getMaxColLen() : null, stringStats.isSetAvgColLen() ? stringStats.getAvgColLen() : null); } else if (statsObj.getStatsData().isSetBinaryStats()) { @@ -358,6 +378,7 @@ public static MPartitionColumnStatistics convertToMPartitionColumnStatistics( mColStats.setDateStats( dateStats.isSetNumNulls() ? dateStats.getNumNulls() : null, dateStats.isSetNumDVs() ? dateStats.getNumDVs() : null, + dateStats.isSetBitVectors() ? dateStats.getBitVectors() : null, dateStats.isSetLowValue() ? dateStats.getLowValue().getDaysSinceEpoch() : null, dateStats.isSetHighValue() ? 
dateStats.getHighValue().getDaysSinceEpoch() : null); } @@ -385,6 +406,7 @@ public static ColumnStatisticsObj getPartitionColumnStatisticsObj( stringStats.setAvgColLen(mStatsObj.getAvgColLen()); stringStats.setMaxColLen(mStatsObj.getMaxColLen()); stringStats.setNumDVs(mStatsObj.getNumDVs()); + stringStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setStringStats(stringStats); } else if (colType.equals("binary")) { BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); @@ -404,6 +426,7 @@ public static ColumnStatisticsObj getPartitionColumnStatisticsObj( longStats.setLowValue(mStatsObj.getLongLowValue()); } longStats.setNumDVs(mStatsObj.getNumDVs()); + longStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setLongStats(longStats); } else if (colType.equals("double") || colType.equals("float")) { DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); @@ -415,6 +438,7 @@ public static ColumnStatisticsObj getPartitionColumnStatisticsObj( doubleStats.setLowValue(mStatsObj.getDoubleLowValue()); } doubleStats.setNumDVs(mStatsObj.getNumDVs()); + doubleStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setDoubleStats(doubleStats); } else if (colType.startsWith("decimal")) { DecimalColumnStatsData decimalStats = new DecimalColumnStatsData(); @@ -426,6 +450,7 @@ public static ColumnStatisticsObj getPartitionColumnStatisticsObj( decimalStats.setLowValue(createThriftDecimal(mStatsObj.getDecimalLowValue())); } decimalStats.setNumDVs(mStatsObj.getNumDVs()); + decimalStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setDecimalStats(decimalStats); } else if (colType.equals("date")) { DateColumnStatsData dateStats = new DateColumnStatsData(); @@ -433,6 +458,7 @@ public static ColumnStatisticsObj getPartitionColumnStatisticsObj( dateStats.setHighValue(new Date(mStatsObj.getLongHighValue())); dateStats.setLowValue(new Date(mStatsObj.getLongLowValue())); dateStats.setNumDVs(mStatsObj.getNumDVs()); + dateStats.setBitVectors(mStatsObj.getBitVector()); colStatsData.setDateStats(dateStats); } statsObj.setStatsData(colStatsData); @@ -453,7 +479,7 @@ public static ColumnStatisticsDesc getPartitionColumnStatisticsDesc( // SQL public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data, Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh, - Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses) throws MetaException { + Object nulls, Object dist, Object bitVector, Object avglen, Object maxlen, Object trues, Object falses) throws MetaException { colType = colType.toLowerCase(); if (colType.equals("boolean")) { BooleanColumnStatsData boolStats = new BooleanColumnStatsData(); @@ -468,6 +494,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData stringStats.setAvgColLen(MetaStoreDirectSql.extractSqlDouble(avglen)); stringStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen)); stringStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist)); + stringStats.setBitVectors((String) bitVector); data.setStringStats(stringStats); } else if (colType.equals("binary")) { BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); @@ -487,6 +514,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData longStats.setLowValue(MetaStoreDirectSql.extractSqlLong(llow)); } longStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist)); + longStats.setBitVectors((String) bitVector); data.setLongStats(longStats); } else if 
(colType.equals("double") || colType.equals("float")) { DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); @@ -498,6 +526,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData doubleStats.setLowValue(MetaStoreDirectSql.extractSqlDouble(dlow)); } doubleStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist)); + doubleStats.setBitVectors((String) bitVector); data.setDoubleStats(doubleStats); } else if (colType.startsWith("decimal")) { DecimalColumnStatsData decimalStats = new DecimalColumnStatsData(); @@ -509,6 +538,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData decimalStats.setLowValue(createThriftDecimal((String)declow)); } decimalStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist)); + decimalStats.setBitVectors((String) bitVector); data.setDecimalStats(decimalStats); } else if (colType.equals("date")) { DateColumnStatsData dateStats = new DateColumnStatsData(); @@ -520,178 +550,11 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData dateStats.setLowValue(new Date(MetaStoreDirectSql.extractSqlLong(llow))); } dateStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist)); + dateStats.setBitVectors((String) bitVector); data.setDateStats(dateStats); } } - public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data, - Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh, - Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses, - Object avgLong, Object avgDouble, Object avgDecimal, Object sumDist, - boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { - colType = colType.toLowerCase(); - if (colType.equals("boolean")) { - BooleanColumnStatsData boolStats = new BooleanColumnStatsData(); - boolStats.setNumFalses(MetaStoreDirectSql.extractSqlLong(falses)); - boolStats.setNumTrues(MetaStoreDirectSql.extractSqlLong(trues)); - boolStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); - data.setBooleanStats(boolStats); - } else if (colType.equals("string") || colType.startsWith("varchar") - || colType.startsWith("char")) { - StringColumnStatsData stringStats = new StringColumnStatsData(); - stringStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); - stringStats.setAvgColLen(MetaStoreDirectSql.extractSqlDouble(avglen)); - stringStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen)); - stringStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist)); - data.setStringStats(stringStats); - } else if (colType.equals("binary")) { - BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); - binaryStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); - binaryStats.setAvgColLen(MetaStoreDirectSql.extractSqlDouble(avglen)); - binaryStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen)); - data.setBinaryStats(binaryStats); - } else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") - || colType.equals("tinyint") || colType.equals("timestamp")) { - LongColumnStatsData longStats = new LongColumnStatsData(); - longStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); - if (lhigh != null) { - longStats.setHighValue(MetaStoreDirectSql.extractSqlLong(lhigh)); - } - if (llow != null) { - longStats.setLowValue(MetaStoreDirectSql.extractSqlLong(llow)); - } - long lowerBound = MetaStoreDirectSql.extractSqlLong(dist); - long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist); - long rangeBound = 
Long.MAX_VALUE; - if (lhigh != null && llow != null) { - rangeBound = MetaStoreDirectSql.extractSqlLong(lhigh) - - MetaStoreDirectSql.extractSqlLong(llow) + 1; - } - long estimation; - if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null - && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) { - // We have estimation, lowerbound and higherbound. We use estimation if - // it is between lowerbound and higherbound. - estimation = MetaStoreDirectSql - .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql - .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong)); - if (estimation < lowerBound) { - estimation = lowerBound; - } else if (estimation > higherBound) { - estimation = higherBound; - } - } else { - estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); - } - estimation = Math.min(estimation, rangeBound); - longStats.setNumDVs(estimation); - data.setLongStats(longStats); - } else if (colType.equals("date")) { - DateColumnStatsData dateStats = new DateColumnStatsData(); - dateStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); - if (lhigh != null) { - dateStats.setHighValue(new Date(MetaStoreDirectSql.extractSqlLong(lhigh))); - } - if (llow != null) { - dateStats.setLowValue(new Date(MetaStoreDirectSql.extractSqlLong(llow))); - } - long lowerBound = MetaStoreDirectSql.extractSqlLong(dist); - long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist); - long rangeBound = Long.MAX_VALUE; - if (lhigh != null && llow != null) { - rangeBound = MetaStoreDirectSql.extractSqlLong(lhigh) - - MetaStoreDirectSql.extractSqlLong(llow) + 1; - } - long estimation; - if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null - && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) { - // We have estimation, lowerbound and higherbound. We use estimation if - // it is between lowerbound and higherbound. 
- estimation = MetaStoreDirectSql - .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql - .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong)); - if (estimation < lowerBound) { - estimation = lowerBound; - } else if (estimation > higherBound) { - estimation = higherBound; - } - } else { - estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); - } - estimation = Math.min(estimation, rangeBound); - dateStats.setNumDVs(estimation); - data.setDateStats(dateStats); - } else if (colType.equals("double") || colType.equals("float")) { - DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); - doubleStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); - if (dhigh != null) { - doubleStats.setHighValue(MetaStoreDirectSql.extractSqlDouble(dhigh)); - } - if (dlow != null) { - doubleStats.setLowValue(MetaStoreDirectSql.extractSqlDouble(dlow)); - } - long lowerBound = MetaStoreDirectSql.extractSqlLong(dist); - long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist); - if (useDensityFunctionForNDVEstimation && dhigh != null && dlow != null && avgDouble != null - && MetaStoreDirectSql.extractSqlDouble(avgDouble) != 0.0) { - long estimation = MetaStoreDirectSql - .extractSqlLong((MetaStoreDirectSql.extractSqlLong(dhigh) - MetaStoreDirectSql - .extractSqlLong(dlow)) / MetaStoreDirectSql.extractSqlDouble(avgDouble)); - if (estimation < lowerBound) { - doubleStats.setNumDVs(lowerBound); - } else if (estimation > higherBound) { - doubleStats.setNumDVs(higherBound); - } else { - doubleStats.setNumDVs(estimation); - } - } else { - doubleStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner)); - } - data.setDoubleStats(doubleStats); - } else if (colType.startsWith("decimal")) { - DecimalColumnStatsData decimalStats = new DecimalColumnStatsData(); - decimalStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); - Decimal low = null; - Decimal high = null; - BigDecimal blow = null; - BigDecimal bhigh = null; - if (dechigh instanceof BigDecimal) { - bhigh = (BigDecimal) dechigh; - high = new Decimal(ByteBuffer.wrap(bhigh.unscaledValue().toByteArray()), - (short) bhigh.scale()); - } else if (dechigh instanceof String) { - bhigh = new BigDecimal((String) dechigh); - high = createThriftDecimal((String) dechigh); - } - decimalStats.setHighValue(high); - if (declow instanceof BigDecimal) { - blow = (BigDecimal) declow; - low = new Decimal(ByteBuffer.wrap(blow.unscaledValue().toByteArray()), (short) blow.scale()); - } else if (dechigh instanceof String) { - blow = new BigDecimal((String) declow); - low = createThriftDecimal((String) declow); - } - decimalStats.setLowValue(low); - long lowerBound = MetaStoreDirectSql.extractSqlLong(dist); - long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist); - if (useDensityFunctionForNDVEstimation && dechigh != null && declow != null && avgDecimal != null - && MetaStoreDirectSql.extractSqlDouble(avgDecimal) != 0.0) { - long estimation = MetaStoreDirectSql.extractSqlLong(MetaStoreDirectSql.extractSqlLong(bhigh - .subtract(blow).floatValue() / MetaStoreDirectSql.extractSqlDouble(avgDecimal))); - if (estimation < lowerBound) { - decimalStats.setNumDVs(lowerBound); - } else if (estimation > higherBound) { - decimalStats.setNumDVs(higherBound); - } else { - decimalStats.setNumDVs(estimation); - } - } else { - decimalStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner)); - } - data.setDecimalStats(decimalStats); - } - } - public static Decimal 
createThriftDecimal(String s) { BigDecimal d = new BigDecimal(s); return new Decimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short)d.scale()); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java index 3ac4fe1604..34a32715ea 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java @@ -80,8 +80,8 @@ import org.apache.hadoop.hive.metastore.api.UnknownDBException; import org.apache.hadoop.hive.metastore.api.UnknownPartitionException; import org.apache.hadoop.hive.metastore.api.UnknownTableException; -import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMerger; -import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMergerFactory; +import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMerger; +import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMergerFactory; import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregator.java similarity index 97% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregator.java index d81d612e92..e6c836b183 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.List; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregator.java similarity index 97% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregator.java index e796df2422..a34bc9f38b 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.List; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregator.java similarity index 93% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregator.java index 29a05390bf..f5ebc35fb3 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.List; @@ -28,6 +28,8 @@ public abstract class ColumnStatsAggregator { public boolean useDensityFunctionForNDVEstimation; + public double ndvTuner; + public abstract ColumnStatisticsObj aggregate(String colName, List partNames, List css) throws MetaException; } diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregatorFactory.java similarity index 89% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregatorFactory.java index 568bf0609b..173e06fe8e 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregatorFactory.java @@ -17,13 +17,14 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; @@ -34,7 +35,8 @@ private ColumnStatsAggregatorFactory() { } - public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, boolean useDensityFunctionForNDVEstimation) { + public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, + boolean useDensityFunctionForNDVEstimation, double ndvTuner) { ColumnStatsAggregator agg; switch (type) { case BOOLEAN_STATS: @@ -43,6 +45,9 @@ public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, boole case LONG_STATS: agg = new LongColumnStatsAggregator(); break; + case DATE_STATS: + agg = new DateColumnStatsAggregator(); + break; case DOUBLE_STATS: agg = new DoubleColumnStatsAggregator(); break; @@ -59,6 +64,7 @@ public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, boole throw new RuntimeException("Woh, bad. 
Unknown stats type " + type.toString()); } agg.useDensityFunctionForNDVEstimation = useDensityFunctionForNDVEstimation; + agg.ndvTuner = ndvTuner; return agg; } @@ -76,6 +82,10 @@ public static ColumnStatisticsObj newColumnStaticsObj(String colName, String col csd.setLongStats(new LongColumnStatsData()); break; + case DATE_STATS: + csd.setDateStats(new DateColumnStatsData()); + break; + case DOUBLE_STATS: csd.setDoubleStats(new DoubleColumnStatsData()); break; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java new file mode 100644 index 0000000000..e4d973289a --- /dev/null +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java @@ -0,0 +1,356 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hadoop.hive.metastore.columnstats.aggr; + +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Date; +import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DateColumnStatsAggregator extends ColumnStatsAggregator implements + IExtrapolatePartStatus { + + private static final Logger LOG = LoggerFactory.getLogger(DateColumnStatsAggregator.class); + + @Override + public ColumnStatisticsObj aggregate(String colName, List partNames, + List css) throws MetaException { + ColumnStatisticsObj statsObj = null; + + // check if all the ColumnStatisticsObjs contain stats and all the ndv are + // bitvectors + boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.info("doAllPartitionContainStats for " + colName + " is " + doAllPartitionContainStats); + NumDistinctValueEstimator ndvEstimator = null; + String colType = null; + for (ColumnStatistics cs : css) { + if (cs.getStatsObjSize() != 1) { + throw new MetaException( + "The number of columns should be exactly one in aggrStats, but found " + + cs.getStatsObjSize()); + } + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + if (statsObj == null) { + colType = cso.getColType(); + statsObj = 
ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso + .getStatsData().getSetField()); + } + if (!cso.getStatsData().getDateStats().isSetBitVectors() + || cso.getStatsData().getDateStats().getBitVectors().length() == 0) { + ndvEstimator = null; + break; + } else { + // check if all of the bit vectors can merge + NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(cso.getStatsData().getDateStats().getBitVectors()); + if (ndvEstimator == null) { + ndvEstimator = estimator; + } else { + if (ndvEstimator.canMerge(estimator)) { + continue; + } else { + ndvEstimator = null; + break; + } + } + } + } + if (ndvEstimator != null) { + ndvEstimator = NumDistinctValueEstimatorFactory + .getEmptyNumDistinctValueEstimator(ndvEstimator); + } + LOG.info("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); + ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); + if (doAllPartitionContainStats || css.size() < 2) { + DateColumnStatsData aggregateData = null; + long lowerBound = 0; + long higherBound = 0; + double densityAvgSum = 0.0; + for (ColumnStatistics cs : css) { + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + DateColumnStatsData newData = cso.getStatsData().getDateStats(); + lowerBound = Math.max(lowerBound, newData.getNumDVs()); + higherBound += newData.getNumDVs(); + densityAvgSum += (diff(newData.getHighValue(), newData.getLowValue())) + / newData.getNumDVs(); + if (ndvEstimator != null) { + ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(newData.getBitVectors())); + } + if (aggregateData == null) { + aggregateData = newData.deepCopy(); + } else { + aggregateData.setLowValue(min(aggregateData.getLowValue(), newData.getLowValue())); + aggregateData + .setHighValue(max(aggregateData.getHighValue(), newData.getHighValue())); + aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); + aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); + } + } + if (ndvEstimator != null) { + // if all the ColumnStatisticsObjs contain bitvectors, we do not need to + // use uniform distribution assumption because we can merge bitvectors + // to get a good estimation. + aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); + } else { + long estimation; + if (useDensityFunctionForNDVEstimation) { + // We have estimation, lowerbound and higherbound. We use estimation + // if it is between lowerbound and higherbound. + double densityAvg = densityAvgSum / partNames.size(); + estimation = (long) (diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / densityAvg); + if (estimation < lowerBound) { + estimation = lowerBound; + } else if (estimation > higherBound) { + estimation = higherBound; + } + } else { + estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); + } + aggregateData.setNumDVs(estimation); + } + columnStatisticsData.setDateStats(aggregateData); + } else { + // we need extrapolation + LOG.info("start extrapolation for " + colName); + + Map indexMap = new HashMap(); + for (int index = 0; index < partNames.size(); index++) { + indexMap.put(partNames.get(index), index); + } + Map adjustedIndexMap = new HashMap(); + Map adjustedStatsMap = new HashMap(); + // while we scan the css, we also get the densityAvg, lowerbound and + // higerbound when useDensityFunctionForNDVEstimation is true. 
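The density-function path referenced in the comment above reduces to a small calculation: each partition contributes its value range divided by its NDV to an average density, and the aggregate NDV guess is the aggregate range divided by that average density, clamped into [max per-partition NDV, sum of per-partition NDVs]. A minimal, self-contained sketch with made-up partition stats (plain Java, none of the Hive metastore types):

    // Illustrative only: density-function NDV estimation over two hypothetical partitions.
    public class DensityNdvSketch {
      public static void main(String[] args) {
        // Per-partition stats: {low, high, ndv} -- made-up numbers for illustration.
        long[][] parts = { { 0, 99, 50 }, { 100, 199, 80 } };

        long lowerBound = 0;        // max of the per-partition NDVs
        long higherBound = 0;       // sum of the per-partition NDVs
        double densityAvgSum = 0.0; // sum of (high - low) / ndv
        long aggLow = Long.MAX_VALUE, aggHigh = Long.MIN_VALUE;

        for (long[] p : parts) {
          lowerBound = Math.max(lowerBound, p[2]);
          higherBound += p[2];
          densityAvgSum += (double) (p[1] - p[0]) / p[2];
          aggLow = Math.min(aggLow, p[0]);
          aggHigh = Math.max(aggHigh, p[1]);
        }

        double densityAvg = densityAvgSum / parts.length;
        long estimation = (long) ((aggHigh - aggLow) / densityAvg);
        // Clamp into the [max, sum] bounds, as the aggregators do.
        estimation = Math.max(lowerBound, Math.min(higherBound, estimation));
        System.out.println(estimation); // prints 123, within the [80, 130] bounds
      }
    }

The extrapolation branch below accumulates exactly these quantities while it walks the partition stats.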
+ double densityAvgSum = 0.0; + if (ndvEstimator == null) { + // if not every partition uses bitvector for ndv, we just fall back to + // the traditional extrapolation methods. + for (ColumnStatistics cs : css) { + String partName = cs.getStatsDesc().getPartName(); + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + DateColumnStatsData newData = cso.getStatsData().getDateStats(); + if (useDensityFunctionForNDVEstimation) { + densityAvgSum += diff(newData.getHighValue(), newData.getLowValue()) / newData.getNumDVs(); + } + adjustedIndexMap.put(partName, (double) indexMap.get(partName)); + adjustedStatsMap.put(partName, cso.getStatsData()); + } + } else { + // we first merge all the adjacent bitvectors that we could merge and + // derive new partition names and index. + StringBuilder pseudoPartName = new StringBuilder(); + double pseudoIndexSum = 0; + int length = 0; + int curIndex = -1; + DateColumnStatsData aggregateData = null; + for (ColumnStatistics cs : css) { + String partName = cs.getStatsDesc().getPartName(); + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + DateColumnStatsData newData = cso.getStatsData().getDateStats(); + // newData.isSetBitVectors() should be true for sure because we + // already checked it before. + if (indexMap.get(partName) != curIndex) { + // There is bitvector, but it is not adjacent to the previous ones. + if (length > 0) { + // we have to set ndv + adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); + aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); + ColumnStatisticsData csd = new ColumnStatisticsData(); + csd.setDateStats(aggregateData); + adjustedStatsMap.put(pseudoPartName.toString(), csd); + if (useDensityFunctionForNDVEstimation) { + densityAvgSum += diff(aggregateData.getHighValue(), aggregateData.getLowValue()) + / aggregateData.getNumDVs(); + } + // reset everything + pseudoPartName = new StringBuilder(); + pseudoIndexSum = 0; + length = 0; + ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator); + } + aggregateData = null; + } + curIndex = indexMap.get(partName); + pseudoPartName.append(partName); + pseudoIndexSum += curIndex; + length++; + curIndex++; + if (aggregateData == null) { + aggregateData = newData.deepCopy(); + } else { + aggregateData.setLowValue(min(aggregateData.getLowValue(), newData.getLowValue())); + aggregateData.setHighValue(max(aggregateData.getHighValue(), newData.getHighValue())); + aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); + } + ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(newData.getBitVectors())); + } + if (length > 0) { + // we have to set ndv + adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); + aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); + ColumnStatisticsData csd = new ColumnStatisticsData(); + csd.setDateStats(aggregateData); + adjustedStatsMap.put(pseudoPartName.toString(), csd); + if (useDensityFunctionForNDVEstimation) { + densityAvgSum += diff(aggregateData.getHighValue(), aggregateData.getLowValue()) + / aggregateData.getNumDVs(); + } + } + } + extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, + adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); + } + statsObj.setStatsData(columnStatisticsData); + return statsObj; + } + + private long diff(Date d1, Date d2) { + return d1.getDaysSinceEpoch() - d2.getDaysSinceEpoch(); + } + + private Date 
min(Date d1, Date d2) { + return d1.compareTo(d2) < 0 ? d1 : d2; + } + + private Date max(Date d1, Date d2) { + return d1.compareTo(d2) < 0 ? d2 : d1; + } + + @Override + public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, + int numPartsWithStats, Map adjustedIndexMap, + Map adjustedStatsMap, double densityAvg) { + int rightBorderInd = numParts; + DateColumnStatsData extrapolateDateData = new DateColumnStatsData(); + Map extractedAdjustedStatsMap = new HashMap<>(); + for (Map.Entry entry : adjustedStatsMap.entrySet()) { + extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDateStats()); + } + List> list = new LinkedList>( + extractedAdjustedStatsMap.entrySet()); + // get the lowValue + Collections.sort(list, new Comparator>() { + public int compare(Map.Entry o1, + Map.Entry o2) { + return diff(o1.getValue().getLowValue(), o2.getValue().getLowValue()) < 0 ? -1 : 1; + } + }); + double minInd = adjustedIndexMap.get(list.get(0).getKey()); + double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); + long lowValue = 0; + long min = list.get(0).getValue().getLowValue().getDaysSinceEpoch(); + long max = list.get(list.size() - 1).getValue().getLowValue().getDaysSinceEpoch(); + if (minInd == maxInd) { + lowValue = min; + } else if (minInd < maxInd) { + // left border is the min + lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd)); + } else { + // right border is the min + lowValue = (long) (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd)); + } + + // get the highValue + Collections.sort(list, new Comparator>() { + public int compare(Map.Entry o1, + Map.Entry o2) { + return diff(o1.getValue().getHighValue(), o2.getValue().getHighValue()) < 0 ? -1 : 1; + } + }); + minInd = adjustedIndexMap.get(list.get(0).getKey()); + maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); + long highValue = 0; + min = list.get(0).getValue().getHighValue().getDaysSinceEpoch(); + max = list.get(list.size() - 1).getValue().getHighValue().getDaysSinceEpoch(); + if (minInd == maxInd) { + highValue = min; + } else if (minInd < maxInd) { + // right border is the max + highValue = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); + } else { + // left border is the max + highValue = (long) (min + (max - min) * minInd / (minInd - maxInd)); + } + + // get the #nulls + long numNulls = 0; + for (Map.Entry entry : extractedAdjustedStatsMap.entrySet()) { + numNulls += entry.getValue().getNumNulls(); + } + // we scale up sumNulls based on the number of partitions + numNulls = numNulls * numParts / numPartsWithStats; + + // get the ndv + long ndv = 0; + Collections.sort(list, new Comparator>() { + public int compare(Map.Entry o1, + Map.Entry o2) { + return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ? 
-1 : 1; + } + }); + long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); + long higherBound = 0; + for (Map.Entry entry : list) { + higherBound += entry.getValue().getNumDVs(); + } + if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) { + ndv = (long) ((highValue - lowValue) / densityAvg); + if (ndv < lowerBound) { + ndv = lowerBound; + } else if (ndv > higherBound) { + ndv = higherBound; + } + } else { + minInd = adjustedIndexMap.get(list.get(0).getKey()); + maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); + min = list.get(0).getValue().getNumDVs(); + max = list.get(list.size() - 1).getValue().getNumDVs(); + if (minInd == maxInd) { + ndv = min; + } else if (minInd < maxInd) { + // right border is the max + ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); + } else { + // left border is the max + ndv = (long) (min + (max - min) * minInd / (minInd - maxInd)); + } + } + extrapolateDateData.setLowValue(new Date(lowValue)); + extrapolateDateData.setHighValue(new Date(highValue)); + extrapolateDateData.setNumNulls(numNulls); + extrapolateDateData.setNumDVs(ndv); + extrapolateData.setDateStats(extrapolateDateData); + } +} diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java similarity index 93% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java index 8eb64e0143..b230ba3f94 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Collections; import java.util.Comparator; @@ -35,9 +35,13 @@ import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.hbase.HBaseUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implements IExtrapolatePartStatus { + + private static final Logger LOG = LoggerFactory.getLogger(DecimalColumnStatsAggregator.class); @Override public ColumnStatisticsObj aggregate(String colName, List partNames, @@ -47,6 +51,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, // check if all the ColumnStatisticsObjs contain stats and all the ndv are // bitvectors boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.info("doAllPartitionContainStats for " + colName + " is " + doAllPartitionContainStats); NumDistinctValueEstimator ndvEstimator = null; String colType = null; for (ColumnStatistics cs : css) { @@ -85,6 +90,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, ndvEstimator = NumDistinctValueEstimatorFactory .getEmptyNumDistinctValueEstimator(ndvEstimator); } + LOG.info("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); if (doAllPartitionContainStats || css.size() < 2) { DecimalColumnStatsData aggregateData = null; @@ -94,12 +100,10 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, for (ColumnStatistics cs : css) { ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats(); - if (useDensityFunctionForNDVEstimation) { - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils - .getDoubleValue(newData.getLowValue())) / newData.getNumDVs(); - } + lowerBound = Math.max(lowerBound, newData.getNumDVs()); + higherBound += newData.getNumDVs(); + densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils + .getDoubleValue(newData.getLowValue())) / newData.getNumDVs(); if (ndvEstimator != null) { ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory .getNumDistinctValueEstimator(newData.getBitVectors())); @@ -129,28 +133,27 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, // to get a good estimation. aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); } else { + long estimation; if (useDensityFunctionForNDVEstimation) { // We have estimation, lowerbound and higherbound. We use estimation // if it is between lowerbound and higherbound. 
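In the hunk that follows, the density-based estimate is clamped into [lowerBound, higherBound] exactly as the comment says, and the new else branch replaces the old take-the-maximum behaviour with the ndvTuner knob: a linear interpolation between the max of the per-partition NDVs (tuner = 0, partitions assumed to overlap completely) and their sum (tuner = 1, partitions assumed disjoint). A tiny sketch with made-up bounds:

    // Illustrative only: ndvTuner interpolation between max and sum of per-partition NDVs.
    public class NdvTunerSketch {
      public static void main(String[] args) {
        long lowerBound = 80;   // max NDV seen in any single partition
        long higherBound = 130; // sum of the per-partition NDVs
        for (double ndvTuner : new double[] { 0.0, 0.5, 1.0 }) {
          long estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
          System.out.println("ndvTuner=" + ndvTuner + " -> ndv=" + estimation); // 80, 105, 130
        }
      }
    }

The same interpolation is applied in the Long, Double, Decimal and Date aggregators touched by this patch.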
double densityAvg = densityAvgSum / partNames.size(); - long estimation = (long) ((HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils + estimation = (long) ((HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils .getDoubleValue(aggregateData.getLowValue())) / densityAvg); if (estimation < lowerBound) { - aggregateData.setNumDVs(lowerBound); + estimation = lowerBound; } else if (estimation > higherBound) { - aggregateData.setNumDVs(higherBound); - } else { - aggregateData.setNumDVs(estimation); + estimation = higherBound; } } else { - // Without useDensityFunctionForNDVEstimation, we just use the - // default one, which is the max of all the partitions and it is - // already done. + estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); } + aggregateData.setNumDVs(estimation); } columnStatisticsData.setDecimalStats(aggregateData); } else { // we need extrapolation + LOG.info("start extrapolation for " + colName); Map indexMap = new HashMap(); for (int index = 0; index < partNames.size(); index++) { indexMap.put(partNames.get(index), index); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java similarity index 93% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java index b6b86123b2..7d9db53332 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Collections; import java.util.Comparator; @@ -33,10 +33,14 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.MetaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implements IExtrapolatePartStatus { + private static final Logger LOG = LoggerFactory.getLogger(LongColumnStatsAggregator.class); + @Override public ColumnStatisticsObj aggregate(String colName, List partNames, List css) throws MetaException { @@ -45,6 +49,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, // check if all the ColumnStatisticsObjs contain stats and all the ndv are // bitvectors boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.info("doAllPartitionContainStats for " + colName + " is " + doAllPartitionContainStats); NumDistinctValueEstimator ndvEstimator = null; String colType = null; for (ColumnStatistics cs : css) { @@ -83,6 +88,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, ndvEstimator = NumDistinctValueEstimatorFactory .getEmptyNumDistinctValueEstimator(ndvEstimator); } + LOG.info("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); if (doAllPartitionContainStats || css.size() < 2) { DoubleColumnStatsData aggregateData = null; @@ -92,11 +98,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, for (ColumnStatistics cs : css) { ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats(); - if (useDensityFunctionForNDVEstimation) { - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - } + lowerBound = Math.max(lowerBound, newData.getNumDVs()); + higherBound += newData.getNumDVs(); + densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); if (ndvEstimator != null) { ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory .getNumDistinctValueEstimator(newData.getBitVectors())); @@ -117,27 +121,26 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, // to get a good estimation. aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); } else { + long estimation; if (useDensityFunctionForNDVEstimation) { // We have estimation, lowerbound and higherbound. We use estimation // if it is between lowerbound and higherbound. double densityAvg = densityAvgSum / partNames.size(); - long estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); + estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); if (estimation < lowerBound) { - aggregateData.setNumDVs(lowerBound); + estimation = lowerBound; } else if (estimation > higherBound) { - aggregateData.setNumDVs(higherBound); - } else { - aggregateData.setNumDVs(estimation); + estimation = higherBound; } } else { - // Without useDensityFunctionForNDVEstimation, we just use the - // default one, which is the max of all the partitions and it is - // already done. 
+ estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); } + aggregateData.setNumDVs(estimation); } columnStatisticsData.setDoubleStats(aggregateData); } else { // we need extrapolation + LOG.info("start extrapolation for " + colName); Map indexMap = new HashMap(); for (int index = 0; index < partNames.size(); index++) { indexMap.put(partNames.get(index), index); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/IExtrapolatePartStatus.java similarity index 96% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/IExtrapolatePartStatus.java index af75bced72..acf679e1c3 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/IExtrapolatePartStatus.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Map; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java similarity index 93% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java index 2da6f60167..f28a4f2b3a 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Collections; import java.util.Comparator; @@ -33,10 +33,14 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.metastore.api.MetaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class LongColumnStatsAggregator extends ColumnStatsAggregator implements IExtrapolatePartStatus { + private static final Logger LOG = LoggerFactory.getLogger(LongColumnStatsAggregator.class); + @Override public ColumnStatisticsObj aggregate(String colName, List partNames, List css) throws MetaException { @@ -45,6 +49,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, // check if all the ColumnStatisticsObjs contain stats and all the ndv are // bitvectors boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.info("doAllPartitionContainStats for " + colName + " is " + doAllPartitionContainStats); NumDistinctValueEstimator ndvEstimator = null; String colType = null; for (ColumnStatistics cs : css) { @@ -83,6 +88,7 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, ndvEstimator = NumDistinctValueEstimatorFactory .getEmptyNumDistinctValueEstimator(ndvEstimator); } + LOG.info("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); if (doAllPartitionContainStats || css.size() < 2) { LongColumnStatsData aggregateData = null; @@ -92,11 +98,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, for (ColumnStatistics cs : css) { ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); LongColumnStatsData newData = cso.getStatsData().getLongStats(); - if (useDensityFunctionForNDVEstimation) { - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - } + lowerBound = Math.max(lowerBound, newData.getNumDVs()); + higherBound += newData.getNumDVs(); + densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); if (ndvEstimator != null) { ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory .getNumDistinctValueEstimator(newData.getBitVectors())); @@ -117,27 +121,27 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, // to get a good estimation. aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); } else { + long estimation; if (useDensityFunctionForNDVEstimation) { // We have estimation, lowerbound and higherbound. We use estimation // if it is between lowerbound and higherbound. double densityAvg = densityAvgSum / partNames.size(); - long estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); + estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); if (estimation < lowerBound) { - aggregateData.setNumDVs(lowerBound); + estimation = lowerBound; } else if (estimation > higherBound) { - aggregateData.setNumDVs(higherBound); - } else { - aggregateData.setNumDVs(estimation); + estimation = higherBound; } } else { - // Without useDensityFunctionForNDVEstimation, we just use the - // default one, which is the max of all the partitions and it is - // already done. 
+ estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); } + aggregateData.setNumDVs(estimation); } columnStatisticsData.setLongStats(aggregateData); } else { // we need extrapolation + LOG.info("start extrapolation for " + colName); + Map indexMap = new HashMap(); for (int index = 0; index < partNames.size(); index++) { indexMap.put(partNames.get(index), index); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java new file mode 100644 index 0000000000..cf5a895881 --- /dev/null +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java @@ -0,0 +1,301 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hadoop.hive.metastore.columnstats.aggr; + +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class StringColumnStatsAggregator extends ColumnStatsAggregator implements + IExtrapolatePartStatus { + + private static final Logger LOG = LoggerFactory.getLogger(LongColumnStatsAggregator.class); + + @Override + public ColumnStatisticsObj aggregate(String colName, List partNames, + List css) throws MetaException { + ColumnStatisticsObj statsObj = null; + + // check if all the ColumnStatisticsObjs contain stats and all the ndv are + // bitvectors. Only when both of the conditions are true, we merge bit + // vectors. Otherwise, just use the maximum function. 
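The new String aggregator below follows the same shape as the other per-type aggregators: it first checks whether every partition carries a serialized NDV estimator in bitVectors and whether those estimators are mutually mergeable; only then does it merge the sketches to obtain the aggregate NDV, otherwise it falls back to the max/tuner/extrapolation heuristics. A condensed sketch of that decision, using only the factory and estimator calls that appear in this patch (the helper itself is hypothetical and not part of the change):

    import java.util.List;
    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
    import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;

    public class BitVectorMergeSketch {
      /** Returns the merged NDV if every serialized sketch is present and mergeable, else -1. */
      static long mergedNdvOrFallback(List<String> bitVectors) {
        NumDistinctValueEstimator merged = null;
        for (String s : bitVectors) {
          if (s == null || s.isEmpty()) {
            return -1; // some partition has no sketch: caller must use the heuristics
          }
          NumDistinctValueEstimator est =
              NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(s);
          if (merged == null) {
            merged = est;
          } else if (merged.canMerge(est)) {
            merged.mergeEstimators(est); // e.g. OR the FM bit vectors, max the HLL registers
          } else {
            return -1; // incompatible sketches (FM vs. HLL, or different sizes)
          }
        }
        return merged == null ? -1 : merged.estimateNumDistinctValues();
      }
    }

Note the patch itself does this in two passes (verify mergeability first, then reset to an empty estimator and merge); the one-pass helper above is only meant to show the decision.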
+ boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.info("doAllPartitionContainStats for " + colName + " is " + doAllPartitionContainStats); + NumDistinctValueEstimator ndvEstimator = null; + String colType = null; + for (ColumnStatistics cs : css) { + if (cs.getStatsObjSize() != 1) { + throw new MetaException( + "The number of columns should be exactly one in aggrStats, but found " + + cs.getStatsObjSize()); + } + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + if (statsObj == null) { + colType = cso.getColType(); + statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso + .getStatsData().getSetField()); + } + if (!cso.getStatsData().getStringStats().isSetBitVectors() + || cso.getStatsData().getStringStats().getBitVectors().length() == 0) { + ndvEstimator = null; + break; + } else { + // check if all of the bit vectors can merge + NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(cso.getStatsData().getStringStats().getBitVectors()); + if (ndvEstimator == null) { + ndvEstimator = estimator; + } else { + if (ndvEstimator.canMerge(estimator)) { + continue; + } else { + ndvEstimator = null; + break; + } + } + } + } + if (ndvEstimator != null) { + ndvEstimator = NumDistinctValueEstimatorFactory + .getEmptyNumDistinctValueEstimator(ndvEstimator); + } + LOG.info("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); + ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); + if (doAllPartitionContainStats || css.size() < 2) { + StringColumnStatsData aggregateData = null; + for (ColumnStatistics cs : css) { + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + StringColumnStatsData newData = cso.getStatsData().getStringStats(); + if (ndvEstimator != null) { + ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(newData.getBitVectors())); + } + if (aggregateData == null) { + aggregateData = newData.deepCopy(); + } else { + aggregateData + .setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen())); + aggregateData + .setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen())); + aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); + aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); + } + } + if (ndvEstimator != null) { + // if all the ColumnStatisticsObjs contain bitvectors, we do not need to + // use uniform distribution assumption because we can merge bitvectors + // to get a good estimation. + aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); + } else { + // aggregateData already has the ndv of the max of all + } + columnStatisticsData.setStringStats(aggregateData); + } else { + // we need extrapolation + LOG.info("start extrapolation for " + colName); + + Map indexMap = new HashMap(); + for (int index = 0; index < partNames.size(); index++) { + indexMap.put(partNames.get(index), index); + } + Map adjustedIndexMap = new HashMap(); + Map adjustedStatsMap = new HashMap(); + if (ndvEstimator == null) { + // if not every partition uses bitvector for ndv, we just fall back to + // the traditional extrapolation methods. 
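The "traditional extrapolation" mentioned above is the linear scheme implemented by extrapolate(): for each statistic, the partitions that do have stats are sorted, the smallest and largest values are paired with the adjusted indices at which they occur, and the line through those two points is extended to the table's left border (index 0) or right border (numParts), depending on whether the statistic grows or shrinks with the index; null counts are simply scaled by numParts / numPartsWithStats. A self-contained sketch of the border formula with made-up numbers:

    // Illustrative only: the linear extrapolation used by extrapolate() for an
    // increasing statistic (e.g. highValue or NDV), with made-up numbers.
    public class ExtrapolateSketch {
      public static void main(String[] args) {
        int numParts = 10;              // total partitions in the table
        double minInd = 2, maxInd = 6;  // indices of the partitions holding min/max
        double min = 100, max = 300;    // the statistic at those partitions

        double extrapolated;
        if (minInd == maxInd) {
          extrapolated = min;           // single point: nothing to extrapolate
        } else if (minInd < maxInd) {
          // statistic grows with the partition index: extend the line to the right border
          extrapolated = min + (max - min) * (numParts - minInd) / (maxInd - minInd);
        } else {
          // statistic shrinks with the partition index: extend the line to the left border
          extrapolated = min + (max - min) * minInd / (minInd - maxInd);
        }
        System.out.println(extrapolated); // 100 + 200 * 8 / 4 = 500
      }
    }
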
+ for (ColumnStatistics cs : css) { + String partName = cs.getStatsDesc().getPartName(); + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + StringColumnStatsData newData = cso.getStatsData().getStringStats(); + adjustedIndexMap.put(partName, (double) indexMap.get(partName)); + adjustedStatsMap.put(partName, cso.getStatsData()); + } + } else { + // we first merge all the adjacent bitvectors that we could merge and + // derive new partition names and index. + StringBuilder pseudoPartName = new StringBuilder(); + double pseudoIndexSum = 0; + int length = 0; + int curIndex = -1; + StringColumnStatsData aggregateData = null; + for (ColumnStatistics cs : css) { + String partName = cs.getStatsDesc().getPartName(); + ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); + StringColumnStatsData newData = cso.getStatsData().getStringStats(); + // newData.isSetBitVectors() should be true for sure because we + // already checked it before. + if (indexMap.get(partName) != curIndex) { + // There is bitvector, but it is not adjacent to the previous ones. + if (length > 0) { + // we have to set ndv + adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); + aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); + ColumnStatisticsData csd = new ColumnStatisticsData(); + csd.setStringStats(aggregateData); + adjustedStatsMap.put(pseudoPartName.toString(), csd); + // reset everything + pseudoPartName = new StringBuilder(); + pseudoIndexSum = 0; + length = 0; + ndvEstimator = NumDistinctValueEstimatorFactory + .getEmptyNumDistinctValueEstimator(ndvEstimator); + } + aggregateData = null; + } + curIndex = indexMap.get(partName); + pseudoPartName.append(partName); + pseudoIndexSum += curIndex; + length++; + curIndex++; + if (aggregateData == null) { + aggregateData = newData.deepCopy(); + } else { + aggregateData.setAvgColLen(Math.min(aggregateData.getAvgColLen(), + newData.getAvgColLen())); + aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), + newData.getMaxColLen())); + aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); + } + ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(newData.getBitVectors())); + } + if (length > 0) { + // we have to set ndv + adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); + aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); + ColumnStatisticsData csd = new ColumnStatisticsData(); + csd.setStringStats(aggregateData); + adjustedStatsMap.put(pseudoPartName.toString(), csd); + } + } + extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, + adjustedStatsMap, -1); + } + statsObj.setStatsData(columnStatisticsData); + return statsObj; + } + + @Override + public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, + int numPartsWithStats, Map adjustedIndexMap, + Map adjustedStatsMap, double densityAvg) { + int rightBorderInd = numParts; + StringColumnStatsData extrapolateStringData = new StringColumnStatsData(); + Map extractedAdjustedStatsMap = new HashMap<>(); + for (Map.Entry entry : adjustedStatsMap.entrySet()) { + extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getStringStats()); + } + List> list = new LinkedList>( + extractedAdjustedStatsMap.entrySet()); + // get the avgLen + Collections.sort(list, new Comparator>() { + public int compare(Map.Entry o1, + Map.Entry o2) { + return o1.getValue().getAvgColLen() < o2.getValue().getAvgColLen() ? 
-1 : 1; + } + }); + double minInd = adjustedIndexMap.get(list.get(0).getKey()); + double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); + double avgColLen = 0; + double min = list.get(0).getValue().getAvgColLen(); + double max = list.get(list.size() - 1).getValue().getAvgColLen(); + if (minInd == maxInd) { + avgColLen = min; + } else if (minInd < maxInd) { + // right border is the max + avgColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); + } else { + // left border is the max + avgColLen = (min + (max - min) * minInd / (minInd - maxInd)); + } + + // get the maxLen + Collections.sort(list, new Comparator>() { + public int compare(Map.Entry o1, + Map.Entry o2) { + return o1.getValue().getMaxColLen() < o2.getValue().getMaxColLen() ? -1 : 1; + } + }); + minInd = adjustedIndexMap.get(list.get(0).getKey()); + maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); + double maxColLen = 0; + min = list.get(0).getValue().getAvgColLen(); + max = list.get(list.size() - 1).getValue().getAvgColLen(); + if (minInd == maxInd) { + maxColLen = min; + } else if (minInd < maxInd) { + // right border is the max + maxColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); + } else { + // left border is the max + maxColLen = (min + (max - min) * minInd / (minInd - maxInd)); + } + + // get the #nulls + long numNulls = 0; + for (Map.Entry entry : extractedAdjustedStatsMap.entrySet()) { + numNulls += entry.getValue().getNumNulls(); + } + // we scale up sumNulls based on the number of partitions + numNulls = numNulls * numParts / numPartsWithStats; + + // get the ndv + long ndv = 0; + Collections.sort(list, new Comparator>() { + public int compare(Map.Entry o1, + Map.Entry o2) { + return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ? -1 : 1; + } + }); + minInd = adjustedIndexMap.get(list.get(0).getKey()); + maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); + min = list.get(0).getValue().getNumDVs(); + max = list.get(list.size() - 1).getValue().getNumDVs(); + if (minInd == maxInd) { + ndv = (long) min; + } else if (minInd < maxInd) { + // right border is the max + ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); + } else { + // left border is the max + ndv = (long) (min + (max - min) * minInd / (minInd - maxInd)); + } + extrapolateStringData.setAvgColLen(avgColLen); + ; + extrapolateStringData.setMaxColLen((long) maxColLen); + extrapolateStringData.setNumNulls(numNulls); + extrapolateStringData.setNumDVs(ndv); + extrapolateData.setStringStats(extrapolateStringData); + } + +} diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/BinaryColumnStatsMerger.java similarity index 96% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/BinaryColumnStatsMerger.java index af0669eb65..4c2d1bc602 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/BinaryColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/BooleanColumnStatsMerger.java similarity index 96% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/BooleanColumnStatsMerger.java index 33ff6a19f5..8e5015323f 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/BooleanColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMerger.java similarity index 95% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMerger.java index d3051a2b00..474d4ddcd1 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.slf4j.Logger; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMergerFactory.java similarity index 98% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMergerFactory.java index c013ba5c5d..0ce1847d1c 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMergerFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DateColumnStatsMerger.java similarity index 98% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DateColumnStatsMerger.java index e899bfe85f..2542a00d36 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DateColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java similarity index 98% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java index 4099ffcace..4e8e129758 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DoubleColumnStatsMerger.java similarity index 97% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DoubleColumnStatsMerger.java index 1691fc97df..4ef5c39d1c 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/DoubleColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/LongColumnStatsMerger.java similarity index 97% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/LongColumnStatsMerger.java index 361af350fe..acf7f03c72 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/LongColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/StringColumnStatsMerger.java similarity index 97% rename from metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java rename to metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/StringColumnStatsMerger.java index 8e28f907ee..b3cd33c671 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/StringColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java index 0e119896a5..78a962a0e6 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java @@ -32,8 +32,8 @@ import org.apache.hadoop.hive.metastore.api.AggrStats; import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; -import org.apache.hadoop.hive.metastore.hbase.stats.ColumnStatsAggregator; -import org.apache.hadoop.hive.metastore.hbase.stats.ColumnStatsAggregatorFactory; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregatorFactory; import java.io.IOException; import java.security.MessageDigest; @@ -84,7 +84,10 @@ private StatsCache(final Configuration conf) { .build(new CacheLoader() { @Override public AggrStats load(StatsCacheKey key) throws Exception { - boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION); + boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(conf, + HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION); + double ndvTuner = HiveConf.getFloatVar(conf, + HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER); HBaseReadWrite hrw = HBaseReadWrite.getInstance(); AggrStats aggrStats = hrw.getAggregatedStats(key.hashed); if (aggrStats == null) { @@ -100,7 +103,7 @@ public AggrStats load(StatsCacheKey key) throws Exception { if (aggregator == null) { aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css.iterator() .next().getStatsObj().iterator().next().getStatsData().getSetField(), - useDensityFunctionForNDVEstimation); + useDensityFunctionForNDVEstimation, ndvTuner); } ColumnStatisticsObj statsObj = aggregator .aggregate(key.colName, key.partNames, css); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java deleted file mode 100644 index 83c6c54fd2..0000000000 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.hadoop.hive.metastore.hbase.stats; - -import java.util.List; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.api.ColumnStatistics; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; - -public class StringColumnStatsAggregator extends ColumnStatsAggregator { - - @Override - public ColumnStatisticsObj aggregate(String colName, List partNames, - List css) throws MetaException { - ColumnStatisticsObj statsObj = null; - - // check if all the ColumnStatisticsObjs contain stats and all the ndv are - // bitvectors. Only when both of the conditions are true, we merge bit - // vectors. Otherwise, just use the maximum function. - boolean doAllPartitionContainStats = partNames.size() == css.size(); - NumDistinctValueEstimator ndvEstimator = null; - String colType = null; - for (ColumnStatistics cs : css) { - if (cs.getStatsObjSize() != 1) { - throw new MetaException( - "The number of columns should be exactly one in aggrStats, but found " - + cs.getStatsObjSize()); - } - ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); - if (statsObj == null) { - colType = cso.getColType(); - statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso - .getStatsData().getSetField()); - } - if (!cso.getStatsData().getStringStats().isSetBitVectors() - || cso.getStatsData().getStringStats().getBitVectors().length() == 0) { - ndvEstimator = null; - break; - } else { - // check if all of the bit vectors can merge - NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(cso.getStatsData().getStringStats().getBitVectors()); - if (ndvEstimator == null) { - ndvEstimator = estimator; - } else { - if (ndvEstimator.canMerge(estimator)) { - continue; - } else { - ndvEstimator = null; - break; - } - } - } - } - if (ndvEstimator != null) { - ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator); - } - ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); - if (doAllPartitionContainStats && ndvEstimator!=null) { - StringColumnStatsData aggregateData = null; - for (ColumnStatistics cs : css) { - ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); - StringColumnStatsData newData = cso.getStatsData().getStringStats(); - ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(newData.getBitVectors())); - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData - .setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen())); - aggregateData - .setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - } - } - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - columnStatisticsData.setStringStats(aggregateData); - } else { - StringColumnStatsData aggregateData = null; - for (ColumnStatistics cs : css) { - ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); - StringColumnStatsData newData = cso.getStatsData().getStringStats(); - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - 
aggregateData - .setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen())); - aggregateData - .setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); - } - } - columnStatisticsData.setStringStats(aggregateData); - } - statsObj.setStatsData(columnStatisticsData); - return statsObj; - } - -} diff --git a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java index 2967a60fae..27fbdd30ba 100644 --- a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java +++ b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java @@ -48,6 +48,7 @@ private String decimalHighValue; private Long numNulls; private Long numDVs; + private String bitVector; private Double avgColLen; private Long maxColLen; private Long numTrues; @@ -166,31 +167,35 @@ public void setBooleanStats(Long numTrues, Long numFalses, Long numNulls) { this.numNulls = numNulls; } - public void setLongStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setLongStats(Long numNulls, Long numNDVs, String bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } - public void setDoubleStats(Long numNulls, Long numNDVs, Double lowValue, Double highValue) { + public void setDoubleStats(Long numNulls, Long numNDVs, String bitVector, Double lowValue, Double highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.doubleLowValue = lowValue; this.doubleHighValue = highValue; } public void setDecimalStats( - Long numNulls, Long numNDVs, String lowValue, String highValue) { + Long numNulls, Long numNDVs, String bitVector, String lowValue, String highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.decimalLowValue = lowValue; this.decimalHighValue = highValue; } - public void setStringStats(Long numNulls, Long numNDVs, Long maxColLen, Double avgColLen) { + public void setStringStats(Long numNulls, Long numNDVs, String bitVector, Long maxColLen, Double avgColLen) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.maxColLen = maxColLen; this.avgColLen = avgColLen; } @@ -201,9 +206,10 @@ public void setBinaryStats(Long numNulls, Long maxColLen, Double avgColLen) { this.avgColLen = avgColLen; } - public void setDateStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setDateStats(Long numNulls, Long numNDVs, String bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } @@ -255,4 +261,12 @@ public String getDecimalHighValue() { public void setDecimalHighValue(String decimalHighValue) { this.decimalHighValue = decimalHighValue; } + + public String getBitVector() { + return bitVector; + } + + public void setBitVector(String bitVector) { + this.bitVector = bitVector; + } } diff --git a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java 
b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java index 132f7a137b..755087618b 100644 --- a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java +++ b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java @@ -46,6 +46,7 @@ private String decimalHighValue; private Long numNulls; private Long numDVs; + private String bitVector; private Double avgColLen; private Long maxColLen; private Long numTrues; @@ -156,31 +157,35 @@ public void setBooleanStats(Long numTrues, Long numFalses, Long numNulls) { this.numNulls = numNulls; } - public void setLongStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setLongStats(Long numNulls, Long numNDVs, String bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } - public void setDoubleStats(Long numNulls, Long numNDVs, Double lowValue, Double highValue) { + public void setDoubleStats(Long numNulls, Long numNDVs, String bitVector, Double lowValue, Double highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.doubleLowValue = lowValue; this.doubleHighValue = highValue; } public void setDecimalStats( - Long numNulls, Long numNDVs, String lowValue, String highValue) { + Long numNulls, Long numNDVs, String bitVector, String lowValue, String highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.decimalLowValue = lowValue; this.decimalHighValue = highValue; } - public void setStringStats(Long numNulls, Long numNDVs, Long maxColLen, Double avgColLen) { + public void setStringStats(Long numNulls, Long numNDVs, String bitVector, Long maxColLen, Double avgColLen) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.maxColLen = maxColLen; this.avgColLen = avgColLen; } @@ -191,9 +196,10 @@ public void setBinaryStats(Long numNulls, Long maxColLen, Double avgColLen) { this.avgColLen = avgColLen; } - public void setDateStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setDateStats(Long numNulls, Long numNDVs, String bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } @@ -246,4 +252,12 @@ public String getDecimalHighValue() { public void setDecimalHighValue(String decimalHighValue) { this.decimalHighValue = decimalHighValue; } + + public String getBitVector() { + return bitVector; + } + + public void setBitVector(String bitVector) { + this.bitVector = bitVector; + } } diff --git a/metastore/src/model/package.jdo b/metastore/src/model/package.jdo index 9c4bc219f2..3d759c7764 100644 --- a/metastore/src/model/package.jdo +++ b/metastore/src/model/package.jdo @@ -879,6 +879,9 @@ + + + @@ -943,6 +946,9 @@ + + + diff --git a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsExtrapolation.java b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsExtrapolation.java index 99ce96ca0d..4d868b0146 100644 --- a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsExtrapolation.java +++ b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsExtrapolation.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hbase.Cell; import 
org.apache.hadoop.hbase.client.HTableInterface; +import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.StatObjectConverter; import org.apache.hadoop.hive.metastore.api.AggrStats; @@ -62,8 +63,7 @@ SortedMap rows = new TreeMap<>(); // NDV will be 3 for the bitVectors - String bitVectors = "{0, 4, 5, 7}{0, 1}{0, 1, 2}{0, 1, 4}{0}{0, 2}{0, 3}{0, 2, 3, 4}{0, 1, 4}{0, 1}{0}{0, 1, 3, 8}{0, 2}{0, 2}{0, 9}{0, 1, 4}"; - + String bitVectors = null; @Before public void before() throws IOException { MockitoAnnotations.initMocks(this); @@ -71,6 +71,11 @@ public void before() throws IOException { conf.setBoolean(HBaseReadWrite.NO_CACHE_CONF, true); store = MockUtils.init(conf, htable, rows); store.backdoor().getStatsCache().resetCounters(); + HyperLogLog hll = HyperLogLog.builder().build(); + hll.addLong(1); + hll.addLong(2); + hll.addLong(3); + bitVectors = hll.serialize(); } private static interface Checker { @@ -395,7 +400,7 @@ public void noPartitionsHaveBitVectorStatus() throws Exception { dcsd.setHighValue(1000 + i); dcsd.setLowValue(-1000 - i); dcsd.setNumNulls(i); - dcsd.setNumDVs(10 * i); + dcsd.setNumDVs(i == 0 ? 1 : 10 * i); data.setLongStats(dcsd); obj.setStatsData(data); cs.addToStatsObj(obj); diff --git a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java index 74e16695a9..0ad27806d1 100644 --- a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java +++ b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.client.HTableInterface; +import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.StatObjectConverter; import org.apache.hadoop.hive.metastore.api.AggrStats; @@ -61,9 +62,8 @@ SortedMap rows = new TreeMap<>(); // NDV will be 3 for bitVectors[0] and 1 for bitVectors[1] - String bitVectors[] = { - "{0, 4, 5, 7}{0, 1}{0, 1, 2}{0, 1, 4}{0}{0, 2}{0, 3}{0, 2, 3, 4}{0, 1, 4}{0, 1}{0}{0, 1, 3, 8}{0, 2}{0, 2}{0, 9}{0, 1, 4}", - "{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}" }; + String bitVectors[] = new String[2]; + @Before public void before() throws IOException { @@ -73,6 +73,15 @@ public void before() throws IOException { conf.setBoolean(HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION.varname, true); store = MockUtils.init(conf, htable, rows); store.backdoor().getStatsCache().resetCounters(); + HyperLogLog hll = HyperLogLog.builder().build(); + hll.addLong(1); + bitVectors[1] = hll.serialize(); + hll = HyperLogLog.builder().build(); + hll.addLong(2); + hll.addLong(3); + hll.addLong(3); + hll.addLong(4); + bitVectors[0] = hll.serialize(); } private static interface Checker { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java index 97bf839ae1..16c440fc61 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java @@ -3396,7 +3396,7 @@ private int describeTable(Hive db, DescTableDesc descTbl) throws HiveException, ColStatistics.Range r = cs.getRange(); 
StatObjectConverter.fillColumnStatisticsData(partCol.getType(), data, r == null ? null : r.minValue, r == null ? null : r.maxValue, r == null ? null : r.minValue, r == null ? null : r.maxValue, r == null ? null : r.minValue.toString(), r == null ? null : r.maxValue.toString(), - cs.getNumNulls(), cs.getCountDistint(), cs.getAvgColLen(), cs.getAvgColLen(), cs.getNumTrues(), cs.getNumFalses()); + cs.getNumNulls(), cs.getCountDistint(), null, cs.getAvgColLen(), cs.getAvgColLen(), cs.getNumTrues(), cs.getNumFalses()); ColumnStatisticsObj cso = new ColumnStatisticsObj(partCol.getName(), partCol.getType(), data); colStats = Collections.singletonList(cso); StatsSetupConst.setColumnStatsState(tblProps, colNames); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java index aa77234c28..464f0b7cae 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/formatting/MetaDataFormatUtils.java @@ -695,38 +695,40 @@ private static void formatWithIndentation(String colName, String colType, String ColumnStatisticsData csd = cso.getStatsData(); if (csd.isSetBinaryStats()) { BinaryColumnStatsData bcsd = csd.getBinaryStats(); - appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", bcsd.getAvgColLen(), + appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", "", bcsd.getAvgColLen(), bcsd.getMaxColLen(), "", ""); } else if (csd.isSetStringStats()) { StringColumnStatsData scsd = csd.getStringStats(); appendColumnStats(tableInfo, "", "", scsd.getNumNulls(), scsd.getNumDVs(), - scsd.getAvgColLen(), scsd.getMaxColLen(), "", ""); + scsd.getBitVectors() == null ? "" : scsd.getBitVectors(), scsd.getAvgColLen(), + scsd.getMaxColLen(), "", ""); } else if (csd.isSetBooleanStats()) { BooleanColumnStatsData bcsd = csd.getBooleanStats(); - appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", "", "", + appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", "", "", "", bcsd.getNumTrues(), bcsd.getNumFalses()); } else if (csd.isSetDecimalStats()) { DecimalColumnStatsData dcsd = csd.getDecimalStats(); appendColumnStats(tableInfo, convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), dcsd.getNumNulls(), dcsd.getNumDVs(), + dcsd.getBitVectors() == null ? "" : dcsd.getBitVectors(), "", "", "", ""); } else if (csd.isSetDoubleStats()) { DoubleColumnStatsData dcsd = csd.getDoubleStats(); appendColumnStats(tableInfo, dcsd.getLowValue(), dcsd.getHighValue(), dcsd.getNumNulls(), - dcsd.getNumDVs(), "", "", "", ""); + dcsd.getNumDVs(), dcsd.getBitVectors() == null ? "" : dcsd.getBitVectors(), "", "", "", ""); } else if (csd.isSetLongStats()) { LongColumnStatsData lcsd = csd.getLongStats(); appendColumnStats(tableInfo, lcsd.getLowValue(), lcsd.getHighValue(), lcsd.getNumNulls(), - lcsd.getNumDVs(), "", "", "", ""); + lcsd.getNumDVs(), lcsd.getBitVectors() == null ? "" : lcsd.getBitVectors(), "", "", "", ""); } else if (csd.isSetDateStats()) { DateColumnStatsData dcsd = csd.getDateStats(); appendColumnStats(tableInfo, convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), - dcsd.getNumNulls(), dcsd.getNumDVs(), "", "", "", ""); + dcsd.getNumNulls(), dcsd.getNumDVs(), dcsd.getBitVectors() == null ? 
"" : dcsd.getBitVectors(), "", "", "", ""); } } else { - appendColumnStats(tableInfo, "", "", "", "", "", "", "", ""); + appendColumnStats(tableInfo, "", "", "", "", "", "", "", "", ""); } } @@ -779,11 +781,12 @@ private static void printPadding(StringBuilder tableInfo, int[] columnWidths) { } private static void appendColumnStats(StringBuilder sb, Object min, Object max, Object numNulls, - Object ndv, Object avgColLen, Object maxColLen, Object numTrues, Object numFalses) { + Object ndv, Object bitVector, Object avgColLen, Object maxColLen, Object numTrues, Object numFalses) { sb.append(String.format("%-" + ALIGNMENT + "s", min)).append(FIELD_DELIM); sb.append(String.format("%-" + ALIGNMENT + "s", max)).append(FIELD_DELIM); sb.append(String.format("%-" + ALIGNMENT + "s", numNulls)).append(FIELD_DELIM); sb.append(String.format("%-" + ALIGNMENT + "s", ndv)).append(FIELD_DELIM); + sb.append(String.format("%-" + ALIGNMENT + "s", bitVector)).append(FIELD_DELIM); sb.append(String.format("%-" + ALIGNMENT + "s", avgColLen)).append(FIELD_DELIM); sb.append(String.format("%-" + ALIGNMENT + "s", maxColLen)).append(FIELD_DELIM); sb.append(String.format("%-" + ALIGNMENT + "s", numTrues)).append(FIELD_DELIM); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java index 41a1c7a582..f2d2e2dc0b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java @@ -18,9 +18,6 @@ package org.apache.hadoop.hive.ql.plan; -import org.apache.hadoop.hive.ql.stats.StatsUtils; - - public class ColStatistics { private String colName; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/DescTableDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/DescTableDesc.java index d7a9888389..c413d16126 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/DescTableDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/DescTableDesc.java @@ -59,8 +59,8 @@ public void setPartSpec(Map partSpec) { */ private static final String schema = "col_name,data_type,comment#string:string:string"; private static final String colStatsSchema = "col_name,data_type,min,max,num_nulls," - + "distinct_count,avg_col_len,max_col_len,num_trues,num_falses,comment" - + "#string:string:string:string:string:string:string:string:string:string:string"; + + "distinct_count,bitVector,avg_col_len,max_col_len,num_trues,num_falses,comment" + + "#string:string:string:string:string:string:string:string:string:string:string:string"; public DescTableDesc() { } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java index 2d56950cb1..8ee41bfab2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java @@ -23,9 +23,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.ndv.FMSketch; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; +import org.apache.hadoop.hive.common.ndv.fm.FMSketch; import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.Description; diff --git 
a/ql/src/test/queries/clientpositive/bitvector.q b/ql/src/test/queries/clientpositive/bitvector.q new file mode 100644 index 0000000000..d8669f254b --- /dev/null +++ b/ql/src/test/queries/clientpositive/bitvector.q @@ -0,0 +1,3 @@ +set hive.mapred.mode=nonstrict; + +desc formatted src key; diff --git a/ql/src/test/queries/clientpositive/fm-sketch.q b/ql/src/test/queries/clientpositive/fm-sketch.q new file mode 100644 index 0000000000..6a65442076 --- /dev/null +++ b/ql/src/test/queries/clientpositive/fm-sketch.q @@ -0,0 +1,58 @@ +set hive.mapred.mode=nonstrict; +set hive.stats.ndv.algo=fm; + +create table n(key int); + +insert overwrite table n select null from src; + +explain analyze table n compute statistics for columns; + +analyze table n compute statistics for columns; + +desc formatted n key; + + +create table i(key int); + +insert overwrite table i select key from src; + +explain analyze table i compute statistics for columns; + +analyze table i compute statistics for columns; + +desc formatted i key; + +drop table i; + +create table i(key double); + +insert overwrite table i select key from src; + +analyze table i compute statistics for columns; + +desc formatted i key; + +drop table i; + +create table i(key decimal); + +insert overwrite table i select key from src; + +analyze table i compute statistics for columns; + +desc formatted i key; + +drop table i; + +create table i(key date); + +insert into i values ('2012-08-17'); +insert into i values ('2012-08-17'); +insert into i values ('2013-08-17'); +insert into i values ('2012-03-17'); +insert into i values ('2012-05-17'); + +analyze table i compute statistics for columns; + +desc formatted i key; + diff --git a/ql/src/test/queries/clientpositive/hll.q b/ql/src/test/queries/clientpositive/hll.q index edfdce8a29..91c4e788d3 100644 --- a/ql/src/test/queries/clientpositive/hll.q +++ b/ql/src/test/queries/clientpositive/hll.q @@ -1,5 +1,16 @@ set hive.mapred.mode=nonstrict; +create table n(key int); + +insert overwrite table n select null from src; + +explain analyze table n compute statistics for columns; + +analyze table n compute statistics for columns; + +desc formatted n key; + + create table i(key int); insert overwrite table i select key from src; diff --git a/ql/src/test/results/clientpositive/bitvector.q.out b/ql/src/test/results/clientpositive/bitvector.q.out new file mode 100644 index 0000000000..31d6a5afc9 --- /dev/null +++ b/ql/src/test/results/clientpositive/bitvector.q.out @@ -0,0 +1,31 @@ +PREHOOK: query: desc formatted src key +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@src +POSTHOOK: query: desc formatted src key +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@src +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key string 0 309 SExM4LUCtQLBoliC580Fv5Nq/8NRwJtoxdPYB/yjmAO/kokBgcjcBMHblgH/lecCgMn9AcC78gm/ +wzeA/BHB6MgH/5KjAsKL1QqAge0D/obCAoCvMMGIDoDPDr/ovgSAn6ALgL7TEMGr8wX/6xnAuqgF +wcgagNoPwY+GAf7ouATEjJwBvIfKAoGetgS/ysoIwN4dwMHtA8DAnQmB7TWBspEFv+7uBMGB4wL+ +mmvAqtEBgrrQAYGlkQb9j1CBuYcE/5VDgNciweGkA4Hf9wS+ws4BgPdxgJr/BcCctQOB2rQD/+LE +CMDf/QHAklbAxaoBgP7gA4Gu1weBx4YB/remBMHjToDh2wH/+sEHwMqxA8DvyATBrUj/7ecCgP0H +ifmTBIC8FriHfv/5pArAlSzAnIoBwJ/bAsXdU7v+0g7ArvQBgOX+AoD5+hPA/4oHgK3rAcCWB4D+ +iwSA/soDgu71Ab68MYD/gQKAh+ECwN6xBMCTYoSh4AO9nESArIcGwYmeA/7TpAWBncQDwthzvrb9 +BIG2hQG+lbgIgan2DP/2R4Cu1giAlJcDg7UU/ZBfgchL/4O8AsDO8gKBodUGwMTcA8OKggP88sYD +w9tCvr+CA7/W3QfE05wB/dWUCcCa2QmBsY4HgrD3BbzCwguAih3B67gD/9zeAYX0twf7048HwLKS 
+CICJC8GyrgGB0akBvr2mAcDP8QGA0/ECwMScA4GXrAGBpokFgKgKwNSHBP7sMIDfkAbBpzy/pdcK +wLnfAYLeKP6R5wLCssoBv8JY/7IvwNEMgq+7AsDQH/6VwAGA8nXCisoBvvevAoH4PMKOqAO+lnj/ +yjnCwzL/7IADv/8jhK28Brzx1wGCmk6/0t0CgIyaAv/CnwXAzZ0BwJadA4GCKcC53AG/sUCBhs8J +gZmRAb6zwwSBgLYGweuIAb+tbsH6gwPAngH+ysQBhLxh/NihAsGYswS/l8MKgtFX/u6jA8XB6AL8 +3tAC/5TDBMHMvgG/0NICwO79BYS82Qa8oMQBwPqBBcHhI//N2RLAitsEgJnuBcD+qAXA/t0FwJ32 +CoDBrAHDlSD9ltIDwMieBIC26QLBzZkGgbecAYCvCf+Xgwn/40+JgRX3xNcBwIkLweaQAYDuggKB +2okBwPESvtvPBsHxpQL/ucsCwIeJA8CjBoH6SP+UnwKCw8IGv7mcBYHtywKChYICvKDpAsDx5gHC ++MIF/sDeBYCh1ALA+poCwYygCL+TTITO3AK9weMEwf+fAb/V5AKApKMGg8Fg/J9OwfnNCv/pkgLC +z+gEvt+XA4CqlgGAlOQDgIaICYGmzAGCiYUHv96HBv/njQHEzfsEv9LuB7zk1gPD4RiDo/UDu5qr +A//IIcDDHsCwDYDozwLEq9EBvZp1gLwawJaGAv/JwAPC/JcDvqvcAcG1ugODzpMBvZfmAoDWB//s +1wKHzaYCucW2BcL9uQT/+aMB/+WtAsKPN7+sdsTghAG7t6kEwNw+wKLUAsCDIIHcjwGAqpACgM36 +BcC//AOAtLEEgMbwAf+mwQiAqfgH + 2.812 3 from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} diff --git a/ql/src/test/results/clientpositive/extrapolate_part_stats_full.q.out b/ql/src/test/results/clientpositive/extrapolate_part_stats_full.q.out index b212da907b..eff8774841 100644 --- a/ql/src/test/results/clientpositive/extrapolate_part_stats_full.q.out +++ b/ql/src/test/results/clientpositive/extrapolate_part_stats_full.q.out @@ -81,9 +81,10 @@ PREHOOK: Input: default@loc_orc_1d POSTHOOK: query: describe formatted loc_orc_1d PARTITION(year='2001') state POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@loc_orc_1d -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -state string 0 3 0.75 2 from deserializer +# col_name data_type min max num_nulls distinct_count bitVector avg_col_len max_col_len num_trues num_falses comment + +state string 0 3 SExM4AMDgaTbFcD8mOYCwMOJoQQ= + 0.75 2 from deserializer PREHOOK: query: explain extended select state from loc_orc_1d PREHOOK: type: QUERY POSTHOOK: query: explain extended select state from loc_orc_1d diff --git a/ql/src/test/results/clientpositive/extrapolate_part_stats_partial.q.out b/ql/src/test/results/clientpositive/extrapolate_part_stats_partial.q.out index b5f4feede0..48ee0759b8 100644 --- a/ql/src/test/results/clientpositive/extrapolate_part_stats_partial.q.out +++ b/ql/src/test/results/clientpositive/extrapolate_part_stats_partial.q.out @@ -89,18 +89,20 @@ PREHOOK: Input: default@loc_orc_1d POSTHOOK: query: describe formatted loc_orc_1d PARTITION(year='2001') state POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@loc_orc_1d -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -state string 0 3 0.75 2 from deserializer +# col_name data_type min max num_nulls distinct_count bitVector avg_col_len max_col_len num_trues num_falses comment + +state string 0 3 SExM4AMDgaTbFcD8mOYCwMOJoQQ= + 0.75 2 from deserializer PREHOOK: query: describe formatted loc_orc_1d PARTITION(year='2002') state PREHOOK: type: DESCTABLE PREHOOK: Input: default@loc_orc_1d POSTHOOK: query: describe formatted loc_orc_1d PARTITION(year='2002') state POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@loc_orc_1d -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -state string 0 6 3.0 3 from deserializer +# col_name data_type min max num_nulls distinct_count bitVector avg_col_len max_col_len num_trues num_falses comment + +state string 0 6 SExM4AYGhJ2RPL68foHA90C/kJJjgJX39QKAwfg7 + 3.0 3 from 
deserializer PREHOOK: query: explain extended select state from loc_orc_1d PREHOOK: type: QUERY POSTHOOK: query: explain extended select state from loc_orc_1d @@ -296,12 +298,12 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc_1d - Statistics: Num rows: 20 Data size: 1780 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 20 Data size: 1700 Basic stats: COMPLETE Column stats: PARTIAL GatherStats: false Select Operator expressions: state (type: string) outputColumnNames: _col0 - Statistics: Num rows: 20 Data size: 1780 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 20 Data size: 1700 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: explain extended select state,locid from loc_orc_1d @@ -499,12 +501,12 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc_1d - Statistics: Num rows: 20 Data size: 1860 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 20 Data size: 1780 Basic stats: COMPLETE Column stats: PARTIAL GatherStats: false Select Operator expressions: state (type: string), locid (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 20 Data size: 1860 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 20 Data size: 1780 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: analyze table loc_orc_1d partition(year='2000') compute statistics for columns state diff --git a/ql/src/test/results/clientpositive/fm-sketch.q.out b/ql/src/test/results/clientpositive/fm-sketch.q.out new file mode 100644 index 0000000000..b859abe44e --- /dev/null +++ b/ql/src/test/results/clientpositive/fm-sketch.q.out @@ -0,0 +1,333 @@ +PREHOOK: query: create table n(key int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@n +POSTHOOK: query: create table n(key int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@n +PREHOOK: query: insert overwrite table n select null from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@n +POSTHOOK: query: insert overwrite table n select null from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@n +POSTHOOK: Lineage: n.key EXPRESSION [] +PREHOOK: query: explain analyze table n compute statistics for columns +PREHOOK: type: QUERY +POSTHOOK: query: explain analyze table n compute statistics for columns +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-0 + Map Reduce + Map Operator Tree: + TableScan + alias: n + Statistics: Num rows: 500 Data size: 1000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 1000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: compute_stats(key, 'fm', 16) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: compute_stats(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 480 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 480 Basic stats: COMPLETE Column 
stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-1 + Column Stats Work + Column Stats Desc: + Columns: key + Column Types: int + Table: default.n + +PREHOOK: query: analyze table n compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@n +#### A masked pattern was here #### +POSTHOOK: query: analyze table n compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +#### A masked pattern was here #### +PREHOOK: query: desc formatted n key +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@n +POSTHOOK: query: desc formatted n key +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@n +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 0 0 500 1 Rk0QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAA= + from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} +PREHOOK: query: create table i(key int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@i +POSTHOOK: query: create table i(key int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@i +PREHOOK: query: insert overwrite table i select key from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@i +POSTHOOK: query: insert overwrite table i select key from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: explain analyze table i compute statistics for columns +PREHOOK: type: QUERY +POSTHOOK: query: explain analyze table i compute statistics for columns +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-0 + Map Reduce + Map Operator Tree: + TableScan + alias: i + Statistics: Num rows: 500 Data size: 1406 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 1406 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: compute_stats(key, 'fm', 16) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: compute_stats(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 480 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 480 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-1 + Column Stats Work + Column Stats Desc: + Columns: key + Column Types: int + Table: default.i + +PREHOOK: query: analyze table i compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@i +#### 
A masked pattern was here #### +POSTHOOK: query: analyze table i compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@i +#### A masked pattern was here #### +PREHOOK: query: desc formatted i key +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@i +POSTHOOK: query: desc formatted i key +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@i +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 0 498 0 196 Rk0QAP8YAAB/AAAA/woAAP8AAAC/AQAA/wEAAH8BAAD/AgAAfwAAAPsLAAB/AgAA/wgAAH9DAAA/ +AAAA/xQAAP8DAAA= + from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} +PREHOOK: query: drop table i +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@i +PREHOOK: Output: default@i +POSTHOOK: query: drop table i +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@i +POSTHOOK: Output: default@i +PREHOOK: query: create table i(key double) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@i +POSTHOOK: query: create table i(key double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@i +PREHOOK: query: insert overwrite table i select key from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@i +POSTHOOK: query: insert overwrite table i select key from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: analyze table i compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@i +#### A masked pattern was here #### +POSTHOOK: query: analyze table i compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@i +#### A masked pattern was here #### +PREHOOK: query: desc formatted i key +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@i +POSTHOOK: query: desc formatted i key +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@i +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key double 0.0 498.0 0 234 Rk0QAP8AAAD/AQAA/wAAAJ8NAAB/MAAA/xEAAP8CAAD/AgAAfwIAAP8AAAB/EQAA/wAAAP8AAAB/ +AAAA3wEAAP8CAAA= + from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} +PREHOOK: query: drop table i +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@i +PREHOOK: Output: default@i +POSTHOOK: query: drop table i +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@i +POSTHOOK: Output: default@i +PREHOOK: query: create table i(key decimal) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@i +POSTHOOK: query: create table i(key decimal) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@i +PREHOOK: query: insert overwrite table i select key from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@i +POSTHOOK: query: insert overwrite table i select key from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: analyze table i compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@i +#### A masked pattern was here #### +POSTHOOK: query: analyze table i compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: 
Input: default@i +#### A masked pattern was here #### +PREHOOK: query: desc formatted i key +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@i +POSTHOOK: query: desc formatted i key +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@i +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key decimal(10,0) 0 498 0 180 Rk0QAP8AAAD/AwAA/wUAAP8DAAD/AwAAvwIAAH8eAAC/AQAAPwAAAL8AAAAHAAAAvwAAAP0CAAD/ +AQAA/wMAAH8CAAA= + from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} +PREHOOK: query: drop table i +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@i +PREHOOK: Output: default@i +POSTHOOK: query: drop table i +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@i +POSTHOOK: Output: default@i +PREHOOK: query: create table i(key date) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@i +POSTHOOK: query: create table i(key date) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@i +PREHOOK: query: insert into i values ('2012-08-17') +PREHOOK: type: QUERY +PREHOOK: Output: default@i +POSTHOOK: query: insert into i values ('2012-08-17') +POSTHOOK: type: QUERY +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into i values ('2012-08-17') +PREHOOK: type: QUERY +PREHOOK: Output: default@i +POSTHOOK: query: insert into i values ('2012-08-17') +POSTHOOK: type: QUERY +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into i values ('2013-08-17') +PREHOOK: type: QUERY +PREHOOK: Output: default@i +POSTHOOK: query: insert into i values ('2013-08-17') +POSTHOOK: type: QUERY +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into i values ('2012-03-17') +PREHOOK: type: QUERY +PREHOOK: Output: default@i +POSTHOOK: query: insert into i values ('2012-03-17') +POSTHOOK: type: QUERY +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into i values ('2012-05-17') +PREHOOK: type: QUERY +PREHOOK: Output: default@i +POSTHOOK: query: insert into i values ('2012-05-17') +POSTHOOK: type: QUERY +POSTHOOK: Output: default@i +POSTHOOK: Lineage: i.key EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: analyze table i compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@i +#### A masked pattern was here #### +POSTHOOK: query: analyze table i compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@i +#### A masked pattern was here #### +PREHOOK: query: desc formatted i key +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@i +POSTHOOK: query: desc formatted i key +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@i +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key date 2012-03-17 2013-08-17 0 3 Rk0QAAEAAAAGAAAAAwAAAA0AAAADAAAABwAAAAsAAAAJAAAAEwAAAAkAAAADAAAABwAAAAMAAAAB 
+AAAABAAAAAUAAAA= + from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} diff --git a/ql/src/test/results/clientpositive/hll.q.out b/ql/src/test/results/clientpositive/hll.q.out index b9357c3043..301f1c7296 100644 --- a/ql/src/test/results/clientpositive/hll.q.out +++ b/ql/src/test/results/clientpositive/hll.q.out @@ -1,3 +1,88 @@ +PREHOOK: query: create table n(key int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@n +POSTHOOK: query: create table n(key int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@n +PREHOOK: query: insert overwrite table n select null from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@n +POSTHOOK: query: insert overwrite table n select null from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@n +POSTHOOK: Lineage: n.key EXPRESSION [] +PREHOOK: query: explain analyze table n compute statistics for columns +PREHOOK: type: QUERY +POSTHOOK: query: explain analyze table n compute statistics for columns +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-0 + Map Reduce + Map Operator Tree: + TableScan + alias: n + Statistics: Num rows: 500 Data size: 1000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 1000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: compute_stats(key, 'hll') + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: compute_stats(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 480 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 480 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-1 + Column Stats Work + Column Stats Desc: + Columns: key + Column Types: int + Table: default.n + +PREHOOK: query: analyze table n compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@n +#### A masked pattern was here #### +POSTHOOK: query: analyze table n compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +#### A masked pattern was here #### +PREHOOK: query: desc formatted n key +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@n +POSTHOOK: query: desc formatted n key +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@n +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 0 0 500 1 SExM4AEA + from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} PREHOOK: query: create table i(key int) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default @@ -80,7 +165,28 @@ POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@i # col_name data_type min max num_nulls 
distinct_count avg_col_len max_col_len num_trues num_falses comment -key int 0 498 0 309 from deserializer +key int 0 498 0 309 SExM4LUCtQKN6yH2ofgFwu2cAvzblwLAqoIDwf6+AcDkmgT/i5kBxOPKA72MowGA5fAbg4LgAr2L +vQH/+x+A0gOA0fsBgefUBb+gkhPB+03A7aUCg+BdwKUTvbaQA4LIeL7O3QTCo6IDvbuRAsCRFoGb +YL/lO4LfuQK+4acBgLePCsHvrQHD43u87s4EwP0QwJJtyL23ArjLvQPCz7wG/5yDC7/N4AKCxMcO +vvCNA4HI4wP/6rEDgIIJwaZOv+cwgaevAf+GzQHA14ICwPz+BcDb+gKAhg+H7RS67okB/5HHAoOo +Nb2V2wWA6fAIwJSODsCT9gGAiY8Bg/xI/bq4CoDXkgHAyvYGgOduwJKLAYKbvgH+2bQBwNCWBoK7 +Gb+fmQO/6J8Bgb89/9fzAsCPywLAp/wHgbJeg/z8Ar25kQGA4P0Dv/OUA4CgoQaBpL4EwDuC+m69 +yDWA/BLApYELxMEv/I1LgYWwBMbTlAP56cMEgZssgeiCAb+kowHBvf0CgYHSAf3g0QaBi9sC/9yi +AoDbIYHllQnAhAGBqJkFvrBKgZmZDIKEogG9slWC7qgF/q5DwM30DoKHRcCN7wO+ir0DwLOtAcDy +8wKB4L0Dv/HEA4adpAOAqxr6kkyA14EIwbkUgIihCIGfcoCODr/z5wKAs/QBw7JvvLnQBMHmsgL/ +1UTAy5gCgbHaAf+UpgOAjO0HwcRQhOePAYChCruLvQaBtSj/osUBwoK1AYGn+Qm9kLcDgLSoAYCQ +2QeAv54FwoIavsJ2wYYL/9jbAoCTjgGBjDX/ztkBgPF8gtNC/r2PAoGgUcHDcb+LqAe/laoBwsOe +A8D6EsDQkAT+0tcGgIRzgIqQAYCT+gXB7wv/jvQMw4miBr3LvgTA0YYBgKCTCIHyxQHBtPcCvts+ +we3HAsD/9gG/zaEDgMiqA8H6iQHAniPCiIQB/bucAYDykQGCodED/o+VAsa89gO6pqAHwKvqAYGu +9QO/0bgPwLiEAcH7lwHA4v4FgMUrwe9k/v9ggaI5wbniAr7lOYP3tAH9vmXBxscCwPDuAYCkFoPc +6QaCoOUH/MSUAr/4gwmAw4wIv/rBAsCH2QGEl1n86qQBgOWcEoLOsgb+k74EhNjFAbyX2QHAi4MB +gJiCAYHyiwnAvYgC/5LkB4HnoQLA46QU/6+SBsGv6QHBut4Evo/iA8KzFL7b0AKAwJkJwZSRAb+g +4gHBux+B/58F/+D2Av/5tgKAmieA4MsBwrvkBMDIBb77GoCqnwjA3PkBgPOTCMD9e8P8tgK91poD +gIGeAcH3nQKAhqIEv6LdA4DK2AKClCm+mc4BxoVo+rCiAoDfoAKAtPoFwdCUAsHtpwH+j8QBwYWl +Ab+00gOAy9gMgfHAA7/hvwTAqCeCsUq/yUj/t9wCxYPOArvNrQTAq5ADwJrZCcKbX764IcHS1QKA +t+kLwtSlC/3wyweAl2bAhKEDwLXQCYDXhQXBpeICgcpm//3nBoDmGMG7lwH/y+YI//XaAYHTlQKA +4gPA7aoC/6mKCIDZpgLDoEQ= + from deserializer COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} PREHOOK: query: drop table i PREHOOK: type: DROPTABLE @@ -123,7 +229,28 @@ POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@i # col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment -key double 0.0 498.0 0 309 from deserializer +key double 0.0 498.0 0 309 SExM4LUCtQLB60iBzkW/t98FgZmoAoClIL/zvgGA98wFwZxDgNEev/fQAoTVmQS8tMUCgIfzAsap +7wL7vdICwMCHAv/a2gGAj+MJwMSpBsHPLcGehw++nboDgY6bAb+Z3Q2CycMBvtO0A8DE9QOCh+sE +v4/3AoHde7++1QKB1ZUFvvHBAYCumwGAm4sBwOjEAoD4LsDB1APA0rEEwo6RAb665QaCysQBvua+ +BMGxF8C4qASAqtwLhZqTAbqmvAHChlWBuNsJ/fzNAoCjiwSA7f0BgMHaBIDtjQ3B9zLCwa0F/8ba +AYPTogK7xaMGgNT7BIOAQr7ZogH//r4Bwb88xcts+7xwv97zAYPPhAG+mJkBgeabAb6jFcDZmgmA +wWiB6NQG/6y2AcDn7QLB2OMBv7PADMK6K7+obMCrqwrBnPUE/qNQwaacDsCm9gG/wdUCwauuAf/I +yQGCgtkB/r+qAcAgwNvzD8C0GoTuwAS87xqAx7wJwc+QAYDolQL/164BwJ+VA8K4Hf/DjwG//ecB +wNTFEMCugwLA3CyBopEBv4fdAoOhSb6DjQPDjm79gI0B//ixBoCFA4HvpwGAx4sC/7KuD4DKGoPk +qwH9g8YBwI95wMCYAsD0lAHAhViGgt0F+7hQv4bMAsCKEICDmQHE06oD/J1FgJBfgOapA8PJlQf9 +itwEwM2GC4P8pgi997EDgP2aAsCzhgTAj5ADw5HBA77hdb/IL8LNxQT+zzOA4aQBgf+gBf/SWMDl +iAWF/r4BgPwD/KhS/5M+wIP+AcBkgPCBA4CxjAGAwgHEgFz/9vYLvZuZA8CHqgPDjJwBvddOwJua +BMCpUYKEvgS+pVaA9PoEg6osvZ1Gga/IAv/9wQLA94EDwOwigO1tgfadAcO4f7yA/ATAq94BwP2X +A4CahgLA64ECwOzUCoPq5wP9s5cBwNkJgISgAcCN7gLB0bMFgOyLCMDTlA7Bzm2/rYoBgbv8Av+G +esGE3gKAuSu+8YwFxK+9BICqLv70iAq/z1vB2oQDv790gZOXA8DxhQi+3r0Ewe2+AsGpfL7JtgGB +sdgHgt+IAb3riwKA/xqAx4YBwM6BBMD24QeE/sgEvM3RAsD/4QHA9KUBg9/PBr7xxgaB0aUD//aC +A8D0gxSB19wEvtOyCcDBmQGC9q4BvqHgCYDEbMGnaoHK2QT/j5kDv+w7gutQgP3zC/6+kgSAsh3B +xkC/ybsCgYq4Ab+iS8LN2wK/3dUEgMGICMHQ9wK+ucQCgJvyAofd9Ai5wzbC3LcFwrjwAf78jgq+ +xiPBgzO/0myEya8E/OKkAsHYPcHfqQP/ndwCwNH/BcOngAG8/d8Egd5S/+khgr+zEICIJ4bv0AH4 
+isQCgN6lAsTolwO88EDA56UEwsSgDf7U4gHDpUa9570DweyNAb/LyQfA/PwGga7MA8Db7QGBpYEB +vqNhwNSNBMCL3AHBqzu/gGXAweUCgIqDAoCBdYLHyAbAaL/rgAWA9e4RgMwTv76yAoDZDcHd1wGA +tucFgd6SE8DhBr+JUQ== + from deserializer COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} PREHOOK: query: drop table i PREHOOK: type: DROPTABLE @@ -166,7 +293,28 @@ POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@i # col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment -key decimal(10,0) 0 498 0 309 from deserializer +key decimal(10,0) 0 498 0 309 SExM4LUCtQLB60iBzkW/t98FgZmoAoClIL/zvgGA98wFwZxDgNEev/fQAoTVmQS8tMUCgIfzAsap +7wL7vdICwMCHAv/a2gGAj+MJwMSpBsHPLcGehw++nboDgY6bAb+Z3Q2CycMBvtO0A8DE9QOCh+sE +v4/3AoHde7++1QKB1ZUFvvHBAYCumwGAm4sBwOjEAoD4LsDB1APA0rEEwo6RAb665QaCysQBvua+ +BMGxF8C4qASAqtwLhZqTAbqmvAHChlWBuNsJ/fzNAoCjiwSA7f0BgMHaBIDtjQ3B9zLCwa0F/8ba +AYPTogK7xaMGgNT7BIOAQr7ZogH//r4Bwb88xcts+7xwv97zAYPPhAG+mJkBgeabAb6jFcDZmgmA +wWiB6NQG/6y2AcDn7QLB2OMBv7PADMK6K7+obMCrqwrBnPUE/qNQwaacDsCm9gG/wdUCwauuAf/I +yQGCgtkB/r+qAcAgwNvzD8C0GoTuwAS87xqAx7wJwc+QAYDolQL/164BwJ+VA8K4Hf/DjwG//ecB +wNTFEMCugwLA3CyBopEBv4fdAoOhSb6DjQPDjm79gI0B//ixBoCFA4HvpwGAx4sC/7KuD4DKGoPk +qwH9g8YBwI95wMCYAsD0lAHAhViGgt0F+7hQv4bMAsCKEICDmQHE06oD/J1FgJBfgOapA8PJlQf9 +itwEwM2GC4P8pgi997EDgP2aAsCzhgTAj5ADw5HBA77hdb/IL8LNxQT+zzOA4aQBgf+gBf/SWMDl +iAWF/r4BgPwD/KhS/5M+wIP+AcBkgPCBA4CxjAGAwgHEgFz/9vYLvZuZA8CHqgPDjJwBvddOwJua +BMCpUYKEvgS+pVaA9PoEg6osvZ1Gga/IAv/9wQLA94EDwOwigO1tgfadAcO4f7yA/ATAq94BwP2X +A4CahgLA64ECwOzUCoPq5wP9s5cBwNkJgISgAcCN7gLB0bMFgOyLCMDTlA7Bzm2/rYoBgbv8Av+G +esGE3gKAuSu+8YwFxK+9BICqLv70iAq/z1vB2oQDv790gZOXA8DxhQi+3r0Ewe2+AsGpfL7JtgGB +sdgHgt+IAb3riwKA/xqAx4YBwM6BBMD24QeE/sgEvM3RAsD/4QHA9KUBg9/PBr7xxgaB0aUD//aC +A8D0gxSB19wEvtOyCcDBmQGC9q4BvqHgCYDEbMGnaoHK2QT/j5kDv+w7gutQgP3zC/6+kgSAsh3B +xkC/ybsCgYq4Ab+iS8LN2wK/3dUEgMGICMHQ9wK+ucQCgJvyAofd9Ai5wzbC3LcFwrjwAf78jgq+ +xiPBgzO/0myEya8E/OKkAsHYPcHfqQP/ndwCwNH/BcOngAG8/d8Egd5S/+khgr+zEICIJ4bv0AH4 +isQCgN6lAsTolwO88EDA56UEwsSgDf7U4gHDpUa9570DweyNAb/LyQfA/PwGga7MA8Db7QGBpYEB +vqNhwNSNBMCL3AHBqzu/gGXAweUCgIqDAoCBdYLHyAbAaL/rgAWA9e4RgMwTv76yAoDZDcHd1wGA +tucFgd6SE8DhBr+JUQ== + from deserializer COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} PREHOOK: query: drop table i PREHOOK: type: DROPTABLE @@ -235,5 +383,6 @@ POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@i # col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment -key date 2012-03-17 2013-08-17 0 4 from deserializer +key date 2012-03-17 2013-08-17 0 4 SExM4AQEgZ3gM4Gdw13A3/qtA4L855QD + from deserializer COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}}
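
The test and model changes above all exercise the same pattern: build a HyperLogLog sketch, serialize it to a string, and carry that string through the new bitVector parameter on the column-statistics model setters so it can later be displayed or re-merged. Below is a minimal illustrative sketch of that flow, not part of the patch, using only the HyperLogLog and MTableColumnStatistics APIs visible in this diff; it assumes MTableColumnStatistics has a public no-arg constructor, and the NDV value of 3 passed to setLongStats is hardcoded purely for illustration.

import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog;
import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;

public class BitVectorRoundTrip {
  public static void main(String[] args) {
    // Build an HLL sketch over three distinct longs, as the updated tests do,
    // then serialize it to the string form carried in the new bitVector field.
    HyperLogLog hll = HyperLogLog.builder().build();
    hll.addLong(1);
    hll.addLong(2);
    hll.addLong(3);
    String bitVector = hll.serialize();

    // The long-column setter now takes the serialized sketch alongside the
    // null count, NDV and low/high values (NDV is hardcoded to 3 here).
    MTableColumnStatistics stats = new MTableColumnStatistics();
    stats.setLongStats(0L, 3L, bitVector, 1L, 3L);

    // The stored sketch can be read back, e.g. for the new bitVector column
    // shown by DESC FORMATTED in the .q.out files above.
    System.out.println(stats.getBitVector());
  }
}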