diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 2b8280e..87ee45a 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -167,6 +167,7 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { HiveConf.ConfVars.HIVE_TXN_MANAGER, HiveConf.ConfVars.HIVE_TXN_TIMEOUT, HiveConf.ConfVars.HIVE_TXN_MAX_OPEN_BATCH, + HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION }; /** @@ -1259,6 +1260,8 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0, "Standard error expressed in percentage. Provides a tradeoff between accuracy and compute cost. \n" + "A lower value for error indicates higher accuracy and a higher compute cost."), + HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION("hive.metastore.stats.ndv.densityfunction", false, + "Whether to use density function to estimate the NDV for the whole table based on the NDV of partitions"), HIVE_STATS_KEY_PREFIX_MAX_LENGTH("hive.stats.key.prefix.max.length", 150, "Determines if when the prefix of the key used for intermediate stats collection\n" + "exceeds a certain length, a hash of the key is used instead. If the value < 0 then hashing"), diff --git a/data/files/extrapolate_stats_partial_ndv.txt b/data/files/extrapolate_stats_partial_ndv.txt new file mode 100644 index 0000000..9af6d30 --- /dev/null +++ b/data/files/extrapolate_stats_partial_ndv.txt @@ -0,0 +1,20 @@ +|1|1.0E3|94087|2000 +O|2|1.01E3|94086|2000 +|1|0.01E3|94087|2001 +H|2|2.0E3|94086|2001 +|3|1.0E3|94086|2001 +OH|4|1.01E3|43201|2001 +oh1|1|1.0E2|94087|2002 +OH2|2|9.0E2|43201|2002 +oh3|3|1.0E2|94087|2002 +OH4|4|9.1E2|94086|2002 +oh5|4|9.0E2|43201|2002 +OH6|5|0.01E3|94087|2002 +|31|1.0E3|94087|2003 +OH33|1|1.01E3|43201|2003 +|3|2.0E3|94087|2003 +OH|1|1.0E3|94086|2003 +|4|2.0E3|43201|2003 +OH|1|1.0E3|94087|2003 +|1|2.0E3|43201|2003 +OH|5|1.0E3|94086|2003 diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java b/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java index 74f1b01..4859cff 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java @@ -25,23 +25,33 @@ /** * The sequence of colStatNames. */ - static String[] colStatNames = new String[] { "LONG_LOW_VALUE", - "LONG_HIGH_VALUE", "DOUBLE_LOW_VALUE", "DOUBLE_HIGH_VALUE", - "BIG_DECIMAL_LOW_VALUE", "BIG_DECIMAL_HIGH_VALUE", "NUM_NULLS", - "NUM_DISTINCTS", "AVG_COL_LEN", "MAX_COL_LEN", "NUM_TRUES", "NUM_FALSES" }; - + static String[] colStatNames = new String[] { "LONG_LOW_VALUE", "LONG_HIGH_VALUE", + "DOUBLE_LOW_VALUE", "DOUBLE_HIGH_VALUE", "BIG_DECIMAL_LOW_VALUE", "BIG_DECIMAL_HIGH_VALUE", + "NUM_NULLS", "NUM_DISTINCTS", "AVG_COL_LEN", "MAX_COL_LEN", "NUM_TRUES", "NUM_FALSES", + "AVG_NDV_LONG", "AVG_NDV_DOUBLE", "AVG_NDV_DECIMAL", "SUM_NUM_DISTINCTS" }; + /** * The indexes for colstats. 
*/ - static HashMap indexMaps = new HashMap(){{ - put("long", new Integer [] {0,1,6,7}); - put("double", new Integer [] {2,3,6,7}); - put("string", new Integer [] {8,9,6,7}); - put("boolean", new Integer [] {10,11,6}); - put("binary", new Integer [] {8,9,6}); - put("decimal", new Integer [] {4,5,6,7}); - put("default", new Integer [] {0,1,2,3,4,5,6,7,8,9,10,11}); -}}; + static HashMap indexMaps = new HashMap() { + { + put("bigint", new Integer[] { 0, 1, 6, 7, 12, 15 }); + put("int", new Integer[] { 0, 1, 6, 7, 12, 15 }); + put("smallint", new Integer[] { 0, 1, 6, 7, 12, 15 }); + put("tinyint", new Integer[] { 0, 1, 6, 7, 12, 15 }); + put("timestamp", new Integer[] { 0, 1, 6, 7, 12, 15 }); + put("long", new Integer[] { 0, 1, 6, 7, 12, 15 }); + put("double", new Integer[] { 2, 3, 6, 7, 13, 15 }); + put("float", new Integer[] { 2, 3, 6, 7, 13, 15 }); + put("varchar", new Integer[] { 8, 9, 6, 7, 15 }); + put("char", new Integer[] { 8, 9, 6, 7, 15 }); + put("string", new Integer[] { 8, 9, 6, 7, 15 }); + put("boolean", new Integer[] { 10, 11, 6, 15 }); + put("binary", new Integer[] { 8, 9, 6, 15 }); + put("decimal", new Integer[] { 4, 5, 6, 7, 14, 15 }); + put("default", new Integer[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15 }); + } + }; /** * The sequence of colStatTypes. @@ -50,23 +60,24 @@ Long, Double, Decimal } - static ColStatType[] colStatTypes = new ColStatType[] { ColStatType.Long, - ColStatType.Long, ColStatType.Double, ColStatType.Double, - ColStatType.Decimal, ColStatType.Decimal, ColStatType.Long, - ColStatType.Long, ColStatType.Double, ColStatType.Long, ColStatType.Long, + static ColStatType[] colStatTypes = new ColStatType[] { ColStatType.Long, ColStatType.Long, + ColStatType.Double, ColStatType.Double, ColStatType.Decimal, ColStatType.Decimal, + ColStatType.Long, ColStatType.Long, ColStatType.Double, ColStatType.Long, ColStatType.Long, + ColStatType.Long, ColStatType.Double, ColStatType.Double, ColStatType.Double, ColStatType.Long }; /** * The sequence of aggregation function on colStats. 
*/ static enum AggrType { - Min, Max, Sum + Min, Max, Sum, Avg } - static AggrType[] aggrTypes = new AggrType[] { AggrType.Min, AggrType.Max, - AggrType.Min, AggrType.Max, AggrType.Min, AggrType.Max, AggrType.Sum, - AggrType.Max, AggrType.Max, AggrType.Max, AggrType.Sum, AggrType.Sum }; - + static AggrType[] aggrTypes = new AggrType[] { AggrType.Min, AggrType.Max, AggrType.Min, + AggrType.Max, AggrType.Min, AggrType.Max, AggrType.Sum, AggrType.Max, AggrType.Max, + AggrType.Max, AggrType.Sum, AggrType.Sum, AggrType.Avg, AggrType.Avg, AggrType.Avg, + AggrType.Sum }; + public Object extrapolate(Object[] min, Object[] max, int colStatIndex, Map indexMap); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/LinearExtrapolatePartStatus.java b/metastore/src/java/org/apache/hadoop/hive/metastore/LinearExtrapolatePartStatus.java index 7fc04f1..f4e5ef7 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/LinearExtrapolatePartStatus.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/LinearExtrapolatePartStatus.java @@ -19,11 +19,8 @@ package org.apache.hadoop.hive.metastore; import java.math.BigDecimal; -import java.nio.ByteBuffer; import java.util.Map; -import org.apache.hadoop.hive.metastore.api.Decimal; - public class LinearExtrapolatePartStatus implements IExtrapolatePartStatus { @Override @@ -35,6 +32,15 @@ public Object extrapolate(Object[] min, Object[] max, int colStatIndex, if (minInd == maxInd) { return min[0]; } + //note that recent metastore stores decimal in string. + double decimalmin= 0; + double decimalmax = 0; + if (colStatTypes[colStatIndex] == ColStatType.Decimal) { + BigDecimal bdmin = new BigDecimal(min[0].toString()); + decimalmin = bdmin.doubleValue(); + BigDecimal bdmax = new BigDecimal(max[0].toString()); + decimalmax = bdmax.doubleValue(); + } if (aggrTypes[colStatIndex] == AggrType.Max) { if (minInd < maxInd) { // right border is the max @@ -45,15 +51,9 @@ public Object extrapolate(Object[] min, Object[] max, int colStatIndex, return (Double) ((Double) min[0] + (((Double) max[0] - (Double) min[0]) * (rightBorderInd - minInd) / (maxInd - minInd))); } else { - Decimal dmax = (Decimal) max[0]; - BigDecimal bdmax = new BigDecimal(dmax.toString()); - double doublemax = bdmax.doubleValue(); - Decimal dmin = (Decimal) min[0]; - BigDecimal bdmin = new BigDecimal(dmin.toString()); - double doublemin = bdmin.doubleValue(); - double ret = doublemin + (doublemax - doublemin) + double ret = decimalmin + (decimalmax - decimalmin) * (rightBorderInd - minInd) / (maxInd - minInd); - return createThriftDecimal(String.valueOf(ret)); + return String.valueOf(ret); } } else { // left border is the max @@ -62,17 +62,11 @@ public Object extrapolate(Object[] min, Object[] max, int colStatIndex, * minInd / (minInd - maxInd)); } else if (colStatTypes[colStatIndex] == ColStatType.Double) { return (Double) ((Double) min[0] + ((Double) max[0] - (Double) min[0]) - * minInd / (maxInd - minInd)); + * minInd / (minInd - maxInd)); } else { - Decimal dmax = (Decimal) max[0]; - BigDecimal bdmax = new BigDecimal(dmax.toString()); - double doublemax = bdmax.doubleValue(); - Decimal dmin = (Decimal) min[0]; - BigDecimal bdmin = new BigDecimal(dmin.toString()); - double doublemin = bdmin.doubleValue(); - double ret = doublemin + (doublemax - doublemin) * minInd - / (maxInd - minInd); - return createThriftDecimal(String.valueOf(ret)); + double ret = decimalmin + (decimalmax - decimalmin) * minInd + / (minInd - maxInd); + return String.valueOf(ret); } } } else { @@ -87,16 +81,9 
@@ public Object extrapolate(Object[] min, Object[] max, int colStatIndex, * maxInd / (maxInd - minInd); return ret; } else { - Decimal dmax = (Decimal) max[0]; - BigDecimal bdmax = new BigDecimal(dmax.toString()); - double doublemax = bdmax.doubleValue(); - Decimal dmin = (Decimal) min[0]; - BigDecimal bdmin = new BigDecimal(dmin.toString()); - double doublemin = bdmin.doubleValue(); - double ret = doublemax - (doublemax - doublemin) * maxInd + double ret = decimalmax - (decimalmax - decimalmin) * maxInd / (maxInd - minInd); - return createThriftDecimal(String.valueOf(ret)); - + return String.valueOf(ret); } } else { // right border is the min @@ -109,24 +96,11 @@ public Object extrapolate(Object[] min, Object[] max, int colStatIndex, * (rightBorderInd - maxInd) / (minInd - maxInd); return ret; } else { - Decimal dmax = (Decimal) max[0]; - BigDecimal bdmax = new BigDecimal(dmax.toString()); - double doublemax = bdmax.doubleValue(); - Decimal dmin = (Decimal) min[0]; - BigDecimal bdmin = new BigDecimal(dmin.toString()); - double doublemin = bdmin.doubleValue(); - double ret = doublemax - (doublemax - doublemin) + double ret = decimalmax - (decimalmax - decimalmin) * (rightBorderInd - maxInd) / (minInd - maxInd); - return createThriftDecimal(String.valueOf(ret)); + return String.valueOf(ret); } } } } - - private static Decimal createThriftDecimal(String s) { - BigDecimal d = new BigDecimal(s); - return new Decimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), - (short) d.scale()); - } - } diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java index ba27f10..bf169c9 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java @@ -803,6 +803,15 @@ private String extractSqlString(Object value) { if (value == null) return null; return value.toString(); } + + static Double extractSqlDouble(Object obj) throws MetaException { + if (obj == null) + return null; + if (!(obj instanceof Number)) { + throw new MetaException("Expected numeric type but got " + obj.getClass().getName()); + } + return ((Number) obj).doubleValue(); + } private static String trimCommaList(StringBuilder sb) { if (sb.length() > 0) { @@ -1082,10 +1091,13 @@ public ColumnStatistics getTableStats( } public AggrStats aggrColStatsForPartitions(String dbName, String tableName, - List partNames, List colNames) throws MetaException { + List partNames, List colNames, boolean useDensityFunctionForNDVEstimation) throws MetaException { long partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames); List stats = columnStatisticsObjForPartitions(dbName, - tableName, partNames, colNames, partsFound); + tableName, partNames, colNames, partsFound, useDensityFunctionForNDVEstimation); + LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " + + Arrays.toString(stats.toArray())); return new AggrStats(stats, partsFound); } @@ -1114,15 +1126,33 @@ private long partsFoundForPartitions(String dbName, String tableName, return partsFound; } - private List columnStatisticsObjForPartitions( - String dbName, String tableName, List partNames, - List colNames, long partsFound) throws MetaException { + private List columnStatisticsObjForPartitions(String dbName, + String tableName, List partNames, List colNames, long 
partsFound, boolean useDensityFunctionForNDVEstimation) + throws MetaException { // TODO: all the extrapolation logic should be moved out of this class, - // only mechanical data retrieval should remain here. + // only mechanical data retrieval should remain here. String commonPrefix = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", " + "min(\"LONG_LOW_VALUE\"), max(\"LONG_HIGH_VALUE\"), min(\"DOUBLE_LOW_VALUE\"), max(\"DOUBLE_HIGH_VALUE\"), " - + "min(\"BIG_DECIMAL_LOW_VALUE\"), max(\"BIG_DECIMAL_HIGH_VALUE\"), sum(\"NUM_NULLS\"), max(\"NUM_DISTINCTS\"), " - + "max(\"AVG_COL_LEN\"), max(\"MAX_COL_LEN\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\") from \"PART_COL_STATS\"" + + "min(cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal)), max(cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)), " + + "sum(\"NUM_NULLS\"), max(\"NUM_DISTINCTS\"), " + + "max(\"AVG_COL_LEN\"), max(\"MAX_COL_LEN\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), " + // The following data is used to compute a partitioned table's NDV based + // on its partitions' NDVs when useDensityFunctionForNDVEstimation = true. Global NDVs cannot be + // derived accurately from partition NDVs, because the domains of column values in two partitions + // can overlap. If there is no overlap, the global NDV is just the sum + // of the partition NDVs (UpperBound). But if there is some overlap, the + // global NDV can lie anywhere between the sum of the partition NDVs (no overlap) + // and the largest partition NDV (the case where the column's domain in every other + // partition is a subset of its domain in that one partition) + // (LowerBound). But under a uniform distribution, we can roughly estimate the global + // NDV by leveraging the min/max values. + // We also guarantee that the estimation makes sense by comparing it to the + // UpperBound (calculated by "sum(\"NUM_DISTINCTS\")") + // and the LowerBound (calculated by "max(\"NUM_DISTINCTS\")") + "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal))," + + "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\")," + + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")," + + "sum(\"NUM_DISTINCTS\")" + " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "; String queryText = null; long start = 0; @@ -1134,14 +1164,13 @@ private long partsFoundForPartitions(String dbName, String tableName, // Check if the status of all the columns of all the partitions exists // Extrapolation is not needed. if (partsFound == partNames.size()) { - queryText = commonPrefix - + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")" + queryText = commonPrefix + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")" + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\""; start = doTrace ? System.nanoTime() : 0; query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, prepareParams( - dbName, tableName, partNames, colNames), queryText); + qResult = executeWithArray(query, prepareParams(dbName, tableName, partNames, colNames), + queryText); if (qResult == null) { query.closeAll(); return Lists.newArrayList(); @@ -1149,10 +1178,9 @@ private long partsFoundForPartitions(String dbName, String tableName, end = doTrace ?
System.nanoTime() : 0; timingTrace(doTrace, queryText, start, end); List list = ensureList(qResult); - List colStats = new ArrayList( - list.size()); + List colStats = new ArrayList(list.size()); for (Object[] row : list) { - colStats.add(prepareCSObj(row, 0)); + colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation)); Deadline.checkTimeout(); } query.closeAll(); @@ -1161,18 +1189,16 @@ private long partsFoundForPartitions(String dbName, String tableName, // Extrapolation is needed for some columns. // In this case, at least a column status for a partition is missing. // We need to extrapolate this partition based on the other partitions - List colStats = new ArrayList( - colNames.size()); + List colStats = new ArrayList(colNames.size()); queryText = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", count(\"PARTITION_NAME\") " - + " from \"PART_COL_STATS\"" - + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? " + + " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? " + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")" + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\""; start = doTrace ? System.nanoTime() : 0; query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, prepareParams( - dbName, tableName, partNames, colNames), queryText); + qResult = executeWithArray(query, prepareParams(dbName, tableName, partNames, colNames), + queryText); end = doTrace ? System.nanoTime() : 0; timingTrace(doTrace, queryText, start, end); if (qResult == null) { @@ -1200,21 +1226,20 @@ private long partsFoundForPartitions(String dbName, String tableName, query.closeAll(); // Extrapolation is not needed for columns noExtraColumnNames if (noExtraColumnNames.size() != 0) { - queryText = commonPrefix - + " and \"COLUMN_NAME\" in ("+ makeParams(noExtraColumnNames.size()) + ")" - + " and \"PARTITION_NAME\" in ("+ makeParams(partNames.size()) +")" - + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\""; + queryText = commonPrefix + " and \"COLUMN_NAME\" in (" + + makeParams(noExtraColumnNames.size()) + ")" + " and \"PARTITION_NAME\" in (" + + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\""; start = doTrace ? System.nanoTime() : 0; query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, prepareParams( - dbName, tableName, partNames, noExtraColumnNames), queryText); + qResult = executeWithArray(query, + prepareParams(dbName, tableName, partNames, noExtraColumnNames), queryText); if (qResult == null) { query.closeAll(); return Lists.newArrayList(); } list = ensureList(qResult); for (Object[] row : list) { - colStats.add(prepareCSObj(row, 0)); + colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation)); Deadline.checkTimeout(); } end = doTrace ? System.nanoTime() : 0; @@ -1230,38 +1255,42 @@ private long partsFoundForPartitions(String dbName, String tableName, } // get sum for all columns to reduce the number of queries Map> sumMap = new HashMap>(); - queryText = "select \"COLUMN_NAME\", sum(\"NUM_NULLS\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\")" + queryText = "select \"COLUMN_NAME\", sum(\"NUM_NULLS\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), sum(\"NUM_DISTINCTS\")" + " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? 
" - + " and \"COLUMN_NAME\" in (" +makeParams(extraColumnNameTypeParts.size())+ ")" - + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" + + " and \"COLUMN_NAME\" in (" + + makeParams(extraColumnNameTypeParts.size()) + + ")" + + " and \"PARTITION_NAME\" in (" + + makeParams(partNames.size()) + + ")" + " group by \"COLUMN_NAME\""; start = doTrace ? System.nanoTime() : 0; query = pm.newQuery("javax.jdo.query.SQL", queryText); List extraColumnNames = new ArrayList(); extraColumnNames.addAll(extraColumnNameTypeParts.keySet()); - qResult = executeWithArray(query, prepareParams( - dbName, tableName, partNames, extraColumnNames), queryText); + qResult = executeWithArray(query, + prepareParams(dbName, tableName, partNames, extraColumnNames), queryText); if (qResult == null) { query.closeAll(); return Lists.newArrayList(); } list = ensureList(qResult); // see the indexes for colstats in IExtrapolatePartStatus - Integer[] sumIndex = new Integer[] { 6, 10, 11 }; + Integer[] sumIndex = new Integer[] { 6, 10, 11, 15 }; for (Object[] row : list) { Map indexToObject = new HashMap(); for (int ind = 1; ind < row.length; ind++) { indexToObject.put(sumIndex[ind - 1], row[ind]); } + // row[0] is the column name sumMap.put((String) row[0], indexToObject); Deadline.checkTimeout(); } end = doTrace ? System.nanoTime() : 0; timingTrace(doTrace, queryText, start, end); query.closeAll(); - for (Map.Entry entry : extraColumnNameTypeParts - .entrySet()) { + for (Map.Entry entry : extraColumnNameTypeParts.entrySet()) { Object[] row = new Object[IExtrapolatePartStatus.colStatNames.length + 2]; String colName = entry.getKey(); String colType = entry.getValue()[0]; @@ -1270,12 +1299,20 @@ private long partsFoundForPartitions(String dbName, String tableName, row[0] = colName; // fill in coltype row[1] = colType; - // use linear extrapolation. more complicated one can be added in the future. + // use linear extrapolation. more complicated one can be added in the + // future. IExtrapolatePartStatus extrapolateMethod = new LinearExtrapolatePartStatus(); // fill in colstatus - Integer[] index = IExtrapolatePartStatus.indexMaps.get(colType - .toLowerCase()); - //if the colType is not the known type, long, double, etc, then get all index. + Integer[] index = null; + boolean decimal = false; + if (colType.toLowerCase().startsWith("decimal")) { + index = IExtrapolatePartStatus.indexMaps.get("decimal"); + decimal = true; + } else { + index = IExtrapolatePartStatus.indexMaps.get(colType.toLowerCase()); + } + // if the colType is not the known type, long, double, etc, then get + // all index. if (index == null) { index = IExtrapolatePartStatus.indexMaps.get("default"); } @@ -1290,20 +1327,27 @@ private long partsFoundForPartitions(String dbName, String tableName, Long val = extractSqlLong(o); row[2 + colStatIndex] = (Long) (val / sumVal * (partNames.size())); } - } else { + } else if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Min + || IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Max) { // if the aggregation type is min/max, we extrapolate from the // left/right borders - queryText = "select \"" - + colStatName - + "\",\"PARTITION_NAME\" from \"PART_COL_STATS\"" - + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" - + " and \"COLUMN_NAME\" = ?" 
- + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" - + " order by \'" + colStatName + "\'"; + if (!decimal) { + queryText = "select \"" + colStatName + + "\",\"PARTITION_NAME\" from \"PART_COL_STATS\"" + + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" + " and \"COLUMN_NAME\" = ?" + + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" + + " order by \"" + colStatName + "\""; + } else { + queryText = "select \"" + colStatName + + "\",\"PARTITION_NAME\" from \"PART_COL_STATS\"" + + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" + " and \"COLUMN_NAME\" = ?" + + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")" + + " order by cast(\"" + colStatName + "\" as decimal)"; + } start = doTrace ? System.nanoTime() : 0; query = pm.newQuery("javax.jdo.query.SQL", queryText); - qResult = executeWithArray(query, prepareParams( - dbName, tableName, partNames, Arrays.asList(colName)), queryText); + qResult = executeWithArray(query, + prepareParams(dbName, tableName, partNames, Arrays.asList(colName)), queryText); if (qResult == null) { query.closeAll(); return Lists.newArrayList(); @@ -1317,12 +1361,38 @@ private long partsFoundForPartitions(String dbName, String tableName, if (min[0] == null || max[0] == null) { row[2 + colStatIndex] = null; } else { - row[2 + colStatIndex] = extrapolateMethod.extrapolate(min, max, - colStatIndex, indexMap); + row[2 + colStatIndex] = extrapolateMethod.extrapolate(min, max, colStatIndex, + indexMap); + } + } else { + // if the aggregation type is avg, we use the average on the + // existing ones. + queryText = "select " + + "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal))," + + "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\")," + + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")" + + " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" + + " and \"COLUMN_NAME\" = ?" + " and \"PARTITION_NAME\" in (" + + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\""; + start = doTrace ? System.nanoTime() : 0; + query = pm.newQuery("javax.jdo.query.SQL", queryText); + qResult = executeWithArray(query, + prepareParams(dbName, tableName, partNames, Arrays.asList(colName)), queryText); + if (qResult == null) { + query.closeAll(); + return Lists.newArrayList(); } + fqr = (ForwardQueryResult) qResult; + Object[] avg = (Object[]) (fqr.get(0)); + // colStatIndex=12,13,14 respond to "AVG_LONG", "AVG_DOUBLE", + // "AVG_DECIMAL" + row[2 + colStatIndex] = avg[colStatIndex - 12]; + end = doTrace ? 
System.nanoTime() : 0; + timingTrace(doTrace, queryText, start, end); + query.closeAll(); } } - colStats.add(prepareCSObj(row, 0)); + colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation)); Deadline.checkTimeout(); } } @@ -1341,6 +1411,17 @@ private ColumnStatisticsObj prepareCSObj (Object[] row, int i) throws MetaExcept return cso; } + private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i, + boolean useDensityFunctionForNDVEstimation) throws MetaException { + ColumnStatisticsData data = new ColumnStatisticsData(); + ColumnStatisticsObj cso = new ColumnStatisticsObj((String) row[i++], (String) row[i++], data); + Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++], declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++], avglen = row[i++], maxlen = row[i++], trues = row[i++], falses = row[i++], avgLong = row[i++], avgDouble = row[i++], avgDecimal = row[i++], sumDist = row[i++]; + StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data, llow, lhigh, dlow, dhigh, + declow, dechigh, nulls, dist, avglen, maxlen, trues, falses, avgLong, avgDouble, + avgDecimal, sumDist, useDensityFunctionForNDVEstimation); + return cso; + } + private Object[] prepareParams(String dbName, String tableName, List partNames, List colNames) throws MetaException { diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java index 75005aa..a8c02b4 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java @@ -6111,12 +6111,13 @@ protected ColumnStatistics getJdoResult( @Override public AggrStats get_aggr_stats_for(String dbName, String tblName, final List partNames, final List colNames) throws MetaException, NoSuchObjectException { + final boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(getConf(), HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION); return new GetHelper(dbName, tblName, true, false) { @Override protected AggrStats getSqlResult(GetHelper ctx) throws MetaException { return directSql.aggrColStatsForPartitions(dbName, tblName, partNames, - colNames); + colNames, useDensityFunctionForNDVEstimation); } @Override protected AggrStats getJdoResult(GetHelper ctx) diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java index 475883b..86adf16 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java @@ -476,6 +476,133 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData } } + public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data, + Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh, + Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses, + Object avgLong, Object avgDouble, Object avgDecimal, Object sumDist, + boolean useDensityFunctionForNDVEstimation) throws MetaException { + colType = colType.toLowerCase(); + if (colType.equals("boolean")) { + BooleanColumnStatsData boolStats = new BooleanColumnStatsData(); + boolStats.setNumFalses(MetaStoreDirectSql.extractSqlLong(falses)); + boolStats.setNumTrues(MetaStoreDirectSql.extractSqlLong(trues)); + 
boolStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); + data.setBooleanStats(boolStats); + } else if (colType.equals("string") || colType.startsWith("varchar") + || colType.startsWith("char")) { + StringColumnStatsData stringStats = new StringColumnStatsData(); + stringStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); + stringStats.setAvgColLen((Double) avglen); + stringStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen)); + stringStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist)); + data.setStringStats(stringStats); + } else if (colType.equals("binary")) { + BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); + binaryStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); + binaryStats.setAvgColLen((Double) avglen); + binaryStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen)); + data.setBinaryStats(binaryStats); + } else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") + || colType.equals("tinyint") || colType.equals("timestamp")) { + LongColumnStatsData longStats = new LongColumnStatsData(); + longStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); + if (lhigh != null) { + longStats.setHighValue(MetaStoreDirectSql.extractSqlLong(lhigh)); + } + if (llow != null) { + longStats.setLowValue(MetaStoreDirectSql.extractSqlLong(llow)); + } + long lowerBound = MetaStoreDirectSql.extractSqlLong(dist); + long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist); + if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null + && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) { + // We have estimation, lowerbound and higherbound. We use estimation if + // it is between lowerbound and higherbound. + long estimation = MetaStoreDirectSql + .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql + .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong)); + if (estimation < lowerBound) { + longStats.setNumDVs(lowerBound); + } else if (estimation > higherBound) { + longStats.setNumDVs(higherBound); + } else { + longStats.setNumDVs(estimation); + } + } else { + longStats.setNumDVs(lowerBound); + } + data.setLongStats(longStats); + } else if (colType.equals("double") || colType.equals("float")) { + DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); + doubleStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); + if (dhigh != null) { + doubleStats.setHighValue((Double) dhigh); + } + if (dlow != null) { + doubleStats.setLowValue((Double) dlow); + } + long lowerBound = MetaStoreDirectSql.extractSqlLong(dist); + long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist); + if (useDensityFunctionForNDVEstimation && dhigh != null && dlow != null && avgDouble != null + && MetaStoreDirectSql.extractSqlDouble(avgDouble) != 0.0) { + long estimation = MetaStoreDirectSql + .extractSqlLong((MetaStoreDirectSql.extractSqlLong(dhigh) - MetaStoreDirectSql + .extractSqlLong(dlow)) / MetaStoreDirectSql.extractSqlDouble(avgDouble)); + if (estimation < lowerBound) { + doubleStats.setNumDVs(lowerBound); + } else if (estimation > higherBound) { + doubleStats.setNumDVs(higherBound); + } else { + doubleStats.setNumDVs(estimation); + } + } else { + doubleStats.setNumDVs(lowerBound); + } + data.setDoubleStats(doubleStats); + } else if (colType.startsWith("decimal")) { + DecimalColumnStatsData decimalStats = new DecimalColumnStatsData(); + decimalStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls)); + Decimal low = null; + 
Decimal high = null; + BigDecimal blow = null; + BigDecimal bhigh = null; + if (dechigh instanceof BigDecimal) { + bhigh = (BigDecimal) dechigh; + high = new Decimal(ByteBuffer.wrap(bhigh.unscaledValue().toByteArray()), + (short) bhigh.scale()); + } else if (dechigh instanceof String) { + bhigh = new BigDecimal((String) dechigh); + high = createThriftDecimal((String) dechigh); + } + decimalStats.setHighValue(high); + if (declow instanceof BigDecimal) { + blow = (BigDecimal) declow; + low = new Decimal(ByteBuffer.wrap(blow.unscaledValue().toByteArray()), (short) blow.scale()); + } else if (dechigh instanceof String) { + blow = new BigDecimal((String) declow); + low = createThriftDecimal((String) declow); + } + decimalStats.setLowValue(low); + long lowerBound = MetaStoreDirectSql.extractSqlLong(dist); + long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist); + if (useDensityFunctionForNDVEstimation && dechigh != null && declow != null && avgDecimal != null + && MetaStoreDirectSql.extractSqlDouble(avgDecimal) != 0.0) { + long estimation = MetaStoreDirectSql.extractSqlLong(MetaStoreDirectSql.extractSqlLong(bhigh + .subtract(blow).floatValue() / MetaStoreDirectSql.extractSqlDouble(avgDecimal))); + if (estimation < lowerBound) { + decimalStats.setNumDVs(lowerBound); + } else if (estimation > higherBound) { + decimalStats.setNumDVs(higherBound); + } else { + decimalStats.setNumDVs(estimation); + } + } else { + decimalStats.setNumDVs(lowerBound); + } + data.setDecimalStats(decimalStats); + } + } + private static Decimal createThriftDecimal(String s) { BigDecimal d = new BigDecimal(s); return new Decimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short)d.scale()); @@ -484,4 +611,5 @@ private static Decimal createThriftDecimal(String s) { private static String createJdoDecimalString(Decimal d) { return new BigDecimal(new BigInteger(d.getUnscaled()), d.getScale()).toString(); } + } diff --git a/ql/src/test/queries/clientpositive/extrapolate_part_stats_partial_ndv.q b/ql/src/test/queries/clientpositive/extrapolate_part_stats_partial_ndv.q new file mode 100644 index 0000000..b7fc4e3 --- /dev/null +++ b/ql/src/test/queries/clientpositive/extrapolate_part_stats_partial_ndv.q @@ -0,0 +1,99 @@ +set hive.metastore.stats.ndv.densityfunction=true; +set hive.stats.fetch.column.stats=true; +set hive.exec.dynamic.partition=true; +set hive.exec.dynamic.partition.mode=nonstrict; + +drop table if exists ext_loc; + +create table ext_loc ( + state string, + locid double, + cnt decimal, + zip int, + year string +) row format delimited fields terminated by '|' stored as textfile; + +LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_partial_ndv.txt' OVERWRITE INTO TABLE ext_loc; + +drop table if exists loc_orc_1d; + +create table loc_orc_1d ( + state string, + locid double, + cnt decimal, + zip int +) partitioned by(year string) stored as orc; + +insert overwrite table loc_orc_1d partition(year) select * from ext_loc; + +analyze table loc_orc_1d partition(year='2001') compute statistics for columns state,locid,cnt,zip; + +analyze table loc_orc_1d partition(year='2002') compute statistics for columns state,locid,cnt,zip; + +describe formatted loc_orc_1d.state PARTITION(year='2001'); + +describe formatted loc_orc_1d.state PARTITION(year='2002'); + +describe formatted loc_orc_1d.locid PARTITION(year='2001'); + +describe formatted loc_orc_1d.locid PARTITION(year='2002'); + +describe formatted loc_orc_1d.cnt PARTITION(year='2001'); + +describe formatted loc_orc_1d.cnt PARTITION(year='2002'); + 
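Editorial note: the following is a minimal, self-contained sketch of the density-function NDV estimate described in the comments added to MetaStoreDirectSql above and applied by the new prepareCSObjWithAdjustedNDV / fillColumnStatisticsData path. It is not part of the patch; the class and method names (NdvDensityEstimate, estimateLongNdv) are illustrative only. The aggregate query supplies the global low/high values, the average per-partition density avg((high - low) / NUM_DISTINCTS), the largest partition NDV (max(NUM_DISTINCTS), the lower bound), and the sum of partition NDVs (sum(NUM_DISTINCTS), the upper bound); the raw estimate (high - low) / density is then clamped to that range.

// Illustrative sketch only; not part of the patch. Names are hypothetical.
public class NdvDensityEstimate {

  /**
   * Estimate a table-level NDV for a long column from aggregated partition stats,
   * mirroring the clamping rule used for long/double/decimal columns in the patch.
   *
   * @param low        min("LONG_LOW_VALUE") over the selected partitions
   * @param high       max("LONG_HIGH_VALUE") over the selected partitions
   * @param avgDensity avg(("LONG_HIGH_VALUE" - "LONG_LOW_VALUE") / "NUM_DISTINCTS")
   * @param lowerBound max("NUM_DISTINCTS"), the fully-overlapping-domains case
   * @param upperBound sum("NUM_DISTINCTS"), the disjoint-domains case
   */
  static long estimateLongNdv(long low, long high, double avgDensity,
      long lowerBound, long upperBound) {
    if (avgDensity == 0.0) {
      // Without a usable density, fall back to the old behaviour: max of partition NDVs.
      return lowerBound;
    }
    long estimation = (long) ((high - low) / avgDensity);
    if (estimation < lowerBound) {
      return lowerBound;
    } else if (estimation > upperBound) {
      return upperBound;
    }
    return estimation;
  }

  public static void main(String[] args) {
    // Two partitions: NDVs 4 and 6, value ranges [1, 8] and [5, 20].
    // avgDensity   = ((8 - 1) / 4.0 + (20 - 5) / 6.0) / 2 = (1.75 + 2.5) / 2 = 2.125
    // raw estimate = (20 - 1) / 2.125 = 8.94..., truncated to 8,
    // which lies between the lower bound 6 and the upper bound 10.
    System.out.println(estimateLongNdv(1, 20, 2.125, 6, 10)); // prints 8
  }
}

In the patch itself this arithmetic is driven by the extra avg(...) and sum("NUM_DISTINCTS") columns added to commonPrefix and consumed by the new fillColumnStatisticsData overload; the sketch only isolates the clamping rule for readability.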
+describe formatted loc_orc_1d.zip PARTITION(year='2001'); + +describe formatted loc_orc_1d.zip PARTITION(year='2002'); + +explain extended select state,locid,cnt,zip from loc_orc_1d; + +analyze table loc_orc_1d partition(year='2000') compute statistics for columns state,locid,cnt,zip; + +analyze table loc_orc_1d partition(year='2003') compute statistics for columns state,locid,cnt,zip; + +describe formatted loc_orc_1d.state PARTITION(year='2000'); + +describe formatted loc_orc_1d.state PARTITION(year='2003'); + +describe formatted loc_orc_1d.locid PARTITION(year='2000'); + +describe formatted loc_orc_1d.locid PARTITION(year='2003'); + +describe formatted loc_orc_1d.cnt PARTITION(year='2000'); + +describe formatted loc_orc_1d.cnt PARTITION(year='2003'); + +describe formatted loc_orc_1d.zip PARTITION(year='2000'); + +describe formatted loc_orc_1d.zip PARTITION(year='2003'); + +explain extended select state,locid,cnt,zip from loc_orc_1d; + +drop table if exists loc_orc_2d; + +create table loc_orc_2d ( + state string, + locid int, + cnt decimal +) partitioned by(zip int, year string) stored as orc; + +insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc; + +analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid,cnt; + +analyze table loc_orc_2d partition(zip=94087, year='2002') compute statistics for columns state,locid,cnt; + +describe formatted loc_orc_2d.state partition(zip=94086, year='2001'); + +describe formatted loc_orc_2d.state partition(zip=94087, year='2002'); + +describe formatted loc_orc_2d.locid partition(zip=94086, year='2001'); + +describe formatted loc_orc_2d.locid partition(zip=94087, year='2002'); + +describe formatted loc_orc_2d.cnt partition(zip=94086, year='2001'); + +describe formatted loc_orc_2d.cnt partition(zip=94087, year='2002'); + +explain extended select state,locid,cnt,zip from loc_orc_2d; diff --git a/ql/src/test/results/clientpositive/extrapolate_part_stats_partial_ndv.q.out b/ql/src/test/results/clientpositive/extrapolate_part_stats_partial_ndv.q.out new file mode 100644 index 0000000..1d30504 --- /dev/null +++ b/ql/src/test/results/clientpositive/extrapolate_part_stats_partial_ndv.q.out @@ -0,0 +1,1385 @@ +PREHOOK: query: drop table if exists ext_loc +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists ext_loc +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table ext_loc ( + state string, + locid double, + cnt decimal, + zip int, + year string +) row format delimited fields terminated by '|' stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@ext_loc +POSTHOOK: query: create table ext_loc ( + state string, + locid double, + cnt decimal, + zip int, + year string +) row format delimited fields terminated by '|' stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ext_loc +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_partial_ndv.txt' OVERWRITE INTO TABLE ext_loc +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@ext_loc +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_partial_ndv.txt' OVERWRITE INTO TABLE ext_loc +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@ext_loc +PREHOOK: query: drop table if exists loc_orc_1d +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists loc_orc_1d +POSTHOOK: type: DROPTABLE +PREHOOK: query: create 
table loc_orc_1d ( + state string, + locid double, + cnt decimal, + zip int +) partitioned by(year string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@loc_orc_1d +POSTHOOK: query: create table loc_orc_1d ( + state string, + locid double, + cnt decimal, + zip int +) partitioned by(year string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@loc_orc_1d +PREHOOK: query: insert overwrite table loc_orc_1d partition(year) select * from ext_loc +PREHOOK: type: QUERY +PREHOOK: Input: default@ext_loc +PREHOOK: Output: default@loc_orc_1d +POSTHOOK: query: insert overwrite table loc_orc_1d partition(year) select * from ext_loc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ext_loc +POSTHOOK: Output: default@loc_orc_1d@year=2000 +POSTHOOK: Output: default@loc_orc_1d@year=2001 +POSTHOOK: Output: default@loc_orc_1d@year=2002 +POSTHOOK: Output: default@loc_orc_1d@year=2003 +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2002).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2002).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2002).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2002).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2003).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2003).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2003).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2003).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ] +PREHOOK: query: analyze table loc_orc_1d partition(year='2001') compute statistics for columns state,locid,cnt,zip +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_orc_1d +PREHOOK: Input: default@loc_orc_1d@year=2001 +#### A masked pattern was here #### +POSTHOOK: query: analyze table loc_orc_1d partition(year='2001') compute statistics for columns state,locid,cnt,zip +POSTHOOK: 
type: QUERY +POSTHOOK: Input: default@loc_orc_1d +POSTHOOK: Input: default@loc_orc_1d@year=2001 +#### A masked pattern was here #### +PREHOOK: query: analyze table loc_orc_1d partition(year='2002') compute statistics for columns state,locid,cnt,zip +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_orc_1d +PREHOOK: Input: default@loc_orc_1d@year=2002 +#### A masked pattern was here #### +POSTHOOK: query: analyze table loc_orc_1d partition(year='2002') compute statistics for columns state,locid,cnt,zip +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_orc_1d +POSTHOOK: Input: default@loc_orc_1d@year=2002 +#### A masked pattern was here #### +PREHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2001') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2001') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +state string 0 3 0.75 2 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2002') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2002') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +state string 0 6 3.0 3 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2001') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2001') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +locid double 1.0 4.0 0 5 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2002') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2002') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +locid double 1.0 5.0 0 6 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2001') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2001') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +cnt decimal(10,0) 10 2000 0 5 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2002') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2002') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +cnt decimal(10,0) 10 910 0 4 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2001') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2001') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max 
num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +zip int 43201 94087 0 4 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2002') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2002') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +zip int 43201 94087 0 4 from deserializer +PREHOOK: query: explain extended select state,locid,cnt,zip from loc_orc_1d +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select state,locid,cnt,zip from loc_orc_1d +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + loc_orc_1d + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + state + TOK_SELEXPR + TOK_TABLE_OR_COL + locid + TOK_SELEXPR + TOK_TABLE_OR_COL + cnt + TOK_SELEXPR + TOK_TABLE_OR_COL + zip + + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Partition Description: + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2000 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 2 + partition_columns year + partition_columns.types string + rawDataSize 416 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 536 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 4 + partition_columns year + partition_columns.types string + rawDataSize 832 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 570 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output 
format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2002 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 6 + partition_columns year + partition_columns.types string + rawDataSize 1266 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 586 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2003 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 8 + partition_columns year + partition_columns.types string + rawDataSize 1672 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 610 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Processor Tree: + TableScan + alias: loc_orc_1d + Statistics: Num rows: 20 Data size: 4186 Basic stats: COMPLETE Column stats: PARTIAL + GatherStats: false + Select Operator + expressions: state (type: string), locid (type: double), cnt (type: decimal(10,0)), zip (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 20 Data size: 4260 Basic stats: COMPLETE Column stats: PARTIAL + ListSink + +PREHOOK: query: analyze table loc_orc_1d partition(year='2000') compute statistics for columns state,locid,cnt,zip +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_orc_1d +PREHOOK: Input: default@loc_orc_1d@year=2000 +#### A masked pattern was here #### +POSTHOOK: query: analyze table loc_orc_1d partition(year='2000') compute statistics for columns state,locid,cnt,zip +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_orc_1d +POSTHOOK: Input: default@loc_orc_1d@year=2000 +#### A masked pattern was here #### +PREHOOK: query: analyze table loc_orc_1d partition(year='2003') compute statistics for columns state,locid,cnt,zip +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_orc_1d +PREHOOK: Input: default@loc_orc_1d@year=2003 +#### A masked pattern was here #### +POSTHOOK: query: analyze table loc_orc_1d partition(year='2003') compute statistics for columns state,locid,cnt,zip +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_orc_1d +POSTHOOK: Input: default@loc_orc_1d@year=2003 +#### A masked pattern was here #### +PREHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2000') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2000') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +state string 0 2 0.5 1 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2003') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.state PARTITION(year='2003') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +state string 0 4 1.25 4 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2000') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2000') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +locid double 1.0 2.0 0 2 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2003') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.locid PARTITION(year='2003') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +locid double 1.0 31.0 0 6 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2000') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2000') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: 
default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +cnt decimal(10,0) 1000 1010 0 3 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2003') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.cnt PARTITION(year='2003') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +cnt decimal(10,0) 1000 2000 0 3 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2000') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2000') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +zip int 94086 94087 0 2 from deserializer +PREHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2003') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_1d +POSTHOOK: query: describe formatted loc_orc_1d.zip PARTITION(year='2003') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_1d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +zip int 43201 94087 0 4 from deserializer +PREHOOK: query: explain extended select state,locid,cnt,zip from loc_orc_1d +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select state,locid,cnt,zip from loc_orc_1d +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + loc_orc_1d + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + state + TOK_SELEXPR + TOK_TABLE_OR_COL + locid + TOK_SELEXPR + TOK_TABLE_OR_COL + cnt + TOK_SELEXPR + TOK_TABLE_OR_COL + zip + + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Partition Description: + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2000 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 2 + partition_columns year + partition_columns.types string + rawDataSize 416 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 536 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 4 + partition_columns year + partition_columns.types string + rawDataSize 832 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 570 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2002 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 6 + partition_columns year + partition_columns.types string + rawDataSize 1266 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 586 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2003 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + numFiles 1 + numRows 8 + partition_columns year + partition_columns.types string 
+ rawDataSize 1672 + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 610 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt,zip + columns.comments + columns.types string:double:decimal(10,0):int +#### A masked pattern was here #### + name default.loc_orc_1d + partition_columns year + partition_columns.types string + serialization.ddl struct loc_orc_1d { string state, double locid, decimal(10,0) cnt, i32 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_1d + name: default.loc_orc_1d + Processor Tree: + TableScan + alias: loc_orc_1d + Statistics: Num rows: 20 Data size: 4186 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false + Select Operator + expressions: state (type: string), locid (type: double), cnt (type: decimal(10,0)), zip (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 20 Data size: 4220 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: drop table if exists loc_orc_2d +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists loc_orc_2d +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table loc_orc_2d ( + state string, + locid int, + cnt decimal +) partitioned by(zip int, year string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@loc_orc_2d +POSTHOOK: query: create table loc_orc_2d ( + state string, + locid int, + cnt decimal +) partitioned by(zip int, year string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@loc_orc_2d +PREHOOK: query: insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc +PREHOOK: type: QUERY +PREHOOK: Input: default@ext_loc +PREHOOK: Output: default@loc_orc_2d +POSTHOOK: query: insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ext_loc +POSTHOOK: Output: default@loc_orc_2d@zip=43201/year=2001 +POSTHOOK: Output: default@loc_orc_2d@zip=43201/year=2002 +POSTHOOK: Output: default@loc_orc_2d@zip=43201/year=2003 +POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2000 +POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2001 +POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2002 +POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2003 +POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2000 +POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2001 +POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2002 +POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2003 +POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2001).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2001).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ] +POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc_2d 
PARTITION(zip=43201,year=2002).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2002).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2002).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2003).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2003).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=43201,year=2003).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2000).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2000).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2001).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2001).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2002).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2002).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2002).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2003).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2003).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2003).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2000).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2000).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2001).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2001).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2002).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2002).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2002).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2003).cnt SIMPLE [(ext_loc)ext_loc.FieldSchema(name:cnt, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2003).locid EXPRESSION [(ext_loc)ext_loc.FieldSchema(name:locid, type:double, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2003).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid,cnt
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94086/year=2001
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid,cnt
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94086/year=2001
+#### A masked pattern was here ####
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2002') compute statistics for columns state,locid,cnt
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94087/year=2002
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2002') compute statistics for columns state,locid,cnt
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94087/year=2002
+#### A masked pattern was here ####
+PREHOOK: query: describe formatted loc_orc_2d.state partition(zip=94086, year='2001')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d.state partition(zip=94086, year='2001')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
+
+state string 0 2 0.5 1 from deserializer
+PREHOOK: query: describe formatted loc_orc_2d.state partition(zip=94087, year='2002')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d.state partition(zip=94087, year='2002')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
+
+state string 0 4 3.0 3 from deserializer
+PREHOOK: query: describe formatted loc_orc_2d.locid partition(zip=94086, year='2001')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d.locid partition(zip=94086, year='2001')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
+
+locid int 2 3 0 2 from deserializer
+PREHOOK: query: describe formatted loc_orc_2d.locid partition(zip=94087, year='2002')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted 
loc_orc_2d.locid partition(zip=94087, year='2002') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_2d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +locid int 1 5 0 3 from deserializer +PREHOOK: query: describe formatted loc_orc_2d.cnt partition(zip=94086, year='2001') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_2d +POSTHOOK: query: describe formatted loc_orc_2d.cnt partition(zip=94086, year='2001') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_2d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +cnt decimal(10,0) 1000 2000 0 2 from deserializer +PREHOOK: query: describe formatted loc_orc_2d.cnt partition(zip=94087, year='2002') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@loc_orc_2d +POSTHOOK: query: describe formatted loc_orc_2d.cnt partition(zip=94087, year='2002') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@loc_orc_2d +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +cnt decimal(10,0) 10 100 0 2 from deserializer +PREHOOK: query: explain extended select state,locid,cnt,zip from loc_orc_2d +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select state,locid,cnt,zip from loc_orc_2d +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + loc_orc_2d + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + state + TOK_SELEXPR + TOK_TABLE_OR_COL + locid + TOK_SELEXPR + TOK_TABLE_OR_COL + cnt + TOK_SELEXPR + TOK_TABLE_OR_COL + zip + + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Partition Description: + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + zip 43201 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 1 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 202 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 393 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2002 + zip 43201 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + 
columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 2 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 406 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 415 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2003 + zip 43201 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 3 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 603 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 431 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2000 + zip 94086 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 1 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 201 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 391 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output 
format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + zip 94086 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 2 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 400 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 400 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2002 + zip 94086 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 1 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 203 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 393 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: 
default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2003 + zip 94086 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 2 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 404 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 418 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2000 + zip 94087 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 1 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 200 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 375 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + zip 94087 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 1 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 200 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + 
serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 368 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2002 + zip 94087 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 3 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 609 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 419 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc_2d + name: default.loc_orc_2d + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2003 + zip 94087 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + numFiles 1 + numRows 3 + partition_columns zip/year + partition_columns.types int:string + rawDataSize 600 + serialization.ddl struct loc_orc_2d { string state, i32 locid, decimal(10,0) cnt} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 422 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,cnt + columns.comments + columns.types string:int:decimal(10,0) +#### A masked pattern was here #### + name default.loc_orc_2d + partition_columns zip/year + partition_columns.types int:string + serialization.ddl struct loc_orc_2d { 
string state, i32 locid, decimal(10,0) cnt}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.loc_orc_2d
+ name: default.loc_orc_2d
+ Processor Tree:
+ TableScan
+ alias: loc_orc_2d
+ Statistics: Num rows: 20 Data size: 4028 Basic stats: COMPLETE Column stats: PARTIAL
+ GatherStats: false
+ Select Operator
+ expressions: state (type: string), locid (type: int), cnt (type: decimal(10,0)), zip (type: int)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 20 Data size: 4160 Basic stats: COMPLETE Column stats: PARTIAL
+ ListSink
+