diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 6e16200..d901709 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -272,6 +272,7 @@ private static URL checkConfigFile(File f) {
       HiveConf.ConfVars.HIVE_TXN_HEARTBEAT_THREADPOOL_SIZE,
       HiveConf.ConfVars.HIVE_TXN_MAX_OPEN_BATCH,
       HiveConf.ConfVars.HIVE_TXN_RETRYABLE_SQLEX_REGEX,
+      HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER,
       HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION,
       HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_ENABLED,
       HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_SIZE,
@@ -1683,6 +1684,10 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
     HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0,
         "Standard error expressed in percentage. Provides a tradeoff between accuracy and compute cost. \n" +
         "A lower value for error indicates higher accuracy and a higher compute cost."),
+    HIVE_METASTORE_STATS_NDV_TUNER("hive.stats.ndv.tuner", (float)0.0,
+        "Provides a tunable parameter between the lower bound and the upper bound of the aggregate NDV across all the partitions. \n" +
+        "The lower bound is equal to the maximum NDV among all the partitions. The upper bound is equal to the sum of the NDVs of all the partitions.\n" +
+        "Its value should be between 0.0 (i.e., choose the lower bound) and 1.0 (i.e., choose the upper bound)."),
     HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION("hive.metastore.stats.ndv.densityfunction", false,
         "Whether to use density function to estimate the NDV for the whole table based on the NDV of partitions"),
     HIVE_STATS_KEY_PREFIX("hive.stats.key.prefix", "", "", true), // internal usage only
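A note on semantics before the plumbing below: hive.stats.ndv.tuner only affects the fallback path where the density function is not (or cannot be) used. A minimal sketch of the interpolation, with hypothetical class and method names; in the patch itself the logic is inlined per column type in StatObjectConverter.fillColumnStatisticsData:

  // Hypothetical standalone sketch of the ndvTuner interpolation.
  public class NdvTunerSketch {
    /** lowerBound = max NDV over all partitions; higherBound = sum of per-partition NDVs. */
    static long aggregateNdv(long lowerBound, long higherBound, double ndvTuner) {
      return (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
    }

    public static void main(String[] args) {
      // Two partitions with NDVs 2 and 5: lowerBound = 5, higherBound = 7.
      System.out.println(aggregateNdv(5, 7, 0.0)); // 5 (default: pick the lower bound)
      System.out.println(aggregateNdv(5, 7, 1.0)); // 7 (pick the upper bound)
      System.out.println(aggregateNdv(5, 7, 0.5)); // 6 (halfway between the bounds)
    }
  }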
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
index 85a6d0d..3ee1fee9b 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
@@ -1197,7 +1197,7 @@ public ColumnStatistics getTableStats(final String dbName, final String tableNam
   }

   public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
-      List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation)
+      List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation, double ndvTuner)
       throws MetaException {
     if (colNames.isEmpty() || partNames.isEmpty()) {
       LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
@@ -1232,7 +1232,7 @@ public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
             // Read aggregated stats for one column
             colStatsAggrFromDB =
                 columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
-                    partsFound, useDensityFunctionForNDVEstimation);
+                    partsFound, useDensityFunctionForNDVEstimation, ndvTuner);
             if (!colStatsAggrFromDB.isEmpty()) {
               ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
               colStatsList.add(colStatsAggr);
@@ -1245,7 +1245,7 @@ public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
       partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
       colStatsList =
           columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound,
-              useDensityFunctionForNDVEstimation);
+              useDensityFunctionForNDVEstimation, ndvTuner);
     }
     LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation
         + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = "
@@ -1308,14 +1308,14 @@ private long partsFoundForPartitions(final String dbName, final String tableName
   private List<ColumnStatisticsObj> columnStatisticsObjForPartitions(final String dbName,
       final String tableName, final List<String> partNames, List<String> colNames, long partsFound,
-      final boolean useDensityFunctionForNDVEstimation) throws MetaException {
+      final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
     final boolean areAllPartsFound = (partsFound == partNames.size());
     return runBatched(colNames, new Batchable<String, ColumnStatisticsObj>() {
       public List<ColumnStatisticsObj> run(final List<String> inputColNames) throws MetaException {
         return runBatched(partNames, new Batchable<String, ColumnStatisticsObj>() {
           public List<ColumnStatisticsObj> run(List<String> inputPartNames) throws MetaException {
             return columnStatisticsObjForPartitionsBatch(dbName, tableName, inputPartNames,
-                inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation);
+                inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner);
           }
         });
       }
@@ -1325,7 +1325,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName
   /** Should be called with the list short enough to not trip up Oracle/etc. */
   private List<ColumnStatisticsObj> columnStatisticsObjForPartitionsBatch(String dbName,
       String tableName, List<String> partNames, List<String> colNames, boolean areAllPartsFound,
-      boolean useDensityFunctionForNDVEstimation) throws MetaException {
+      boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
     // TODO: all the extrapolation logic should be moved out of this class,
     // only mechanical data retrieval should remain here.
     String commonPrefix = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", "
@@ -1377,7 +1377,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName
       List<Object[]> list = ensureList(qResult);
       List<ColumnStatisticsObj> colStats = new ArrayList<ColumnStatisticsObj>(list.size());
       for (Object[] row : list) {
-        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
+        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
         Deadline.checkTimeout();
       }
       query.closeAll();
@@ -1436,7 +1436,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName
       }
       list = ensureList(qResult);
       for (Object[] row : list) {
-        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
+        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
         Deadline.checkTimeout();
       }
       end = doTrace ? System.nanoTime() : 0;
@@ -1583,7 +1583,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName
             query.closeAll();
           }
         }
-        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
+        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
         Deadline.checkTimeout();
       }
     }
@@ -1603,13 +1603,13 @@ private ColumnStatisticsObj prepareCSObj (Object[] row, int i) throws MetaExcept
   }

   private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i,
-      boolean useDensityFunctionForNDVEstimation) throws MetaException {
+      boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
     ColumnStatisticsData data = new ColumnStatisticsData();
     ColumnStatisticsObj cso = new ColumnStatisticsObj((String) row[i++], (String) row[i++], data);
     Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++], declow = row[i++],
         dechigh = row[i++], nulls = row[i++], dist = row[i++], avglen = row[i++], maxlen = row[i++],
         trues = row[i++], falses = row[i++], avgLong = row[i++], avgDouble = row[i++],
         avgDecimal = row[i++], sumDist = row[i++];
     StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data, llow, lhigh, dlow, dhigh,
         declow, dechigh, nulls, dist, avglen, maxlen, trues, falses, avgLong, avgDouble,
-        avgDecimal, sumDist, useDensityFunctionForNDVEstimation);
+        avgDecimal, sumDist, useDensityFunctionForNDVEstimation, ndvTuner);
     return cso;
   }
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
index 1f7b9ef..c39c725 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
@@ -7337,13 +7337,16 @@ protected ColumnStatistics getJdoResult(
   @Override
   public AggrStats get_aggr_stats_for(String dbName, String tblName,
       final List<String> partNames, final List<String> colNames) throws MetaException, NoSuchObjectException {
-    final boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(getConf(), HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION);
+    final boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(getConf(),
+        HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION);
+    final double ndvTuner = HiveConf.getFloatVar(getConf(),
+        HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER);
     return new GetHelper<AggrStats>(dbName, tblName, true, false) {
       @Override
       protected AggrStats getSqlResult(GetHelper<AggrStats> ctx) throws MetaException {
         return directSql.aggrColStatsForPartitions(dbName, tblName, partNames,
-            colNames, useDensityFunctionForNDVEstimation);
+            colNames, useDensityFunctionForNDVEstimation, ndvTuner);
       }
       @Override
       protected AggrStats getJdoResult(GetHelper<AggrStats> ctx)
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
index b259dfa..fcf6f27 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
@@ -527,7 +527,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData
       Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh,
       Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses,
       Object avgLong, Object avgDouble, Object avgDecimal, Object sumDist,
-      boolean useDensityFunctionForNDVEstimation) throws MetaException {
+      boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
     colType = colType.toLowerCase();
     if (colType.equals("boolean")) {
       BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
@@ -561,23 +561,29 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData
       }
       long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
       long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
+      long rangeBound = Long.MAX_VALUE;
+      if (lhigh != null && llow != null) {
+        rangeBound = MetaStoreDirectSql.extractSqlLong(lhigh)
+            - MetaStoreDirectSql.extractSqlLong(llow) + 1;
+      }
+      long estimation;
       if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null
           && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) {
         // We have estimation, lowerbound and higherbound. We use estimation if
         // it is between lowerbound and higherbound.
-        long estimation = MetaStoreDirectSql
+        estimation = MetaStoreDirectSql
             .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql
                 .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong));
         if (estimation < lowerBound) {
-          longStats.setNumDVs(lowerBound);
+          estimation = lowerBound;
         } else if (estimation > higherBound) {
-          longStats.setNumDVs(higherBound);
-        } else {
-          longStats.setNumDVs(estimation);
+          estimation = higherBound;
         }
       } else {
-        longStats.setNumDVs(lowerBound);
+        estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
       }
+      estimation = Math.min(estimation, rangeBound);
+      longStats.setNumDVs(estimation);
       data.setLongStats(longStats);
     } else if (colType.equals("date")) {
       DateColumnStatsData dateStats = new DateColumnStatsData();
@@ -590,23 +596,29 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData
       }
       long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
       long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
+      long rangeBound = Long.MAX_VALUE;
+      if (lhigh != null && llow != null) {
+        rangeBound = MetaStoreDirectSql.extractSqlLong(lhigh)
+            - MetaStoreDirectSql.extractSqlLong(llow) + 1;
+      }
+      long estimation;
       if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null
           && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) {
         // We have estimation, lowerbound and higherbound. We use estimation if
         // it is between lowerbound and higherbound.
-        long estimation = MetaStoreDirectSql
+        estimation = MetaStoreDirectSql
            .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql
                .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong));
         if (estimation < lowerBound) {
-          dateStats.setNumDVs(lowerBound);
+          estimation = lowerBound;
         } else if (estimation > higherBound) {
-          dateStats.setNumDVs(higherBound);
-        } else {
-          dateStats.setNumDVs(estimation);
+          estimation = higherBound;
         }
       } else {
-        dateStats.setNumDVs(lowerBound);
+        estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
       }
+      estimation = Math.min(estimation, rangeBound);
+      dateStats.setNumDVs(estimation);
       data.setDateStats(dateStats);
     } else if (colType.equals("double") || colType.equals("float")) {
       DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
@@ -632,7 +644,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData
           doubleStats.setNumDVs(estimation);
         }
       } else {
-        doubleStats.setNumDVs(lowerBound);
+        doubleStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
       }
       data.setDoubleStats(doubleStats);
     } else if (colType.startsWith("decimal")) {
@@ -673,7 +685,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData
           decimalStats.setNumDVs(estimation);
         }
       } else {
-        decimalStats.setNumDVs(lowerBound);
+        decimalStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
       }
       data.setDecimalStats(decimalStats);
     }
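Taken together, the long and date branches now follow a single decision flow; a hedged recap below, with an invented helper signature (the real logic stays inlined per column type). Note that the patch applies the rangeBound cap only to long and date columns, while the double and decimal branches get just the tuner interpolation:

  // Illustrative-only recap of the long/date decision flow after this patch.
  static long estimateAggregateNdv(Long min, Long max, long lowerBound, long higherBound,
      Double avgDensity, boolean useDensityFunction, double ndvTuner) {
    // A column bounded by [min, max] cannot have more than max - min + 1 distinct values.
    long rangeBound = (min != null && max != null) ? max - min + 1 : Long.MAX_VALUE;
    long estimation;
    if (useDensityFunction && min != null && max != null
        && avgDensity != null && avgDensity != 0.0) {
      // Density-function estimate, clamped into [lowerBound, higherBound].
      estimation = (long) ((max - min) / avgDensity);
      estimation = Math.max(lowerBound, Math.min(higherBound, estimation));
    } else {
      // New tuner path: linear interpolation between the two bounds.
      estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
    }
    return Math.min(estimation, rangeBound);
  }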
diff --git a/ql/src/test/queries/clientpositive/tunable_ndv.q b/ql/src/test/queries/clientpositive/tunable_ndv.q
new file mode 100644
index 0000000..88de3d8
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/tunable_ndv.q
@@ -0,0 +1,64 @@
+set hive.mapred.mode=nonstrict;
+set hive.stats.fetch.column.stats=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.metastore.aggregate.stats.cache.enabled=false;
+
+create table if not exists ext_loc (
+  state string,
+  locid int,
+  zip int,
+  year string
+) row format delimited fields terminated by '|' stored as textfile;
+
+LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_full.txt' OVERWRITE INTO TABLE ext_loc;
+
+create table if not exists loc_orc_1d (
+  state string,
+  locid int,
+  zip int
+) partitioned by(year string) stored as orc;
+
+insert overwrite table loc_orc_1d partition(year) select * from ext_loc;
+
+analyze table loc_orc_1d compute statistics for columns state,locid;
+
+describe formatted loc_orc_1d partition(year=2000) locid;
+describe formatted loc_orc_1d partition(year=2001) locid;
+
+describe formatted loc_orc_1d locid;
+
+set hive.stats.ndv.tuner=1.0;
+
+describe formatted loc_orc_1d locid;
+
+set hive.stats.ndv.tuner=0.5;
+
+describe formatted loc_orc_1d locid;
+
+create table if not exists loc_orc_2d (
+  state string,
+  locid int
+) partitioned by(zip int, year string) stored as orc;
+
+insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc;
+
+analyze table loc_orc_2d partition(zip=94086, year='2000') compute statistics for columns state,locid;
+
+analyze table loc_orc_2d partition(zip=94087, year='2000') compute statistics for columns state,locid;
+
+analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid;
+
+analyze table loc_orc_2d partition(zip=94087, year='2001') compute statistics for columns state,locid;
+
+set hive.stats.ndv.tuner=0.0;
+
+describe formatted loc_orc_2d locid;
+
+set hive.stats.ndv.tuner=1.0;
+
+describe formatted loc_orc_2d locid;
+
+set hive.stats.ndv.tuner=0.5;
+
+describe formatted loc_orc_2d locid;
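The golden file below pins down what these knobs should produce for the 1-D table; note the test disables hive.metastore.aggregate.stats.cache.enabled, presumably so cached aggregates do not mask the tuner changes between describes. A small self-contained check of the expected arithmetic, assuming the per-partition NDVs reported in the expected output (2 for year=2000, 5 for year=2001):

  // Hypothetical expectation check for the 1-D describes in this test.
  public class TunableNdv1dCheck {
    public static void main(String[] args) {
      long lowerBound = Math.max(2L, 5L); // max per-partition NDV = 5
      long higherBound = 2L + 5L;         // sum of per-partition NDVs = 7
      for (double tuner : new double[] {0.0, 1.0, 0.5}) {
        System.out.println(tuner + " -> "
            + (long) (lowerBound + (higherBound - lowerBound) * tuner));
      }
      // Prints 5, 7 and 6: the distinct_count values the golden file expects.
    }
  }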
diff --git a/ql/src/test/results/clientpositive/tunable_ndv.q.out b/ql/src/test/results/clientpositive/tunable_ndv.q.out
new file mode 100644
index 0000000..80df82c
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tunable_ndv.q.out
@@ -0,0 +1,220 @@
+PREHOOK: query: create table if not exists ext_loc (
+  state string,
+  locid int,
+  zip int,
+  year string
+) row format delimited fields terminated by '|' stored as textfile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ext_loc
+POSTHOOK: query: create table if not exists ext_loc (
+  state string,
+  locid int,
+  zip int,
+  year string
+) row format delimited fields terminated by '|' stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ext_loc
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_full.txt' OVERWRITE INTO TABLE ext_loc
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@ext_loc
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_full.txt' OVERWRITE INTO TABLE ext_loc
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@ext_loc
+PREHOOK: query: create table if not exists loc_orc_1d (
+  state string,
+  locid int,
+  zip int
+) partitioned by(year string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@loc_orc_1d
+POSTHOOK: query: create table if not exists loc_orc_1d (
+  state string,
+  locid int,
+  zip int
+) partitioned by(year string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@loc_orc_1d
+PREHOOK: query: insert overwrite table loc_orc_1d partition(year) select * from ext_loc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ext_loc
+PREHOOK: Output: default@loc_orc_1d
+POSTHOOK: query: insert overwrite table loc_orc_1d partition(year) select * from ext_loc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ext_loc
+POSTHOOK: Output: default@loc_orc_1d@year=2000
+POSTHOOK: Output: default@loc_orc_1d@year=2001
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ]
+PREHOOK: query: analyze table loc_orc_1d compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_1d
+PREHOOK: Input: default@loc_orc_1d@year=2000
+PREHOOK: Input: default@loc_orc_1d@year=2001
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_1d compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_1d
+POSTHOOK: Input: default@loc_orc_1d@year=2000
+POSTHOOK: Input: default@loc_orc_1d@year=2001
+#### A masked pattern was here ####
+PREHOOK: query: describe formatted loc_orc_1d partition(year=2000) locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d partition(year=2000) locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	2	0	2	 	 	 	 	from deserializer
+PREHOOK: query: describe formatted loc_orc_1d partition(year=2001) locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d partition(year=2001) locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	4	0	5	 	 	 	 	from deserializer
+PREHOOK: query: describe formatted loc_orc_1d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	4	0	5	 	 	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}
+PREHOOK: query: describe formatted loc_orc_1d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	4	0	7	 	 	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}
+PREHOOK: query: describe formatted loc_orc_1d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	4	0	6	 	 	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}
+PREHOOK: query: create table if not exists loc_orc_2d (
+  state string,
+  locid int
+) partitioned by(zip int, year string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@loc_orc_2d
+POSTHOOK: query: create table if not exists loc_orc_2d (
+  state string,
+  locid int
+) partitioned by(zip int, year string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@loc_orc_2d
+PREHOOK: query: insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ext_loc
+PREHOOK: Output: default@loc_orc_2d
+POSTHOOK: query: insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ext_loc
+POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2000
+POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2001
+POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2000
+POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2001
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2000).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2001).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2000).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2001).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2000') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94086/year=2000
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2000') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94086/year=2000
+#### A masked pattern was here ####
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2000') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94087/year=2000
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2000') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94087/year=2000
+#### A masked pattern was here ####
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94086/year=2001
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94086/year=2001
+#### A masked pattern was here ####
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2001') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94087/year=2001
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2001') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94087/year=2001
+#### A masked pattern was here ####
+PREHOOK: query: describe formatted loc_orc_2d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	4	0	3	 	 	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}
+PREHOOK: query: describe formatted loc_orc_2d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	4	0	6	 	 	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}
+PREHOOK: query: describe formatted loc_orc_2d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+locid	int	1	4	0	4	 	 	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}
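For the 2-D table the same arithmetic explains the golden file: the tuner=0.0 and tuner=1.0 describes read back 3 and 6, so the lower bound (the maximum per-partition NDV over the four partitions) is 3 and the upper bound (their sum) is 6. Setting tuner=0.5 then yields (long) (3 + (6 - 3) * 0.5) = (long) 4.5 = 4, which is exactly the distinct_count in the final describe.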