diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 0e4e7064eb..1cdabff508 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -399,6 +399,7 @@ minillap.query.files=acid_bucket_pruning.q,\ reduce_deduplicate.q,\ reduce_deduplicate_distinct.q, \ remote_script.q,\ + stats_only_external.q,\ tez_aggr_part_stats.q,\ tez_union_view.q,\ file_with_header_footer.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java index 3141a7e981..9888af238d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java @@ -3656,7 +3656,7 @@ private int describeTable(Hive db, DescTableDesc descTbl) throws HiveException, int numParts = 0; for (Partition partition : parts) { Map props = partition.getParameters(); - Boolean state = StatsSetupConst.areBasicStatsUptoDate(props); + Boolean state = StatsUtils.areBasicStatsUptoDate(tbl, props); for (String stat : StatsSetupConst.supportedStats) { stateMap.put(stat, stateMap.get(stat) && state); if (props != null && props.get(stat) != null) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index a574372dbe..db1fc44f17 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -68,6 +68,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin; @@ -446,7 +447,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = desc.getColumn(); StatType type = getType(desc.getTypeString()); if (!tbl.isPartitioned()) { - if (!StatsSetupConst.areBasicStatsUptoDate(tbl.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDate(tbl, tbl.getParameters())) { Logger.debug("Stats for table : " + tbl.getTableName() + " are not up to date."); return null; } @@ -455,7 +456,7 @@ else if (udaf instanceof GenericUDAFCount) { Logger.debug("Table doesn't have up to date stats " + tbl.getTableName()); return null; } - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDate(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -478,7 +479,7 @@ else if (udaf instanceof GenericUDAFCount) { Set parts = pctx.getPrunedPartitions(tsOp.getConf().getAlias(), tsOp) .getPartitions(); for (Partition part : parts) { - if (!StatsSetupConst.areBasicStatsUptoDate(part.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDate(part.getTable(), part.getParameters())) { Logger.debug("Stats for part : " + part.getSpec() + " are not up to date."); return null; } @@ -516,7 +517,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = colDesc.getColumn(); StatType type = getType(colDesc.getTypeString()); if(!tbl.isPartitioned()) { - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDate(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -657,7 +658,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = colDesc.getColumn(); StatType type = getType(colDesc.getTypeString()); if (!tbl.isPartitioned()) { - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDate(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -888,7 +889,7 @@ private ColumnStatisticsData validateSingleColStat(List sta Hive hive, Table tbl, String colName, Set parts) throws TException { List partNames = new ArrayList(parts.size()); for (Partition part : parts) { - if (!StatsSetupConst.areColumnStatsUptoDate(part.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDate(part.getTable(), part.getParameters(), colName)) { Logger.debug("Stats for part : " + part.getSpec() + " column " + colName + " are not up to date."); return null; @@ -910,7 +911,7 @@ private Long getRowCnt( if (tbl.isPartitioned()) { for (Partition part : pctx.getPrunedPartitions( tsOp.getConf().getAlias(), tsOp).getPartitions()) { - if (!StatsSetupConst.areBasicStatsUptoDate(part.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDate(part.getTable(), part.getParameters())) { return null; } Long partRowCnt = Long.parseLong(part.getParameters().get(StatsSetupConst.ROW_COUNT)); @@ -921,7 +922,7 @@ private Long getRowCnt( rowCnt += partRowCnt; } } else { // unpartitioned table - if (!StatsSetupConst.areBasicStatsUptoDate(tbl.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDate(tbl, tbl.getParameters())) { return null; } rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT)); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java index 1edef98ea9..c820088d8b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java @@ -296,8 +296,8 @@ private ColStatistics extractColStats(RexInputRef ref) { if (table != null) { ColStatistics colStats = table.getColStat(Lists.newArrayList(columnOrigin.getOriginColumnOrdinal())).get(0); - if (colStats != null && StatsSetupConst.areColumnStatsUptoDate( - table.getHiveTableMD().getParameters(), colStats.getColumnName())) { + if (colStats != null && StatsUtils.areColumnStatsUptoDate( + table.getHiveTableMD(), table.getHiveTableMD().getParameters(), colStats.getColumnName())) { return colStats; } } @@ -310,7 +310,8 @@ private Long extractRowCount(RexInputRef ref) { if (columnOrigin != null) { RelOptHiveTable table = (RelOptHiveTable) columnOrigin.getOriginTable(); if (table != null) { - if (StatsSetupConst.areBasicStatsUptoDate(table.getHiveTableMD().getParameters())) { + if (StatsUtils.areBasicStatsUptoDate(table.getHiveTableMD(), + table.getHiveTableMD().getParameters())) { return StatsUtils.getNumRows(table.getHiveTableMD()); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java index 8c23887176..8ea463cb31 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java @@ -167,7 +167,7 @@ public Object process(StatsAggregator statsAggregator) throws HiveException, Met if (statsAggregator != null) { // Update stats for transactional tables (MM, or full ACID with overwrite), even // though we are marking stats as not being accurate. - if (StatsSetupConst.areBasicStatsUptoDate(parameters) || p.isTransactionalTable()) { + if (StatsUtils.areBasicStatsUptoDate(p.getTable(), parameters) || p.isTransactionalTable()) { String prefix = getAggregationPrefix(p.getTable(), p.getPartition()); updateStats(statsAggregator, parameters, prefix, p.isAcid()); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index cef87f5957..d9537de2c1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1989,4 +1989,18 @@ public static Range combineRange(Range range1, Range range2) { } return null; } + + /** + * Same as equivalent method in StatsSetupConst, but with ql.Table param + */ + public static boolean areBasicStatsUptoDate(Table table, Map params) { + return StatsSetupConst.areBasicStatsUptoDate(table.getTTable(), params); + } + + /** + * Same as equivalent method in StatsSetupConst, but with ql.Table param + */ + public static boolean areColumnStatsUptoDate(Table table, Map params, String colName) { + return StatsSetupConst.areColumnStatsUptoDate(table.getTTable(), params, colName); + } } diff --git a/ql/src/test/queries/clientpositive/stats_only_external.q b/ql/src/test/queries/clientpositive/stats_only_external.q new file mode 100644 index 0000000000..44da9ac6a5 --- /dev/null +++ b/ql/src/test/queries/clientpositive/stats_only_external.q @@ -0,0 +1,30 @@ +--! qt:dataset:src + +set hive.compute.query.using.stats=true; +set hive.optimize.filter.stats.reduction=true; + +create table tab1 (key int, value string) stored as orc; +create external table tab1_ext like tab1 stored as orc; + +insert into tab1 select * from src; +insert into tab1_ext select * from src; + +analyze table tab1 compute statistics for columns; +analyze table tab1_ext compute statistics for columns; + +set test.comment=Regular table should should compute using stats; +set test.comment; +explain select count(*) from tab1; + +set test.comment=External table should not should compute using stats; +set test.comment; +explain select count(*) from tab1_ext; + +set test.comment=Query predicates removed due to column stats; +set test.comment; +explain select count(*) from tab1 where value is not null and key >= 0; + +set test.comment=Predicate removal disabled for external tables; +set test.comment; +explain select count(*) from tab1_ext where value is not null and key >= 0; + diff --git a/ql/src/test/results/clientpositive/llap/stats_only_external.q.out b/ql/src/test/results/clientpositive/llap/stats_only_external.q.out new file mode 100644 index 0000000000..366b05825f --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/stats_only_external.q.out @@ -0,0 +1,203 @@ +PREHOOK: query: create table tab1 (key int, value string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab1 +POSTHOOK: query: create table tab1 (key int, value string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab1 +PREHOOK: query: create external table tab1_ext like tab1 stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab1_ext +POSTHOOK: query: create external table tab1_ext like tab1 stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab1_ext +PREHOOK: query: insert into tab1 select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tab1 +POSTHOOK: query: insert into tab1 select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tab1 +POSTHOOK: Lineage: tab1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tab1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert into tab1_ext select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tab1_ext +POSTHOOK: query: insert into tab1_ext select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tab1_ext +POSTHOOK: Lineage: tab1_ext.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tab1_ext.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: analyze table tab1 compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@tab1 +PREHOOK: Output: default@tab1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table tab1 compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab1 +POSTHOOK: Output: default@tab1 +#### A masked pattern was here #### +PREHOOK: query: analyze table tab1_ext compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@tab1_ext +PREHOOK: Output: default@tab1_ext +#### A masked pattern was here #### +POSTHOOK: query: analyze table tab1_ext compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab1_ext +POSTHOOK: Output: default@tab1_ext +#### A masked pattern was here #### +test.comment=Regular table should should compute using stats +PREHOOK: query: explain select count(*) from tab1 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +test.comment=External table should not should compute using stats +PREHOOK: query: explain select count(*) from tab1_ext +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1_ext +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tab1_ext + Statistics: Num rows: 500 Data size: 47000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 500 Data size: 47000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +test.comment=Query predicates removed due to column stats +PREHOOK: query: explain select count(*) from tab1 where value is not null and key >= 0 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1 where value is not null and key >= 0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +test.comment=Predicate removal disabled for external tables +PREHOOK: query: explain select count(*) from tab1_ext where value is not null and key >= 0 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1_ext where value is not null and key >= 0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tab1_ext + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((key >= 0) and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java index 78ea01d968..c8c9f1fa62 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java @@ -23,8 +23,10 @@ import java.util.TreeMap; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.conf.MetastoreConf; import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -199,16 +201,18 @@ public Boolean deserialize(JsonParser jsonParser, } - public static boolean areBasicStatsUptoDate(Map params) { - if (params == null) { + public static boolean areBasicStatsUptoDate(Table table, Map params) { + // HIVE-19332: external tables should not be considered to have up-to-date stats. + if (params == null || MetaStoreUtils.isExternalTable(table)) { return false; } ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); return stats.basicStats; } - public static boolean areColumnStatsUptoDate(Map params, String colName) { - if (params == null) { + public static boolean areColumnStatsUptoDate(Table table, Map params, String colName) { + // HIVE-19332: external tables should not be considered to have up-to-date stats. + if (params == null || MetaStoreUtils.isExternalTable(table)) { return false; } ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE));