diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index cf6d19a593..2330cca559 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -680,6 +680,7 @@ minillaplocal.query.files=\ special_character_in_tabnames_1.q,\ sqlmerge.q,\ stats_based_fetch_decision.q,\ + stats_only_external.q,\ subquery_in_having.q,\ subquery_notin.q,\ subquery_nested_subquery.q, \ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index a574372dbe..ca9db477c6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -68,6 +68,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin; @@ -446,7 +447,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = desc.getColumn(); StatType type = getType(desc.getTypeString()); if (!tbl.isPartitioned()) { - if (!StatsSetupConst.areBasicStatsUptoDate(tbl.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForPlanning(tbl, tbl.getParameters())) { Logger.debug("Stats for table : " + tbl.getTableName() + " are not up to date."); return null; } @@ -455,7 +456,7 @@ else if (udaf instanceof GenericUDAFCount) { Logger.debug("Table doesn't have up to date stats " + tbl.getTableName()); return null; } - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForPlanning(tbl, tbl.getParameters(), colName)) 
{ Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -478,7 +479,7 @@ else if (udaf instanceof GenericUDAFCount) { Set parts = pctx.getPrunedPartitions(tsOp.getConf().getAlias(), tsOp) .getPartitions(); for (Partition part : parts) { - if (!StatsSetupConst.areBasicStatsUptoDate(part.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForPlanning(part.getTable(), part.getParameters())) { Logger.debug("Stats for part : " + part.getSpec() + " are not up to date."); return null; } @@ -516,7 +517,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = colDesc.getColumn(); StatType type = getType(colDesc.getTypeString()); if(!tbl.isPartitioned()) { - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForPlanning(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -657,7 +658,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = colDesc.getColumn(); StatType type = getType(colDesc.getTypeString()); if (!tbl.isPartitioned()) { - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForPlanning(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -888,7 +889,7 @@ private ColumnStatisticsData validateSingleColStat(List sta Hive hive, Table tbl, String colName, Set parts) throws TException { List partNames = new ArrayList(parts.size()); for (Partition part : parts) { - if (!StatsSetupConst.areColumnStatsUptoDate(part.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForPlanning(part.getTable(), part.getParameters(), colName)) { Logger.debug("Stats for part : " + part.getSpec() + " column " + colName + " are not up to 
date."); return null; @@ -910,7 +911,7 @@ private Long getRowCnt( if (tbl.isPartitioned()) { for (Partition part : pctx.getPrunedPartitions( tsOp.getConf().getAlias(), tsOp).getPartitions()) { - if (!StatsSetupConst.areBasicStatsUptoDate(part.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForPlanning(part.getTable(), part.getParameters())) { return null; } Long partRowCnt = Long.parseLong(part.getParameters().get(StatsSetupConst.ROW_COUNT)); @@ -921,7 +922,7 @@ private Long getRowCnt( rowCnt += partRowCnt; } } else { // unpartitioned table - if (!StatsSetupConst.areBasicStatsUptoDate(tbl.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForPlanning(tbl, tbl.getParameters())) { return null; } rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT)); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java index 1edef98ea9..b0d658ebbb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java @@ -296,8 +296,8 @@ private ColStatistics extractColStats(RexInputRef ref) { if (table != null) { ColStatistics colStats = table.getColStat(Lists.newArrayList(columnOrigin.getOriginColumnOrdinal())).get(0); - if (colStats != null && StatsSetupConst.areColumnStatsUptoDate( - table.getHiveTableMD().getParameters(), colStats.getColumnName())) { + if (colStats != null && StatsUtils.areColumnStatsUptoDateForPlanning( + table.getHiveTableMD(), table.getHiveTableMD().getParameters(), colStats.getColumnName())) { return colStats; } } @@ -310,7 +310,8 @@ private Long extractRowCount(RexInputRef ref) { if (columnOrigin != null) { RelOptHiveTable table = (RelOptHiveTable) columnOrigin.getOriginTable(); if (table != null) { - if 
(StatsSetupConst.areBasicStatsUptoDate(table.getHiveTableMD().getParameters())) { + if (StatsUtils.areBasicStatsUptoDateForPlanning(table.getHiveTableMD(), + table.getHiveTableMD().getParameters())) { return StatsUtils.getNumRows(table.getHiveTableMD()); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index cef87f5957..7d2b54166e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Decimal; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.RowSchema; @@ -1989,4 +1990,28 @@ public static Range combineRange(Range range1, Range range2) { } return null; } + + /** + * Are the basic stats for the table up-to-date for query planning. + * Can run additional checks compared to the version in StatsSetupConst. + */ + public static boolean areBasicStatsUptoDateForPlanning(Table table, Map params) { + // HIVE-19332: external tables should not be considered to have up-to-date stats. + if (MetaStoreUtils.isExternalTable(table.getTTable())) { + return false; + } + return StatsSetupConst.areBasicStatsUptoDate(params); + } + + /** + * Are the column stats for the table up-to-date for query planning. + * Can run additional checks compared to the version in StatsSetupConst. + */ + public static boolean areColumnStatsUptoDateForPlanning(Table table, Map params, String colName) { + // HIVE-19332: external tables should not be considered to have up-to-date stats. 
+ if (MetaStoreUtils.isExternalTable(table.getTTable())) { + return false; + } + return StatsSetupConst.areColumnStatsUptoDate(params, colName); + } } diff --git a/ql/src/test/queries/clientpositive/stats_only_external.q b/ql/src/test/queries/clientpositive/stats_only_external.q new file mode 100644 index 0000000000..44da9ac6a5 --- /dev/null +++ b/ql/src/test/queries/clientpositive/stats_only_external.q @@ -0,0 +1,30 @@ +--! qt:dataset:src + +set hive.compute.query.using.stats=true; +set hive.optimize.filter.stats.reduction=true; + +create table tab1 (key int, value string) stored as orc; +create external table tab1_ext like tab1 stored as orc; + +insert into tab1 select * from src; +insert into tab1_ext select * from src; + +analyze table tab1 compute statistics for columns; +analyze table tab1_ext compute statistics for columns; + +set test.comment=Regular table should compute using stats; +set test.comment; +explain select count(*) from tab1; + +set test.comment=External table should not compute using stats; +set test.comment; +explain select count(*) from tab1_ext; + +set test.comment=Query predicates removed due to column stats; +set test.comment; +explain select count(*) from tab1 where value is not null and key >= 0; + +set test.comment=Predicate removal disabled for external tables; +set test.comment; +explain select count(*) from tab1_ext where value is not null and key >= 0; + diff --git a/ql/src/test/results/clientpositive/llap/stats_only_external.q.out b/ql/src/test/results/clientpositive/llap/stats_only_external.q.out new file mode 100644 index 0000000000..393377aa4b --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/stats_only_external.q.out @@ -0,0 +1,203 @@ +PREHOOK: query: create table tab1 (key int, value string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab1 +POSTHOOK: query: create table tab1 (key int, value string) stored as orc +POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab1 +PREHOOK: query: create external table tab1_ext like tab1 stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab1_ext +POSTHOOK: query: create external table tab1_ext like tab1 stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab1_ext +PREHOOK: query: insert into tab1 select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tab1 +POSTHOOK: query: insert into tab1 select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tab1 +POSTHOOK: Lineage: tab1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tab1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert into tab1_ext select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tab1_ext +POSTHOOK: query: insert into tab1_ext select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tab1_ext +POSTHOOK: Lineage: tab1_ext.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tab1_ext.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: analyze table tab1 compute statistics for columns +PREHOOK: type: ANALYZE_TABLE +PREHOOK: Input: default@tab1 +PREHOOK: Output: default@tab1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table tab1 compute statistics for columns +POSTHOOK: type: ANALYZE_TABLE +POSTHOOK: Input: default@tab1 +POSTHOOK: Output: default@tab1 +#### A masked pattern was here #### +PREHOOK: query: analyze table tab1_ext compute statistics for columns +PREHOOK: type: ANALYZE_TABLE +PREHOOK: Input: default@tab1_ext +PREHOOK: Output: default@tab1_ext +#### A masked pattern was here 
#### +POSTHOOK: query: analyze table tab1_ext compute statistics for columns +POSTHOOK: type: ANALYZE_TABLE +POSTHOOK: Input: default@tab1_ext +POSTHOOK: Output: default@tab1_ext +#### A masked pattern was here #### +test.comment=Regular table should compute using stats +PREHOOK: query: explain select count(*) from tab1 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +test.comment=External table should not compute using stats +PREHOOK: query: explain select count(*) from tab1_ext +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1_ext +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tab1_ext + Statistics: Num rows: 500 Data size: 47000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 500 Data size: 47000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + 
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +test.comment=Query predicates removed due to column stats +PREHOOK: query: explain select count(*) from tab1 where value is not null and key >= 0 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1 where value is not null and key >= 0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +test.comment=Predicate removal disabled for external tables +PREHOOK: query: explain select count(*) from tab1_ext where value is not null and key >= 0 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from tab1_ext where value is not null and key >= 0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tab1_ext + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((key >= 0) and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num 
rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +