diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index fa1a4fb603..14a93a15ac 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -670,6 +670,7 @@ minillaplocal.query.files=\ special_character_in_tabnames_1.q,\ sqlmerge.q,\ stats_based_fetch_decision.q,\ + stats_only_external.q,\ subquery_in_having.q,\ subquery_notin.q,\ subquery_nested_subquery.q, \ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index 5788d4906e..857f3004d7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -59,6 +59,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin; @@ -447,7 +448,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = desc.getColumn(); StatType type = getType(desc.getTypeString()); if (!tbl.isPartitioned()) { - if (!StatsSetupConst.areBasicStatsUptoDate(tbl.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { Logger.debug("Stats for table : " + tbl.getTableName() + " are not up to date."); return null; } @@ -456,7 +457,7 @@ else if (udaf instanceof GenericUDAFCount) { Logger.debug("Table doesn't have up to date stats " + tbl.getTableName()); return null; } - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForQueryAnswering(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -479,7 +480,7 @@ else if (udaf instanceof GenericUDAFCount) { Set parts = pctx.getPrunedPartitions(tsOp.getConf().getAlias(), tsOp) .getPartitions(); for (Partition part : parts) { - if (!StatsSetupConst.areBasicStatsUptoDate(part.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(part.getTable(), part.getParameters())) { Logger.debug("Stats for part : " + part.getSpec() + " are not up to date."); return null; } @@ -517,7 +518,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = colDesc.getColumn(); StatType type = getType(colDesc.getTypeString()); if(!tbl.isPartitioned()) { - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForQueryAnswering(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -658,7 +659,7 @@ else if (udaf instanceof GenericUDAFCount) { String colName = colDesc.getColumn(); StatType type = getType(colDesc.getTypeString()); if (!tbl.isPartitioned()) { - if (!StatsSetupConst.areColumnStatsUptoDate(tbl.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForQueryAnswering(tbl, tbl.getParameters(), colName)) { Logger.debug("Stats for table : " + tbl.getTableName() + " column " + colName + " are not up to date."); return null; @@ -889,7 +890,7 @@ private ColumnStatisticsData validateSingleColStat(List sta Hive hive, Table tbl, String colName, Set parts) throws TException { List partNames = new ArrayList(parts.size()); for (Partition part : parts) { - if (!StatsSetupConst.areColumnStatsUptoDate(part.getParameters(), colName)) { + if (!StatsUtils.areColumnStatsUptoDateForQueryAnswering(part.getTable(), part.getParameters(), colName)) { Logger.debug("Stats for part : " + part.getSpec() + " column " + colName + " are not up to date."); return null; @@ -911,7 +912,7 @@ private Long getRowCnt( if (tbl.isPartitioned()) { for (Partition part : pctx.getPrunedPartitions( tsOp.getConf().getAlias(), tsOp).getPartitions()) { - if (!StatsSetupConst.areBasicStatsUptoDate(part.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(part.getTable(), part.getParameters())) { return null; } Long partRowCnt = Long.parseLong(part.getParameters().get(StatsSetupConst.ROW_COUNT)); @@ -922,7 +923,7 @@ private Long getRowCnt( rowCnt += partRowCnt; } } else { // unpartitioned table - if (!StatsSetupConst.areBasicStatsUptoDate(tbl.getParameters())) { + if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { return null; } rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT)); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java index 1edef98ea9..085ad3e919 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java @@ -296,8 +296,8 @@ private ColStatistics extractColStats(RexInputRef ref) { if (table != null) { ColStatistics colStats = table.getColStat(Lists.newArrayList(columnOrigin.getOriginColumnOrdinal())).get(0); - if (colStats != null && StatsSetupConst.areColumnStatsUptoDate( - table.getHiveTableMD().getParameters(), colStats.getColumnName())) { + if (colStats != null && StatsUtils.areColumnStatsUptoDateForQueryAnswering( + table.getHiveTableMD(), table.getHiveTableMD().getParameters(), colStats.getColumnName())) { return colStats; } } @@ -310,7 +310,8 @@ private Long extractRowCount(RexInputRef ref) { if (columnOrigin != null) { RelOptHiveTable table = (RelOptHiveTable) columnOrigin.getOriginTable(); if (table != null) { - if (StatsSetupConst.areBasicStatsUptoDate(table.getHiveTableMD().getParameters())) { + if (StatsUtils.areBasicStatsUptoDateForQueryAnswering(table.getHiveTableMD(), + table.getHiveTableMD().getParameters())) { return StatsUtils.getNumRows(table.getHiveTableMD()); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 952b4abb2a..e00098529f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Decimal; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.RowSchema; @@ -1996,4 +1997,28 @@ public static Range combineRange(Range range1, Range range2) { } return null; } + + /** + * Are the basic stats for the table up-to-date for query planning. + * Can run additional checks compared to the version in StatsSetupConst. + */ + public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map params) { + // HIVE-19332: external tables should not be considered to have up-to-date stats. + if (MetaStoreUtils.isExternalTable(table.getTTable())) { + return false; + } + return StatsSetupConst.areBasicStatsUptoDate(params); + } + + /** + * Are the column stats for the table up-to-date for query planning. + * Can run additional checks compared to the version in StatsSetupConst. + */ + public static boolean areColumnStatsUptoDateForQueryAnswering(Table table, Map params, String colName) { + // HIVE-19332: external tables should not be considered to have up-to-date stats. + if (MetaStoreUtils.isExternalTable(table.getTTable())) { + return false; + } + return StatsSetupConst.areColumnStatsUptoDate(params, colName); + } } diff --git a/ql/src/test/queries/clientpositive/stats_only_external.q b/ql/src/test/queries/clientpositive/stats_only_external.q new file mode 100644 index 0000000000..f379fc1523 --- /dev/null +++ b/ql/src/test/queries/clientpositive/stats_only_external.q @@ -0,0 +1,35 @@ +--! qt:dataset:src + +set hive.compute.query.using.stats=true; +set hive.optimize.filter.stats.reduction=true; + +drop table if exists stats_only_external_tab1; +drop table if exists stats_only_external_tab1_ext; + +create table stats_only_external_tab1 (key int, value string) stored as orc; +create external table stats_only_external_tab1_ext like stats_only_external_tab1 stored as orc; + +insert into stats_only_external_tab1 select * from src; +insert into stats_only_external_tab1_ext select * from src; + +analyze table stats_only_external_tab1 compute statistics for columns; +analyze table stats_only_external_tab1_ext compute statistics for columns; + +set test.comment=Regular table should should compute using stats; +set test.comment; +explain select count(*) from stats_only_external_tab1; + +set test.comment=External table should not should compute using stats; +set test.comment; +explain select count(*) from stats_only_external_tab1_ext; + +set test.comment=Query predicates removed due to column stats; +set test.comment; +explain select count(*) from stats_only_external_tab1 where value is not null and key >= 0; + +set test.comment=Predicate removal disabled for external tables; +set test.comment; +explain select count(*) from stats_only_external_tab1_ext where value is not null and key >= 0; + +drop table stats_only_external_tab1; +drop table stats_only_external_tab1_ext; diff --git a/ql/src/test/results/clientpositive/llap/stats_only_external.q.out b/ql/src/test/results/clientpositive/llap/stats_only_external.q.out new file mode 100644 index 0000000000..554aa3e257 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/stats_only_external.q.out @@ -0,0 +1,227 @@ +PREHOOK: query: drop table if exists stats_only_external_tab1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists stats_only_external_tab1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists stats_only_external_tab1_ext +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists stats_only_external_tab1_ext +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table stats_only_external_tab1 (key int, value string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@stats_only_external_tab1 +POSTHOOK: query: create table stats_only_external_tab1 (key int, value string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@stats_only_external_tab1 +PREHOOK: query: create external table stats_only_external_tab1_ext like stats_only_external_tab1 stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@stats_only_external_tab1_ext +POSTHOOK: query: create external table stats_only_external_tab1_ext like stats_only_external_tab1 stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@stats_only_external_tab1_ext +PREHOOK: query: insert into stats_only_external_tab1 select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@stats_only_external_tab1 +POSTHOOK: query: insert into stats_only_external_tab1 select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@stats_only_external_tab1 +POSTHOOK: Lineage: stats_only_external_tab1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: stats_only_external_tab1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert into stats_only_external_tab1_ext select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@stats_only_external_tab1_ext +POSTHOOK: query: insert into stats_only_external_tab1_ext select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@stats_only_external_tab1_ext +POSTHOOK: Lineage: stats_only_external_tab1_ext.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: stats_only_external_tab1_ext.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: analyze table stats_only_external_tab1 compute statistics for columns +PREHOOK: type: ANALYZE_TABLE +PREHOOK: Input: default@stats_only_external_tab1 +PREHOOK: Output: default@stats_only_external_tab1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table stats_only_external_tab1 compute statistics for columns +POSTHOOK: type: ANALYZE_TABLE +POSTHOOK: Input: default@stats_only_external_tab1 +POSTHOOK: Output: default@stats_only_external_tab1 +#### A masked pattern was here #### +PREHOOK: query: analyze table stats_only_external_tab1_ext compute statistics for columns +PREHOOK: type: ANALYZE_TABLE +PREHOOK: Input: default@stats_only_external_tab1_ext +PREHOOK: Output: default@stats_only_external_tab1_ext +#### A masked pattern was here #### +POSTHOOK: query: analyze table stats_only_external_tab1_ext compute statistics for columns +POSTHOOK: type: ANALYZE_TABLE +POSTHOOK: Input: default@stats_only_external_tab1_ext +POSTHOOK: Output: default@stats_only_external_tab1_ext +#### A masked pattern was here #### +test.comment=Regular table should should compute using stats +PREHOOK: query: explain select count(*) from stats_only_external_tab1 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from stats_only_external_tab1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +test.comment=External table should not should compute using stats +PREHOOK: query: explain select count(*) from stats_only_external_tab1_ext +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from stats_only_external_tab1_ext +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: stats_only_external_tab1_ext + Statistics: Num rows: 500 Data size: 47000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 500 Data size: 47000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +test.comment=Query predicates removed due to column stats +PREHOOK: query: explain select count(*) from stats_only_external_tab1 where value is not null and key >= 0 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from stats_only_external_tab1 where value is not null and key >= 0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +test.comment=Predicate removal disabled for external tables +PREHOOK: query: explain select count(*) from stats_only_external_tab1_ext where value is not null and key >= 0 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from stats_only_external_tab1_ext where value is not null and key >= 0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: stats_only_external_tab1_ext + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((key >= 0) and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: drop table stats_only_external_tab1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@stats_only_external_tab1 +PREHOOK: Output: default@stats_only_external_tab1 +POSTHOOK: query: drop table stats_only_external_tab1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@stats_only_external_tab1 +POSTHOOK: Output: default@stats_only_external_tab1 +PREHOOK: query: drop table stats_only_external_tab1_ext +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@stats_only_external_tab1_ext +PREHOOK: Output: default@stats_only_external_tab1_ext +POSTHOOK: query: drop table stats_only_external_tab1_ext +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@stats_only_external_tab1_ext +POSTHOOK: Output: default@stats_only_external_tab1_ext