diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index fdd8ecc77c..6f4b13fc14 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -382,6 +382,7 @@ minillap.query.files=acid_bucket_pruning.q,\
   except_distinct.q,\
   explainuser_2.q,\
   empty_dir_in_table.q,\
+  gby_stats_estimate.q,\
   intersect_all.q,\
   intersect_distinct.q,\
   intersect_merge.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
index 456786c240..b3d9453d1b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
@@ -29,9 +29,12 @@ import java.util.Stack;
 
 import org.apache.hadoop.hive.ql.exec.NodeUtils.Function;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;
 import org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator;
 import org.apache.hadoop.hive.ql.plan.BaseWork;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
 import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
@@ -501,4 +504,51 @@ public static boolean isInBranch(SparkPartitionPruningSinkOperator op) {
     }
     return;
   }
+
+  private static List<ExprNodeDesc> backtrackAll(List<ExprNodeDesc> exprs, Operator<?> start,
+      Operator<?> terminal) {
+    List<ExprNodeDesc> backtrackedExprs = new ArrayList<>();
+    try {
+      for (ExprNodeDesc expr : exprs) {
+        ExprNodeDesc backtrackedExpr = ExprNodeDescUtils.backtrack(expr, start, terminal);
+        if (backtrackedExpr == null) {
+          return null;
+        }
+        backtrackedExprs.add(backtrackedExpr);
+      }
+    } catch (SemanticException e) {
+      return null;
+    }
+    return backtrackedExprs;
+  }
+
+  public static Operator<?> findSourceRS(Operator<?> start, List<ExprNodeDesc> exprs) {
+    Operator<?> currRS = null;
+    if (start instanceof ReduceSinkOperator) {
+      currRS = start;
+    }
+    List<Operator<? extends OperatorDesc>> parents = start.getParentOperators();
+    if (parents == null || parents.isEmpty()) {
+      // reached end, e.g. TS operator
+      return null;
+    }
+
+    Operator<? extends OperatorDesc> nextOp = null;
+    List<ExprNodeDesc> backtrackedExprs = null;
+    for (int i = 0; i < parents.size(); i++) {
+      backtrackedExprs = backtrackAll(exprs, start, parents.get(i));
+      if (backtrackedExprs != null) {
+        nextOp = parents.get(i);
+        break;
+      }
+    }
+
+    if (nextOp != null) {
+      Operator<?> nextRS = findSourceRS(nextOp, backtrackedExprs);
+      if (nextRS != null) {
+        currRS = nextRS;
+      }
+    }
+    return currRS;
+  }
 }
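How findSourceRS works: starting from the given operator, it rewrites the supplied expressions in terms of each parent's columns via ExprNodeDescUtils.backtrack and recurses upward, remembering the deepest ReduceSinkOperator through which every expression can still be traced; the walk stops when backtracking fails (a computed or renamed column) or a leaf such as a TableScan is reached. The following is a minimal, self-contained Java sketch of that traversal on a toy operator tree. Op, its per-parent column maps, and the wiring in main() are invented stand-ins for illustration, not Hive classes:

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class FindSourceRsSketch {

      static class Op {
        final String name;
        final boolean reduceSink;
        final List<Op> parents = new ArrayList<>();
        // For each parent: output column -> the parent column it passes through.
        // A missing entry means the column is computed and cannot be traced.
        final Map<Op, Map<String, String>> colMap = new HashMap<>();
        Op(String name, boolean reduceSink) {
          this.name = name;
          this.reduceSink = reduceSink;
        }
      }

      // Rewrite every column in terms of one parent's columns, or return null
      // if any column cannot be traced (mirrors backtrackAll in the patch).
      static List<String> backtrackAll(List<String> cols, Op op, Op parent) {
        Map<String, String> m = op.colMap.getOrDefault(parent, Map.of());
        List<String> out = new ArrayList<>();
        for (String c : cols) {
          String p = m.get(c);
          if (p == null) {
            return null;
          }
          out.add(p);
        }
        return out;
      }

      // Return the deepest ReduceSink up the tree through which all columns
      // can be traced (mirrors findSourceRS in the patch).
      static Op findSourceRS(Op start, List<String> cols) {
        Op currRS = start.reduceSink ? start : null;
        if (start.parents.isEmpty()) {
          return null; // reached a leaf, e.g. a table scan
        }
        for (Op parent : start.parents) {
          List<String> backtracked = backtrackAll(cols, start, parent);
          if (backtracked != null) {
            Op deeper = findSourceRS(parent, backtracked);
            return deeper != null ? deeper : currRS;
          }
        }
        return currRS;
      }

      public static void main(String[] args) {
        // TS(t1) -> RS(t1) -> JOIN <- RS(t2) <- TS(t2)
        Op ts1 = new Op("TS-t1", false);
        Op rs1 = new Op("RS-t1", true);
        rs1.parents.add(ts1);
        rs1.colMap.put(ts1, Map.of("i", "i", "j", "j"));
        Op ts2 = new Op("TS-t2", false);
        Op rs2 = new Op("RS-t2", true);
        rs2.parents.add(ts2);
        Op join = new Op("JOIN", false);
        join.parents.add(rs1);
        join.parents.add(rs2);
        join.colMap.put(rs1, Map.of("_col0", "i", "_col1", "j"));
        join.colMap.put(rs2, Map.of("_col2", "i2"));
        // The group-by keys (_col0, _col1) both come from the t1 side.
        Op src = findSourceRS(join, List.of("_col0", "_col1"));
        System.out.println(src.name); // prints RS-t1
      }
    }

Returning null at a leaf (rather than the leaf itself) is what lets each caller keep the last ReduceSink it saw, which is the operator whose row count bounds the number of distinct key combinations.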
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 4c5695c68a..465badaf82 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -1381,7 +1381,9 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
           }
         } else {
           // Case 3: column stats, hash aggregation, NO grouping sets
-          cardinality = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));
+          cardinality = Math.min(parentNumRows, StatsUtils.safeMult(ndvProduct, parallelism));
+          long orgParentNumRows = getParentNumRows(gop, gop.getConf().getKeys(), conf);
+          cardinality = Math.min(cardinality, orgParentNumRows);
 
           if (LOG.isDebugEnabled()) {
             LOG.debug("[Case 3] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -1397,7 +1399,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
           }
         } else {
           // Case 5: column stats, NO hash aggregation, NO grouping sets
-          cardinality = parentNumRows;
+          cardinality = Math.min(parentNumRows, getParentNumRows(gop, gop.getConf().getKeys(), conf));
 
           if (LOG.isDebugEnabled()) {
             LOG.debug("[Case 5] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -1521,6 +1523,17 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       return null;
     }
 
+    private long getParentNumRows(GroupByOperator op, List<ExprNodeDesc> gbyKeys, HiveConf conf) {
+      if (gbyKeys == null || gbyKeys.isEmpty()) {
+        return op.getParentOperators().get(0).getStatistics().getNumRows();
+      }
+      Operator<?> parent = OperatorUtils.findSourceRS(op, gbyKeys);
+      if (parent != null) {
+        return parent.getStatistics().getNumRows();
+      }
+      return op.getParentOperators().get(0).getStatistics().getNumRows();
+    }
+
     /**
      * This method does not take into account many configs used at runtime to
      * disable hash aggregation like HIVEMAPAGGRHASHMINREDUCTION. This method
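The motivation for the new Math.min bounds: after a row-multiplying join, parentNumRows counts joined rows, but the number of distinct group-by key combinations can never exceed the row count of the operator that actually produced those keys. getParentNumRows therefore backtracks the group-by keys to their source ReduceSink via OperatorUtils.findSourceRS and uses that operator's row count as an additional cap in Case 3 and Case 5. Below is a small arithmetic check of the Case 3 math using the statistics from the test in this patch; the variable names are local to this sketch, and parallelism = 1 is an assumption rather than a value taken from the plan:

    public class Case3BoundSketch {
      public static void main(String[] args) {
        long parentNumRows = 99_700L;    // join output feeding the GroupBy (see plan below)
        long ndvProduct = 2_500L * 500L; // NDV(i) * NDV(j) from t1_uq12's column stats
        long parallelism = 1L;           // assumed for this sketch
        long orgParentNumRows = 9_970L;  // rows at t1_uq12's ReduceSink, found by findSourceRS

        long oldEstimate = Math.min(parentNumRows / 2, ndvProduct * parallelism);
        long cardinality = Math.min(parentNumRows, ndvProduct * parallelism);
        cardinality = Math.min(cardinality, orgParentNumRows);

        System.out.println(oldEstimate); // 49850: the old bound never drops below half the join size
        System.out.println(cardinality); // 9970: matches "Num rows: 9970" at the Group By below
      }
    }

The old formula would have kept the estimate at roughly 49,850 rows here; capping by the key-producing side brings it down to 9,970, which is exactly what the golden plan shows for the Group By Operator on Reducer 2.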
diff --git a/ql/src/test/queries/clientpositive/gby_stats_estimate.q b/ql/src/test/queries/clientpositive/gby_stats_estimate.q
new file mode 100644
index 0000000000..1967228303
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/gby_stats_estimate.q
@@ -0,0 +1,19 @@
+set hive.stats.fetch.column.stats=true;
+
+create table t1_uq12(i int, j int);
+alter table t1_uq12 update statistics set('numRows'='10000', 'rawDataSize'='18000');
+alter table t1_uq12 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0');
+alter table t1_uq12 update statistics for column j set('numDVs'='500','numNulls'='30','highValue'='100','lowValue'='50');
+
+create table t2_uq12(i2 int, j2 int);
+alter table t2_uq12 update statistics set('numRows'='100000000', 'rawDataSize'='10000');
+alter table t2_uq12 update statistics for column i2 set('numDVs'='10000000','numNulls'='0','highValue'='8000','lowValue'='0');
+alter table t2_uq12 update statistics for column j2 set('numDVs'='10','numNulls'='0','highValue'='800','lowValue'='-1');
+
+
+explain select count (1) from t1_uq12,t2_uq12
+  where t1_uq12.j=t2_uq12.i2
+group by t1_uq12.i, t1_uq12.j;
+
+drop table t1_uq12;
+drop table t2_uq12;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/gby_stats_estimate.q.out b/ql/src/test/results/clientpositive/llap/gby_stats_estimate.q.out
new file mode 100644
index 0000000000..5055be8dae
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/gby_stats_estimate.q.out
@@ -0,0 +1,225 @@
+PREHOOK: query: create table t1_uq12(i int, j int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: create table t1_uq12(i int, j int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: alter table t1_uq12 update statistics set('numRows'='10000', 'rawDataSize'='18000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: alter table t1_uq12 update statistics set('numRows'='10000', 'rawDataSize'='18000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: alter table t1_uq12 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: alter table t1_uq12 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: alter table t1_uq12 update statistics for column j set('numDVs'='500','numNulls'='30','highValue'='100','lowValue'='50')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: alter table t1_uq12 update statistics for column j set('numDVs'='500','numNulls'='30','highValue'='100','lowValue'='50')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: create table t2_uq12(i2 int, j2 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2_uq12
+POSTHOOK: query: create table t2_uq12(i2 int, j2 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: 
database:default +POSTHOOK: Output: default@t2_uq12 +PREHOOK: query: alter table t2_uq12 update statistics set('numRows'='100000000', 'rawDataSize'='10000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2_uq12 +PREHOOK: Output: default@t2_uq12 +POSTHOOK: query: alter table t2_uq12 update statistics set('numRows'='100000000', 'rawDataSize'='10000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t2_uq12 +POSTHOOK: Output: default@t2_uq12 +PREHOOK: query: alter table t2_uq12 update statistics for column i2 set('numDVs'='10000000','numNulls'='0','highValue'='8000','lowValue'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2_uq12 +PREHOOK: Output: default@t2_uq12 +POSTHOOK: query: alter table t2_uq12 update statistics for column i2 set('numDVs'='10000000','numNulls'='0','highValue'='8000','lowValue'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t2_uq12 +POSTHOOK: Output: default@t2_uq12 +PREHOOK: query: alter table t2_uq12 update statistics for column j2 set('numDVs'='10','numNulls'='0','highValue'='800','lowValue'='-1') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2_uq12 +PREHOOK: Output: default@t2_uq12 +POSTHOOK: query: alter table t2_uq12 update statistics for column j2 set('numDVs'='10','numNulls'='0','highValue'='800','lowValue'='-1') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t2_uq12 +POSTHOOK: Output: default@t2_uq12 +PREHOOK: query: explain select count (1) from t1_uq12,t2_uq12 + where t1_uq12.j=t2_uq12.i2 +group by t1_uq12.i, t1_uq12.j +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_uq12 +PREHOOK: Input: default@t2_uq12 +#### A masked pattern was here #### +POSTHOOK: query: explain select count (1) from t1_uq12,t2_uq12 + where t1_uq12.j=t2_uq12.i2 +group by t1_uq12.i, t1_uq12.j +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_uq12 +POSTHOOK: Input: default@t2_uq12 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1_uq12 + filterExpr: j is not null (type: boolean) + Statistics: Num rows: 10000 Data size: 79688 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: j is not null (type: boolean) + Statistics: Num rows: 9970 Data size: 79448 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: i (type: int), j (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 9970 Data size: 79448 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Map-reduce partition columns: _col1 (type: int) + Statistics: Num rows: 9970 Data size: 79448 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int) + Select Operator + expressions: _col1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 9970 Data size: 79528 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 
Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized, llap + LLAP IO: no inputs + Map 5 + Map Operator Tree: + TableScan + alias: t2_uq12 + filterExpr: (i2 is not null and (i2 BETWEEN DynamicValue(RS_6_t1_uq12_j_min) AND DynamicValue(RS_6_t1_uq12_j_max) and in_bloom_filter(i2, DynamicValue(RS_6_t1_uq12_j_bloom_filter)))) (type: boolean) + Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((i2 BETWEEN DynamicValue(RS_6_t1_uq12_j_min) AND DynamicValue(RS_6_t1_uq12_j_max) and in_bloom_filter(i2, DynamicValue(RS_6_t1_uq12_j_bloom_filter))) and i2 is not null) (type: boolean) + Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: i2 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 99700 Data size: 797288 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + keys: _col0 (type: int), _col1 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9970 Data size: 159496 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int), _col1 (type: int) + Statistics: Num rows: 9970 Data size: 159496 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: bigint) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9970 Data size: 159496 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col2 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 9970 Data size: 79760 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 9970 Data size: 79760 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: 
COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: drop table t1_uq12 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t1_uq12 +PREHOOK: Output: default@t1_uq12 +POSTHOOK: query: drop table t1_uq12 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t1_uq12 +POSTHOOK: Output: default@t1_uq12 +PREHOOK: query: drop table t2_uq12 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t2_uq12 +PREHOOK: Output: default@t2_uq12 +POSTHOOK: query: drop table t2_uq12 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t2_uq12 +POSTHOOK: Output: default@t2_uq12
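The lines to check in the golden output above are the Group By Operator on Reducer 2 and its mergepartial counterpart on Reducer 3: both are estimated at 9,970 rows even though the join feeding them produces 99,700, confirming that the group-by estimate is now capped by the t1_uq12 side that supplies the keys. Assuming the standard Hive qtest workflow (the module and flags below are the usual ones and may vary by branch), the test can be run, and the golden file regenerated, with:

    cd itests/qtest
    mvn test -Dtest=TestMiniLlapDriver -Dqfile=gby_stats_estimate.q
    mvn test -Dtest=TestMiniLlapDriver -Dqfile=gby_stats_estimate.q -Dtest.output.overwrite=true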