From 7cac825ed2c4ef8b5dfaa3ca6ed9ba4c679322a4 Mon Sep 17 00:00:00 2001
From: Ashutosh Chauhan
Date: Mon, 4 May 2015 22:25:12 -0700
Subject: [PATCH] HIVE-10607 : Combination of ReduceSinkDedup + TopN optimization yields incorrect results if there are multiple GBY in reducer

---
 .../hive/ql/optimizer/LimitPushdownOptimizer.java |  9 ++-
 .../test/queries/clientpositive/limit_pushdown.q  |  4 +
 .../results/clientpositive/limit_pushdown.q.out   | 88 ++++++++++++++++++++
 .../clientpositive/tez/limit_pushdown.q.out       | 94 ++++++++++++++++++++++
 4 files changed, 194 insertions(+), 1 deletion(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
index f80941e..e850550 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
@@ -28,6 +28,7 @@
 import org.apache.hadoop.hive.ql.exec.GroupByOperator;
 import org.apache.hadoop.hive.ql.exec.LimitOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorUtils;
 import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
 import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
 import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
@@ -86,6 +87,7 @@
  */
 public class LimitPushdownOptimizer implements Transform {
 
+  @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
     Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
     opRules.put(new RuleRegExp("R1",
@@ -105,6 +107,7 @@ public ParseContext transform(ParseContext pctx) throws SemanticException {
 
   private static class TopNReducer implements NodeProcessor {
 
+    @Override
     public Object process(Node nd, Stack<Node> stack,
        NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
      ReduceSinkOperator rs = null;
@@ -122,6 +125,10 @@ public Object process(Node nd, Stack<Node> stack,
        }
      }
      if (rs != null) {
+        if (OperatorUtils.findOperators(rs, GroupByOperator.class).size() > 1) {
+          // Not safe to continue for RS-GBY-GBY-LIM kind of pipelines. See HIVE-10607 for more.
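+          // With ReduceSink de-duplication a single RS feeds both GroupBys: its key is the
+          // wider key of the first GBY, while the limit belongs to the query above the second
+          // GBY. Pushing that limit down as a top-N filter on this RS can drop rows the second
+          // GBY still needs, so its aggregates would be computed over a truncated input.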
+          return false;
+        }
        LimitOperator limit = (LimitOperator) nd;
        rs.getConf().setTopN(limit.getConf().getLimit());
        rs.getConf().setTopNMemoryUsage(((LimitPushdownContext) procCtx).threshold);
@@ -135,7 +142,7 @@ public Object process(Node nd, Stack<Node> stack,
 
   private static class LimitPushdownContext implements NodeProcessorCtx {
 
-    private float threshold;
+    private final float threshold;
 
     public LimitPushdownContext(HiveConf conf) throws SemanticException {
       threshold = conf.getFloatVar(HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE);
diff --git a/ql/src/test/queries/clientpositive/limit_pushdown.q b/ql/src/test/queries/clientpositive/limit_pushdown.q
index d93e246..3940564 100644
--- a/ql/src/test/queries/clientpositive/limit_pushdown.q
+++ b/ql/src/test/queries/clientpositive/limit_pushdown.q
@@ -31,6 +31,10 @@ explain
 select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
 select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
 
+explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+
 -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20;
diff --git a/ql/src/test/results/clientpositive/limit_pushdown.q.out b/ql/src/test/results/clientpositive/limit_pushdown.q.out
index c7ab7b3..6ace047 100644
--- a/ql/src/test/results/clientpositive/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/limit_pushdown.q.out
@@ -504,6 +504,94 @@ POSTHOOK: Input: default@alltypesorc
 -63	19
 -64	24
 NULL	2932
+PREHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: alltypesorc
+            Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: ctinyint (type: tinyint), cdouble (type: double)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                keys: _col0 (type: tinyint), _col1 (type: double)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: tinyint), _col1 (type: double)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: tinyint)
+                  Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+          Group By Operator
+            aggregations: count(_col1)
+            keys: _col0 (type: tinyint)
+            mode: complete
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+            Limit
+              Number of rows: 20
+              Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46	24
+-47	22
+-48	29
+-49	26
+-50	30
+-51	21
+-52	33
+-53	22
+-54	26
+-55	29
+-56	36
+-57	35
+-58	23
+-59	31
+-60	27
+-61	25
+-62	27
+-63	19
+-64	24
+NULL	2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20
diff --git a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
index 952d7ff..7038b4d 100644
--- a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
@@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc
 -63	19
 -64	24
 NULL	2932
+PREHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: alltypesorc
+                  Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: ctinyint (type: tinyint), cdouble (type: double)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: _col0 (type: tinyint), _col1 (type: double)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: tinyint), _col1 (type: double)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: tinyint)
+                        Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(_col1)
+                  keys: _col0 (type: tinyint)
+                  mode: complete
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
+                    Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46	24
+-47	22
+-48	29
+-49	26
+-50	30
+-51	21
+-52	33
+-53	22
+-54	26
+-55	29
+-56	36
+-57	35
+-58	23
+-59	31
+-60	27
+-61	25
+-62	27
+-63	19
+-64	24
+NULL	2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20
-- 
1.7.12.4 (Apple Git-37)
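A standalone sketch of the failure mode the new guard prevents (illustrative only: the class name, row values, and limit are made up, and the RS top-N pruning is mimicked with a plain sort-and-truncate rather than Hive's TopNHash). It shows why counting after that pruning goes wrong when two GroupBys share one deduplicated ReduceSink.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class TopNOverDedupedRsSketch {
  public static void main(String[] args) {
    // (k1, k2) pairs standing in for (ctinyint, cdouble) rows arriving at the single,
    // deduplicated ReduceSink that feeds GBY(k1, k2) and then GBY(k1).
    List<int[]> rows = Arrays.asList(
        new int[]{1, 10}, new int[]{1, 20}, new int[]{1, 30},
        new int[]{2, 10}, new int[]{2, 20});
    int limit = 2; // the LIMIT of the outer query

    // Without top-N on the RS: the second GroupBy sees every row; the limit is applied later.
    System.out.println("no top-N on RS: " + countPerK1(rows));   // {1=3, 2=2}

    // With top-N pushed into the RS keyed by (k1, k2): only 'limit' rows survive, so the
    // second GroupBy counts over a truncated input and key 2 disappears entirely.
    List<int[]> pruned = new ArrayList<>(rows);
    pruned.sort((a, b) -> a[0] != b[0] ? Integer.compare(a[0], b[0])
                                       : Integer.compare(a[1], b[1]));
    pruned = pruned.subList(0, Math.min(limit, pruned.size()));
    System.out.println("top-N on RS:    " + countPerK1(pruned)); // {1=2}
  }

  // Stand-in for the reducer-side GroupBy: count of k2 values per k1.
  private static Map<Integer, Integer> countPerK1(List<int[]> rows) {
    Map<Integer, Integer> counts = new TreeMap<>();
    for (int[] r : rows) {
      counts.merge(r[0], 1, Integer::sum);
    }
    return counts;
  }
}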