diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java index d53efbf..eeec186 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java @@ -185,8 +185,8 @@ protected abstract Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateP protected abstract Object process(ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException; - // for JOIN-RS case, it's not possible generally to merge if child has - // less key/partition columns than parents + // for JOIN-RS case, it's not possible generally to merge if child dose + // not has the same key/partition columns as parents protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer) throws SemanticException { List> parents = pJoin.getParentOperators(); @@ -194,7 +194,14 @@ protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReduc ReduceSinkDesc cRSc = cRS.getConf(); for (ReduceSinkOperator pRSNs : pRSs) { ReduceSinkDesc pRSNc = pRSNs.getConf(); - if (cRSc.getKeyCols().size() < pRSNc.getKeyCols().size()) { + + // It is not possible to delete reduce sink if distinct function + // is in group operator. Since distinct columns needs to be + // in the key for sorting. + if (cRSc.getDistinctColumnIndices() != null && cRSc.getDistinctColumnIndices().size() > 0) { + return false; + } + if (cRSc.getKeyCols().size() != pRSNc.getKeyCols().size()) { return false; } if (cRSc.getPartitionCols().size() != pRSNc.getPartitionCols().size()) { diff --git ql/src/test/queries/clientpositive/join_grp_diff_keys.q ql/src/test/queries/clientpositive/join_grp_diff_keys.q index fb110b4..dee3f3c 100644 --- ql/src/test/queries/clientpositive/join_grp_diff_keys.q +++ ql/src/test/queries/clientpositive/join_grp_diff_keys.q @@ -14,6 +14,12 @@ SELECT foo.id, count(*) as factor from WHERE foo.orders != 'blah' group by foo.id; +explain +SELECT foo.id, count(distinct foo.line_id) as factor from + foo JOIN bar ON (foo.id = bar.id) + WHERE foo.orders != 'blah' + group by foo.id; + drop table split; drop table bar; drop table foo; diff --git ql/src/test/results/clientpositive/join_grp_diff_keys.q.out ql/src/test/results/clientpositive/join_grp_diff_keys.q.out index 17688a9..37a3981 100644 --- ql/src/test/results/clientpositive/join_grp_diff_keys.q.out +++ ql/src/test/results/clientpositive/join_grp_diff_keys.q.out @@ -172,6 +172,110 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: explain +SELECT foo.id, count(distinct foo.line_id) as factor from + foo JOIN bar ON (foo.id = bar.id) + WHERE foo.orders != 'blah' + group by foo.id +PREHOOK: type: QUERY +POSTHOOK: query: explain +SELECT foo.id, count(distinct foo.line_id) as factor from + foo JOIN bar ON (foo.id = bar.id) + WHERE foo.orders != 'blah' + group by foo.id +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: foo + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: ((orders <> 'blah') and id is not null) (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: id (type: int), line_id (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: bar + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: id (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(DISTINCT _col1) + keys: _col0 (type: int), _col1 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Operator Tree: + Group By Operator + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + PREHOOK: query: drop table split PREHOOK: type: DROPTABLE PREHOOK: Input: default@split