diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java index c380a2d..b206448 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java @@ -27,6 +27,7 @@ import java.util.Map; import java.util.Stack; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ExtractOperator; import org.apache.hadoop.hive.ql.exec.GroupByOperator; import org.apache.hadoop.hive.ql.exec.JoinOperator; @@ -230,8 +231,13 @@ protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReduc } /** - * Current RSDedup remove/replace child RS. So always copies + * Current RSDedup remove/replace child RS. For key columns, + * sorting order, and the number of reducers, copy * more specific part of configurations of child RS to that of parent RS. + * For partitioning columns, if both child RS and parent RS have been assigned + * partitioning columns, we will choose the more general partitioning columns. + * If parent RS has not been assigned any partitioning column, we will use + * partitioning columns (if exist) of child RS. */ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { @@ -239,20 +245,57 @@ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minR if (result == null) { return false; } + if (result[0] > 0) { - ArrayList childKCs = cRS.getConf().getKeyCols(); + // The sorting columns of the child RS are more specific than + // those of the parent RS. Assign sorting columns of the child RS + // to the parent RS. + List childKCs = cRS.getConf().getKeyCols(); pRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS)); } - if (result[1] > 0) { - ArrayList childPCs = cRS.getConf().getPartitionCols(); - pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS)); + + if (result[1] < 0) { + // The partitioning columns of the parent RS are more specific than + // those of the child RS. + List childPCs = cRS.getConf().getPartitionCols(); + if (childPCs != null && !childPCs.isEmpty()) { + // If partitioning columns of the child RS are assigned, + // assign these to the partitioning columns of the parent RS. + pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS)); + } + } else if (result[1] > 0) { + // The partitioning columns of the child RS are more specific than + // those of the parent RS. + List parentPCs = pRS.getConf().getPartitionCols(); + if (parentPCs == null || parentPCs.isEmpty()) { + // If partitioning columns of the parent RS are not assigned, + // assign partitioning columns of the child RS to the parent RS. + ArrayList childPCs = cRS.getConf().getPartitionCols(); + pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS)); + } } + if (result[2] > 0) { + // The sorting order of the child RS is more specific than + // that of the parent RS. Assign the sorting order of the child RS + // to the parent RS. + if (result[0] <= 0) { + // Sorting columns of the parent RS are more specific than those of the + // child RS but Sorting order of the child RS is more specific than + // that of the parent RS. + throw new SemanticException("Sorting columns and order don't match. " + + "Try set " + HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION + "=false;"); + } pRS.getConf().setOrder(cRS.getConf().getOrder()); } + if (result[3] > 0) { + // The number of reducers of the child RS is more specific than + // that of the parent RS. Assign the number of reducers of the child RS + // to the parent RS. pRS.getConf().setNumReducers(cRS.getConf().getNumReducers()); } + return true; } diff --git ql/src/test/results/clientpositive/groupby2_map_skew.q.out ql/src/test/results/clientpositive/groupby2_map_skew.q.out index da7a128..a495aa6 100644 --- ql/src/test/results/clientpositive/groupby2_map_skew.q.out +++ ql/src/test/results/clientpositive/groupby2_map_skew.q.out @@ -55,8 +55,6 @@ STAGE PLANS: Map-reduce partition columns: expr: _col0 type: string - expr: _col1 - type: string tag: -1 value expressions: expr: _col2 diff --git ql/src/test/results/clientpositive/groupby_cube1.q.out ql/src/test/results/clientpositive/groupby_cube1.q.out index a52f4eb..9bb7509 100644 --- ql/src/test/results/clientpositive/groupby_cube1.q.out +++ ql/src/test/results/clientpositive/groupby_cube1.q.out @@ -452,10 +452,6 @@ STAGE PLANS: Map-reduce partition columns: expr: _col0 type: string - expr: _col1 - type: string - expr: _col2 - type: string tag: -1 value expressions: expr: _col3 diff --git ql/src/test/results/clientpositive/groupby_rollup1.q.out ql/src/test/results/clientpositive/groupby_rollup1.q.out index f120471..bc8396c 100644 --- ql/src/test/results/clientpositive/groupby_rollup1.q.out +++ ql/src/test/results/clientpositive/groupby_rollup1.q.out @@ -440,10 +440,6 @@ STAGE PLANS: Map-reduce partition columns: expr: _col0 type: string - expr: _col1 - type: string - expr: _col2 - type: string tag: -1 value expressions: expr: _col3 diff --git ql/src/test/results/clientpositive/reduce_deduplicate_extended.q.out ql/src/test/results/clientpositive/reduce_deduplicate_extended.q.out index 3297ebb..adf47ef 100644 --- ql/src/test/results/clientpositive/reduce_deduplicate_extended.q.out +++ ql/src/test/results/clientpositive/reduce_deduplicate_extended.q.out @@ -608,8 +608,6 @@ STAGE PLANS: Map-reduce partition columns: expr: _col0 type: string - expr: _col1 - type: string tag: -1 Reduce Operator Tree: Group By Operator @@ -2874,8 +2872,6 @@ STAGE PLANS: Map-reduce partition columns: expr: key type: string - expr: value - type: string tag: -1 Reduce Operator Tree: Group By Operator