diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java index 010c89e..4adf7b2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java @@ -246,7 +246,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // Create SelectDesc SelectDesc selConf = new SelectDesc(descs, colNames); - // Create Select Operator SelectOperator selOp = (SelectOperator) OperatorFactory.getAndMakeChild( selConf, selRS, rsOp); @@ -420,7 +419,6 @@ public ReduceSinkOperator getReduceSinkOp(List partitionPositions, // 1) Partition columns // 2) Bucket number column // 3) Sort columns - // 4) Null sort columns Set keyColsPosInVal = Sets.newLinkedHashSet(); ArrayList keyCols = Lists.newArrayList(); List newSortOrder = Lists.newArrayList(); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java index 733620b..77771c3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java @@ -25,6 +25,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Stack; import org.apache.hadoop.hive.conf.HiveConf; @@ -50,11 +51,12 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; - -import com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * If two reducer sink operators share the same partition/sort columns and order, @@ -65,6 +67,8 @@ */ public class ReduceSinkDeDuplication extends Transform { + protected static final Logger LOG = LoggerFactory.getLogger(ReduceSinkDeDuplication.class); + private static final String RS = ReduceSinkOperator.getOperatorName(); private static final String GBY = GroupByOperator.getOperatorName(); private static final String JOIN = JoinOperator.getOperatorName(); @@ -253,7 +257,7 @@ protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReduc */ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { - int[] result = checkStatus(cRS, pRS, minReducer); + int[] result = extractMergeDirections(cRS, pRS, minReducer); if (result == null) { return false; } @@ -334,7 +338,7 @@ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minR * 2. for -1, configuration of parent RS is more specific than child RS * 3. 
for 1, configuration of child RS is more specific than parent RS */ - private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) + private int[] extractMergeDirections(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { ReduceSinkDesc cConf = cRS.getConf(); ReduceSinkDesc pConf = pRS.getConf(); @@ -494,6 +498,112 @@ protected Integer checkNumReducer(int creduce, int preduce) { } return 0; } + + protected boolean aggressiveDedup(ReduceSinkOperator cRS, ReduceSinkOperator pRS, + ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException { + assert cRS.getNumParent() == 1; + + ReduceSinkDesc cConf = cRS.getConf(); + ReduceSinkDesc pConf = pRS.getConf(); + List cKeys = cConf.getKeyCols(); + List pKeys = pConf.getKeyCols(); + + // Check that in the path between cRS and pRS, there are only Select operators + // i.e. the sequence must be pRS-SEL*-cRS + Operator parent = cRS.getParentOperators().get(0); + while (parent != pRS) { + assert parent.getNumParent() == 1; + if (!(parent instanceof SelectOperator)) { + return false; + } + parent = parent.getParentOperators().get(0); + } + + // If child keys are null or empty, we bail out + if (cKeys == null || cKeys.isEmpty()) { + return false; + } + // If parent keys are null or empty, we bail out + if (pKeys == null || pKeys.isEmpty()) { + return false; + } + + // Backtrack key columns of cRS to pRS + // If we cannot backtrack any of the columns, bail out + List cKeysInParentRS = ExprNodeDescUtils.backtrack(cKeys, cRS, pRS); + for (int i = 0; i < cKeysInParentRS.size(); i++) { + ExprNodeDesc pexpr = cKeysInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(cKeysInParentRS, cRS, pRS)); + + // Backtrack partition columns of cRS to pRS + // If we cannot backtrack any of the columns, bail out + List cPartitionInParentRS = ExprNodeDescUtils.backtrack( + cConf.getPartitionCols(), cRS, pRS); + for (int i = 0; i < cPartitionInParentRS.size(); i++) { + ExprNodeDesc pexpr = cPartitionInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(cPartitionInParentRS, cRS, pRS)); + + // Backtrack value columns of cRS to pRS + // If we cannot backtrack any of the columns, bail out + List cValueInParentRS = ExprNodeDescUtils.backtrack( + cConf.getValueCols(), cRS, pRS); + for (int i = 0; i < cValueInParentRS.size(); i++) { + ExprNodeDesc pexpr = cValueInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setValueCols(ExprNodeDescUtils.backtrack(cValueInParentRS, cRS, pRS)); + + // Backtrack bucket columns of cRS to pRS (if any) + // If we cannot backtrack any of the columns, bail out + if (cConf.getBucketCols() != null) { + List cBucketInParentRS = ExprNodeDescUtils.backtrack( + cConf.getBucketCols(), cRS, pRS); + for (int i = 0; i < cBucketInParentRS.size(); i++) { + ExprNodeDesc pexpr = cBucketInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setBucketCols(ExprNodeDescUtils.backtrack(cBucketInParentRS, cRS, pRS)); + } + + // Update column expression map + for (Entry e : cRS.getColumnExprMap().entrySet()) { + e.setValue(ExprNodeDescUtils.backtrack(e.getValue(), cRS, pRS)); + } + + // Replace pRS 
with cRS and remove operator sequence from pRS to cRS + // Recall that the sequence must be pRS-SEL*-cRS + parent = cRS.getParentOperators().get(0); + while (parent != pRS) { + dedupCtx.addRemovedOperator(parent); + parent = parent.getParentOperators().get(0); + } + dedupCtx.addRemovedOperator(pRS); + cRS.getParentOperators().clear(); + for (Operator op : pRS.getParentOperators()) { + op.replaceChild(pRS, cRS); + cRS.getParentOperators().add(op); + } + pRS.getParentOperators().clear(); + pRS.getChildOperators().clear(); + + return true; + } } static class GroupbyReducerProc extends AbsctractReducerReducerProc { @@ -601,11 +711,18 @@ public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedup ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent( cRS, ReduceSinkOperator.class, dedupCtx.trustScript()); - if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) { - CorrelationUtilities.replaceReduceSinkWithSelectOperator( - cRS, dedupCtx.getPctx(), dedupCtx); - pRS.getConf().setDeduplicated(true); - return true; + if (pRS != null) { + // Try extended deduplication + if (aggressiveDedup(cRS, pRS, dedupCtx)) { + return true; + } + // Normal deduplication + if (merge(cRS, pRS, dedupCtx.minReducer())) { + CorrelationUtilities.replaceReduceSinkWithSelectOperator( + cRS, dedupCtx.getPctx(), dedupCtx); + pRS.getConf().setDeduplicated(true); + return true; + } } return false; } diff --git ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out index d03bfe4..ab8f96c 100644 --- ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out +++ ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out @@ -159,9 +159,8 @@ explain insert overwrite table over1k_part_orc partition(ds="foo", t) select si, POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -178,35 +177,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key 
expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -232,7 +210,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.over1k_part_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part_limit_orc partition(ds="foo", t) select si,i,b,f,t from over1k_orc where t is null or t=27 limit 10 @@ -517,9 +495,8 @@ explain insert into table over1k_part_orc partition(ds="foo", t) select si,i,b,f POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -536,35 +513,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -590,7 +546,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.over1k_part_orc - Stage: Stage-3 + 
Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert into table over1k_part_limit_orc partition(ds="foo", t) select si,i,b,f,t from over1k_orc where t is null or t=27 limit 10 @@ -1336,9 +1292,8 @@ POSTHOOK: query: explain insert overwrite table over1k_part2_orc partition(ds="f POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1355,35 +1310,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -1409,7 +1343,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.over1k_part2_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part2_orc partition(ds="foo",t) select si,i,b,f,t from (select * from over1k_orc order by i limit 10) tmp where t is null or t=27 diff --git ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out index dec872a..391acff 100644 --- ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out +++ ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out @@ -1240,9 +1240,8 @@ POSTHOOK: query: explain insert overwrite table over1k_part2 partition(ds="foo", POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - 
Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1259,32 +1258,11 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) - Reduce Operator Tree: - Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Reduce Operator Tree: Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) @@ -1312,7 +1290,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.over1k_part2 - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part2 partition(ds="foo",t) select si,i,b,f,t from (select * from over1k order by i limit 10) tmp where t is null or t=27 diff --git ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out index 8325803..ac95ec2 100644 --- ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out +++ ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out @@ -380,9 +380,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -396,28 +395,9 @@ STAGE PLANS: expressions: ROW__ID (type: struct), ds (type: string) outputColumnNames: _col0, _col3 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col3 (type: string) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col2 (type: string) - outputColumnNames: _col0, _col3 - File Output Operator - compressed: false - table: - input 
format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), '_bucket_number' (type: string), _col0 (type: struct) - sort order: +++ - Map-reduce partition columns: _col3 (type: string) + key expressions: _col3 (type: string), '_bucket_number' (type: string), _col0 (type: struct) + sort order: +++ + Map-reduce partition columns: _col3 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), 'foo' (type: string), 'bar' (type: string), KEY._col3 (type: string), KEY.'_bucket_number' (type: string) @@ -442,7 +422,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds in ('2008-04-08') @@ -894,9 +874,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -910,28 +889,9 @@ STAGE PLANS: expressions: ROW__ID (type: struct), hr (type: int) outputColumnNames: _col0, _col4 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col3 (type: int) - outputColumnNames: _col0, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: '2008-04-08' (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) - sort order: ++++ - Map-reduce partition columns: '2008-04-08' (type: string), _col4 (type: int) + key expressions: '2008-04-08' (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) + sort order: ++++ + Map-reduce partition columns: '2008-04-08' (type: string), _col4 (type: int) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), 'foo' (type: string), 'bar' (type: string), '2008-04-08' (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -957,7 +917,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr>=11 @@ -1091,9 +1051,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1107,29 +1066,10 @@ STAGE PLANS: expressions: ROW__ID (type: struct), key (type: string), 'bar' (type: 
string), ds (type: string), hr (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: int) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) - sort order: ++++ - Map-reduce partition columns: _col3 (type: string), _col4 (type: int) - value expressions: _col1 (type: string), _col2 (type: string) + key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) + sort order: ++++ + Map-reduce partition columns: _col3 (type: string), _col4 (type: int) + value expressions: _col1 (type: string), _col2 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), VALUE._col1 (type: string), VALUE._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -1155,7 +1095,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr=11 @@ -1185,9 +1125,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1201,29 +1140,10 @@ STAGE PLANS: expressions: ROW__ID (type: struct), key (type: string), 'bar' (type: string), ds (type: string), hr (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: int) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) - sort order: ++++ - Map-reduce partition columns: _col3 (type: string), _col4 (type: int) - value expressions: _col1 (type: string), _col2 (type: string) + key expressions: 
_col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) + sort order: ++++ + Map-reduce partition columns: _col3 (type: string), _col4 (type: int) + value expressions: _col1 (type: string), _col2 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), VALUE._col1 (type: string), VALUE._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -1249,7 +1169,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr>=11 diff --git ql/src/test/results/clientpositive/reducesink_dedup.q.out ql/src/test/results/clientpositive/reducesink_dedup.q.out index b89df52..77bffff 100644 --- ql/src/test/results/clientpositive/reducesink_dedup.q.out +++ ql/src/test/results/clientpositive/reducesink_dedup.q.out @@ -10,29 +10,29 @@ distribute by 1 sort by 1 POSTHOOK: type: QUERY POSTHOOK: Input: default@part #### A masked pattern was here #### +almond azure blanched chiffon midnight +almond aquamarine dodger light gainsboro +almond antique sky peru orange +almond antique medium spring khaki +almond antique blue firebrick mint +almond azure aquamarine papaya violet +almond aquamarine yellow dodger mint +almond aquamarine floral ivory bisque +almond antique violet mint lemon +almond antique gainsboro frosted violet +almond antique olive coral navajo +almond antique misty red olive +almond antique metallic orange dim +almond antique forest lavender goldenrod +almond antique chartreuse khaki white +almond aquamarine sandy cyan gainsboro +almond aquamarine rose maroon antique +almond aquamarine midnight light salmon +almond antique violet turquoise frosted +almond antique violet chocolate turquoise +almond aquamarine pink moccasin thistle +almond aquamarine burnished black steel +almond antique salmon chartreuse burlywood +almond antique chartreuse lavender yellow almond antique burnished rose metallic almond antique burnished rose metallic -almond antique chartreuse lavender yellow -almond antique salmon chartreuse burlywood -almond aquamarine burnished black steel -almond aquamarine pink moccasin thistle -almond antique violet chocolate turquoise -almond antique violet turquoise frosted -almond aquamarine midnight light salmon -almond aquamarine rose maroon antique -almond aquamarine sandy cyan gainsboro -almond antique chartreuse khaki white -almond antique forest lavender goldenrod -almond antique metallic orange dim -almond antique misty red olive -almond antique olive coral navajo -almond antique gainsboro frosted violet -almond antique violet mint lemon -almond aquamarine floral ivory bisque -almond aquamarine yellow dodger mint -almond azure aquamarine papaya violet -almond antique blue firebrick mint -almond antique medium spring khaki -almond antique sky peru orange -almond aquamarine dodger light gainsboro -almond azure blanched chiffon midnight diff --git ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out index a90e3f6..9a72586 100644 --- ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out +++ ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out @@ -169,7 +169,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 
<- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -185,28 +184,16 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reducer 3 - Execution mode: vectorized - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -557,7 +544,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -573,28 +559,16 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - 
Reducer 3 - Execution mode: vectorized - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -1418,7 +1392,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1434,28 +1407,16 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reducer 3 - Execution mode: vectorized - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE diff --git ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out index 723e819..2f88148 100644 --- ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out +++ ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out @@ -1329,7 +1329,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1345,25 +1344,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: 
int), _col2 (type: bigint), _col3 (type: float) Reducer 2 Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reducer 3 - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE
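
Note (reviewer sketch, not part of the patch): the core of this change is the new aggressiveDedup(...) path in ReduceSinkDeDuplication. When a child ReduceSink (cRS) reaches its parent ReduceSink (pRS) through nothing but Select operators (pRS-SEL*-cRS), the child's key, partition, value and bucket columns are backtracked to pRS's input via ExprNodeDescUtils.backtrack (bailing out if any expression cannot be traced), the column expression map is rewritten, and cRS is then spliced directly onto pRS's parents while pRS and the intermediate Selects are handed to the dedup context for removal. The sketch below illustrates only the path check and the splice, using hypothetical stand-in classes (Op, ReduceSink, Select, collapse) rather than Hive's operator API; it is a shape-of-the-transformation illustration under those assumptions, not the implementation.

import java.util.ArrayList;
import java.util.List;

// Minimal stand-ins for the operator tree; these are NOT Hive's classes,
// just enough structure to show the pRS-SEL*-cRS collapse.
class Op {
    final String name;
    final List<Op> parents = new ArrayList<>();
    final List<Op> children = new ArrayList<>();
    Op(String name) { this.name = name; }
    static void link(Op parent, Op child) {
        parent.children.add(child);
        child.parents.add(parent);
    }
}
class ReduceSink extends Op { ReduceSink(String n) { super(n); } }
class Select extends Op { Select(String n) { super(n); } }

public class AggressiveDedupSketch {

    // Collapse pRS-SEL*-cRS into a single ReduceSink: verify that only
    // Select operators sit between the two sinks, then detach cRS and
    // reattach it directly to pRS's parents. Column backtracking is
    // omitted here; in the patch it happens before this splice and the
    // whole rewrite bails out if any expression cannot be traced to pRS.
    static boolean collapse(ReduceSink cRS, ReduceSink pRS, List<Op> removed) {
        // 1) Path check: anything other than a Select between the two
        //    sinks means the chain is not pRS-SEL*-cRS, so bail out.
        Op cursor = cRS.parents.get(0);
        while (cursor != pRS) {
            if (!(cursor instanceof Select)) {
                return false;
            }
            cursor = cursor.parents.get(0);
        }
        // 2) Record the operators that become dead: the intermediate
        //    Selects and the parent ReduceSink itself.
        cursor = cRS.parents.get(0);
        while (cursor != pRS) {
            removed.add(cursor);
            cursor = cursor.parents.get(0);
        }
        removed.add(pRS);
        // 3) Splice: cRS takes over pRS's position in the tree.
        cRS.parents.clear();
        for (Op grandParent : pRS.parents) {
            grandParent.children.remove(pRS);
            grandParent.children.add(cRS);
            cRS.parents.add(grandParent);
        }
        pRS.parents.clear();
        pRS.children.clear();
        return true;
    }

    public static void main(String[] args) {
        Op scan = new Op("TS");
        ReduceSink pRS = new ReduceSink("RS-1");
        Select sel = new Select("SEL");
        ReduceSink cRS = new ReduceSink("RS-2");
        Op.link(scan, pRS);
        Op.link(pRS, sel);
        Op.link(sel, cRS);

        List<Op> removed = new ArrayList<>();
        boolean merged = collapse(cRS, pRS, removed);
        // Expected: merged == true, TS now feeds RS-2 directly, and
        // removed holds [SEL, RS-1].
        System.out.println("merged=" + merged
            + " parentOfCRS=" + cRS.parents.get(0).name
            + " removed=" + removed.size());
    }
}

The .q.out updates above show the observable effect of the real transformation: the second shuffle stage (Stage-2 map-reduce, or Reducer 3 in the Tez plans) disappears, the surviving ReduceSink carries the merged key expressions (dynamic-partition column first, then the original sort column), and the reordered rows in reducesink_dedup.q.out are consistent with only one shuffle remaining in the plan.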