diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java index 010c89e..071b42f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java @@ -221,6 +221,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // Create ReduceSink operator ReduceSinkOperator rsOp = getReduceSinkOp(partitionPositions, sortPositions, sortOrder, sortNullOrder, allRSCols, bucketColumns, numBuckets, fsParent, fsOp.getConf().getWriteType()); + rsOp.getConf().setAdditionalDynPartSort(true); List descs = new ArrayList(allRSCols.size()); List colNames = new ArrayList(); @@ -420,7 +421,6 @@ public ReduceSinkOperator getReduceSinkOp(List partitionPositions, // 1) Partition columns // 2) Bucket number column // 3) Sort columns - // 4) Null sort columns Set keyColsPosInVal = Sets.newLinkedHashSet(); ArrayList keyCols = Lists.newArrayList(); List newSortOrder = Lists.newArrayList(); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java index 733620b..54f1509 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java @@ -25,6 +25,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Stack; import org.apache.hadoop.hive.conf.HiveConf; @@ -50,11 +51,12 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; - -import com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * If two reducer sink operators share the same partition/sort columns and order, @@ -65,6 +67,8 @@ */ public class ReduceSinkDeDuplication extends Transform { + protected static final Logger LOG = LoggerFactory.getLogger(ReduceSinkDeDuplication.class); + private static final String RS = ReduceSinkOperator.getOperatorName(); private static final String GBY = GroupByOperator.getOperatorName(); private static final String JOIN = JoinOperator.getOperatorName(); @@ -253,7 +257,7 @@ protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReduc */ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { - int[] result = checkStatus(cRS, pRS, minReducer); + int[] result = extractMergeDirections(cRS, pRS, minReducer); if (result == null) { return false; } @@ -334,7 +338,7 @@ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minR * 2. for -1, configuration of parent RS is more specific than child RS * 3. for 1, configuration of child RS is more specific than parent RS */ - private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) + private int[] extractMergeDirections(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { ReduceSinkDesc cConf = cRS.getConf(); ReduceSinkDesc pConf = pRS.getConf(); @@ -494,6 +498,133 @@ protected Integer checkNumReducer(int creduce, int preduce) { } return 0; } + + protected boolean checkValidDynPartSortDedup(ReduceSinkOperator cRS, ReduceSinkOperator pRS) { + assert cRS.getNumParent() == 1; + Operator parent = cRS.getParentOperators().get(0); + while (parent != pRS) { + assert parent.getNumParent() == 1; + if (!(parent instanceof SelectOperator)) { + return false; + } + parent = parent.getParentOperators().get(0); + } + return true; + } + + protected boolean processDynPartSortDedup(ReduceSinkOperator cRS, ReduceSinkOperator pRS, + ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException { + boolean possible = checkDynPartSortDedupPossible(cRS, pRS); + if (!possible) { + return false; + } + + // Backtrack key columns of cRS to pRS + List childKCs = cRS.getConf().getKeyCols(); + cRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS)); + // Backtrack partition columns of cRS to pRS + ArrayList childPCs = cRS.getConf().getPartitionCols(); + cRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS)); + // Backtrack value columns of cRS to pRS + ArrayList childVCs = cRS.getConf().getValueCols(); + cRS.getConf().setValueCols(ExprNodeDescUtils.backtrack(childVCs, cRS, pRS)); + // Update column expression map + for (Entry e : cRS.getColumnExprMap().entrySet()) { + e.setValue(ExprNodeDescUtils.backtrack(e.getValue(), cRS, pRS)); + } + + // Replace pRS with cRS and remove operator sequence from pRS to cRS + Operator parent = cRS.getParentOperators().get(0); + while (parent != pRS) { + dedupCtx.addRemovedOperator(parent); + parent = parent.getParentOperators().get(0); + } + dedupCtx.addRemovedOperator(pRS); + cRS.getParentOperators().clear(); + for (Operator op : pRS.getParentOperators()) { + op.replaceChild(pRS, cRS); + cRS.getParentOperators().add(op); + } + pRS.getParentOperators().clear(); + pRS.getChildOperators().clear(); + + return true; + } + + // We can merge if the list of expressions of the parent is a sublist in the list of + // expressions of the child. Constant elements are ignored for validating this condition. + // Further, the sorting order for those expressions that are in the keys should match. + private boolean checkDynPartSortDedupPossible(ReduceSinkOperator cRS, ReduceSinkOperator pRS) + throws SemanticException { + ReduceSinkDesc cConf = cRS.getConf(); + ReduceSinkDesc pConf = pRS.getConf(); + List cKeys = cConf.getKeyCols(); + List pKeys = pConf.getKeyCols(); + + // If child keys are null or empty, we bail out + if (cKeys == null || cKeys.isEmpty()) { + return false; + } + // If parent keys are null or empty, we bail out + if (pKeys == null || pKeys.isEmpty()) { + return false; + } + + List cKeysInParentRS = ExprNodeDescUtils.backtrack(cKeys, cRS, pRS); + int posChildRS = -1; + for (int i = 0; i < pKeys.size(); i++) { + ExprNodeDesc pexpr = pKeys.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + if (pexpr instanceof ExprNodeConstantDesc) { + // It is a constant, we can continue with the next expression + continue; + } + if (posChildRS == -1) { + // We need to find first element + for (int j = 0; j < cKeysInParentRS.size(); j++) { + if (cKeysInParentRS.get(j) == pexpr) { + posChildRS = j; + if (cConf.getOrder().charAt(j) != pConf.getOrder().charAt(i) || + cConf.getNullOrder().charAt(j) != pConf.getNullOrder().charAt(i)) { + // Order is different, we bail out + return false; + } + break; + } + } + if (posChildRS == -1) { + // We could not find the expression: different keys, + // thus we bail out + return false; + } + } else { + for (int j = posChildRS + 1; j < cKeysInParentRS.size(); j++) { + if (cKeysInParentRS.get(j) instanceof ExprNodeConstantDesc) { + continue; + } + if (cKeysInParentRS.get(j) == pexpr) { + posChildRS = j; + if (cConf.getOrder().charAt(j) != pConf.getOrder().charAt(i) || + cConf.getNullOrder().charAt(j) != pConf.getNullOrder().charAt(i)) { + // Order is different, we bail out + return false; + } + break; + } else { + // The key expression in pRS is not a sublist of the key + // expression in cRS, thus we bail out + return false; + } + } + } + } + + return true; + } + } static class GroupbyReducerProc extends AbsctractReducerReducerProc { @@ -601,11 +732,22 @@ public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedup ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent( cRS, ReduceSinkOperator.class, dedupCtx.trustScript()); - if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) { - CorrelationUtilities.replaceReduceSinkWithSelectOperator( - cRS, dedupCtx.getPctx(), dedupCtx); - pRS.getConf().setDeduplicated(true); - return true; + if (pRS != null) { + // If it is a RS introduced by sorted dynamic partition optimizer, + // we can try extended deduplication + if (cRS.getConf().isAdditionalDynPartSort() && + checkValidDynPartSortDedup(cRS, pRS)) { + if (processDynPartSortDedup(cRS, pRS, dedupCtx)) { + return true; + } + } + // Normal deduplication + if (merge(cRS, pRS, dedupCtx.minReducer())) { + CorrelationUtilities.replaceReduceSinkWithSelectOperator( + cRS, dedupCtx.getPctx(), dedupCtx); + pRS.getConf().setDeduplicated(true); + return true; + } } return false; } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java index d7e404c..c3d426c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java @@ -116,6 +116,9 @@ private ReducerTraits(int trait) { // whether this RS is deduplicated private transient boolean isDeduplicated = false; + // whether this RS was introduced by Sorted Dynamic Partition optimizer + private transient boolean additionalDynPartSort = false; + // used by spark mode to decide whether global order is needed private transient boolean hasOrderBy = false; @@ -179,6 +182,7 @@ public Object clone() { desc.setSkipTag(skipTag); desc.reduceTraits = reduceTraits.clone(); desc.setDeduplicated(isDeduplicated); + desc.setAdditionalDynPartSort(additionalDynPartSort); desc.setHasOrderBy(hasOrderBy); if (vectorDesc != null) { throw new RuntimeException("Clone with vectorization desc not supported"); @@ -482,6 +486,14 @@ public void setDeduplicated(boolean isDeduplicated) { this.isDeduplicated = isDeduplicated; } + public boolean isAdditionalDynPartSort() { + return additionalDynPartSort; + } + + public void setAdditionalDynPartSort(boolean additionalDynPartSort) { + this.additionalDynPartSort = additionalDynPartSort; + } + public boolean hasOrderBy() { return hasOrderBy; } diff --git ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out index d03bfe4..ab8f96c 100644 --- ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out +++ ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out @@ -159,9 +159,8 @@ explain insert overwrite table over1k_part_orc partition(ds="foo", t) select si, POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -178,35 +177,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -232,7 +210,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.over1k_part_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part_limit_orc partition(ds="foo", t) select si,i,b,f,t from over1k_orc where t is null or t=27 limit 10 @@ -517,9 +495,8 @@ explain insert into table over1k_part_orc partition(ds="foo", t) select si,i,b,f POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -536,35 +513,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -590,7 +546,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.over1k_part_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert into table over1k_part_limit_orc partition(ds="foo", t) select si,i,b,f,t from over1k_orc where t is null or t=27 limit 10 @@ -1336,9 +1292,8 @@ POSTHOOK: query: explain insert overwrite table over1k_part2_orc partition(ds="f POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1355,35 +1310,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -1409,7 +1343,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.over1k_part2_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part2_orc partition(ds="foo",t) select si,i,b,f,t from (select * from over1k_orc order by i limit 10) tmp where t is null or t=27 diff --git ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out index dec872a..391acff 100644 --- ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out +++ ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out @@ -1240,9 +1240,8 @@ POSTHOOK: query: explain insert overwrite table over1k_part2 partition(ds="foo", POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1259,32 +1258,11 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) - Reduce Operator Tree: - Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Reduce Operator Tree: Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) @@ -1312,7 +1290,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.over1k_part2 - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part2 partition(ds="foo",t) select si,i,b,f,t from (select * from over1k order by i limit 10) tmp where t is null or t=27 diff --git ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out index 8325803..ac95ec2 100644 --- ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out +++ ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out @@ -380,9 +380,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -396,28 +395,9 @@ STAGE PLANS: expressions: ROW__ID (type: struct), ds (type: string) outputColumnNames: _col0, _col3 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col3 (type: string) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col2 (type: string) - outputColumnNames: _col0, _col3 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), '_bucket_number' (type: string), _col0 (type: struct) - sort order: +++ - Map-reduce partition columns: _col3 (type: string) + key expressions: _col3 (type: string), '_bucket_number' (type: string), _col0 (type: struct) + sort order: +++ + Map-reduce partition columns: _col3 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), 'foo' (type: string), 'bar' (type: string), KEY._col3 (type: string), KEY.'_bucket_number' (type: string) @@ -442,7 +422,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds in ('2008-04-08') @@ -894,9 +874,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -910,28 +889,9 @@ STAGE PLANS: expressions: ROW__ID (type: struct), hr (type: int) outputColumnNames: _col0, _col4 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col3 (type: int) - outputColumnNames: _col0, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: '2008-04-08' (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) - sort order: ++++ - Map-reduce partition columns: '2008-04-08' (type: string), _col4 (type: int) + key expressions: '2008-04-08' (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) + sort order: ++++ + Map-reduce partition columns: '2008-04-08' (type: string), _col4 (type: int) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), 'foo' (type: string), 'bar' (type: string), '2008-04-08' (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -957,7 +917,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr>=11 @@ -1091,9 +1051,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1107,29 +1066,10 @@ STAGE PLANS: expressions: ROW__ID (type: struct), key (type: string), 'bar' (type: string), ds (type: string), hr (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: int) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) - sort order: ++++ - Map-reduce partition columns: _col3 (type: string), _col4 (type: int) - value expressions: _col1 (type: string), _col2 (type: string) + key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) + sort order: ++++ + Map-reduce partition columns: _col3 (type: string), _col4 (type: int) + value expressions: _col1 (type: string), _col2 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), VALUE._col1 (type: string), VALUE._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -1155,7 +1095,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr=11 @@ -1185,9 +1125,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1201,29 +1140,10 @@ STAGE PLANS: expressions: ROW__ID (type: struct), key (type: string), 'bar' (type: string), ds (type: string), hr (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Reduce Output Operator - key expressions: _col0 (type: struct) - sort order: + - value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: int) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) - sort order: ++++ - Map-reduce partition columns: _col3 (type: string), _col4 (type: int) - value expressions: _col1 (type: string), _col2 (type: string) + key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct) + sort order: ++++ + Map-reduce partition columns: _col3 (type: string), _col4 (type: int) + value expressions: _col1 (type: string), _col2 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct), VALUE._col1 (type: string), VALUE._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -1249,7 +1169,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr>=11