diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
index b25bcf01a3..5e00a21566 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
@@ -173,24 +173,14 @@ protected abstract Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateP
     protected abstract Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
         ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException;
+
+    protected boolean isEmptyKeyAggregate(final GroupByOperator gOp) {
+      return gOp.getConf().getKeys().isEmpty();
+    }
   }
 
   static class GroupbyReducerProc extends AbsctractReducerReducerProc {
-    // given a group by operator this determines if that group by belongs to semi-join branch
-    // note that this works only for second last group by in semi-join branch (X-GB-RS-GB-RS)
-    private boolean isSemiJoinBranch(final GroupByOperator gOp, ReduceSinkDeduplicateProcCtx dedupCtx) {
-      for(int i=0; i<gOp.getChildOperators().size(); i++) {
-        if(gOp.getChildOperators().get(i) instanceof ReduceSinkOperator) {
-          ReduceSinkOperator rs = (ReduceSinkOperator) gOp.getChildOperators().get(i);
-          if(dedupCtx.getPctx().getRsToSemiJoinBranchInfo().get(rs) != null) {
-            return true;
-          }
-        }
-      }
-      return false;
-    }
-
     // GBY-RS
     public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
         throws SemanticException {
@@ -205,6 +195,9 @@ public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedup
       Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
       GroupByOperator pGBY = CorrelationUtilities.findPossibleParent(
           start, GroupByOperator.class, dedupCtx.trustScript());
+      if (pGBY != null && isEmptyKeyAggregate(pGBY)) {
+        return false;
+      }
       if (pGBY != null && isSemiJoinBranch(pGBY, dedupCtx)) {
         return false;
       }
@@ -241,6 +234,20 @@ public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
       }
       return false;
     }
+
+    // given a group by operator this determines if that group by belongs to semi-join branch
+    // note that this works only for second last group by in semi-join branch (X-GB-RS-GB-RS)
+    private boolean isSemiJoinBranch(final GroupByOperator gOp, ReduceSinkDeduplicateProcCtx dedupCtx) {
+      for(int i=0; i<gOp.getChildOperators().size(); i++) {
+        if(gOp.getChildOperators().get(i) instanceof ReduceSinkOperator) {
+          ReduceSinkOperator rs = (ReduceSinkOperator) gOp.getChildOperators().get(i);
+          if(dedupCtx.getPctx().getRsToSemiJoinBranchInfo().get(rs) != null) {
+            return true;
+          }
+        }
+      }
+      return false;
+    }
   }
 
   static class JoinReducerProc extends AbsctractReducerReducerProc {
@@ -268,10 +275,7 @@ public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
       Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
       ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
           start, ReduceSinkOperator.class, dedupCtx.trustScript());
       if (pRS != null && ReduceSinkDeDuplicationUtils.merge(cRS, pRS, dedupCtx.minReducer())) {
-        if (dedupCtx.getPctx().getConf().getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
-          return false;
-        }
         CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
         pRS.getConf().setDeduplicated(true);
         return true;
diff --git a/ql/src/test/queries/clientpositive/clusterctas.q b/ql/src/test/queries/clientpositive/clusterctas.q
new file mode 100644
index 0000000000..d4e45e0194
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/clusterctas.q
@@ -0,0 +1,12 @@
+--! qt:dataset:src
+
+set hive.cbo.enable=false;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+
+EXPLAIN
+CREATE TABLE x STORED AS ORC TBLPROPERTIES('transactional'='true') AS
+SELECT * FROM SRC x CLUSTER BY x.key;
+CREATE TABLE x STORED AS ORC TBLPROPERTIES('transactional'='true') AS
+SELECT * FROM SRC x CLUSTER BY x.key;
+DROP TABLE x;
diff --git a/ql/src/test/results/clientpositive/llap/clusterctas.q.out b/ql/src/test/results/clientpositive/llap/clusterctas.q.out
new file mode 100644
index 0000000000..40ceee215f
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/clusterctas.q.out
@@ -0,0 +1,145 @@
+PREHOOK: query: EXPLAIN
+CREATE TABLE x STORED AS ORC TBLPROPERTIES('transactional'='true') AS
+SELECT * FROM SRC x CLUSTER BY x.key
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@x
+POSTHOOK: query: EXPLAIN
+CREATE TABLE x STORED AS ORC TBLPROPERTIES('transactional'='true') AS
+SELECT * FROM SRC x CLUSTER BY x.key
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@x
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-4 depends on stages: Stage-0, Stage-2
+  Stage-3 depends on stages: Stage-4
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: x
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: key (type: string), value (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      null sort order: a
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: _col1 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.x
+                  Write Type: INSERT
+                Select Operator
+                  expressions: _col0 (type: string), _col1 (type: string)
+                  outputColumnNames: col1, col2
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    aggregations: compute_stats(col1, 'hll'), compute_stats(col2, 'hll')
+                    minReductionHashAggr: 0.99
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      null sort order: 
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: _col0 (type: struct<columntype:string,maxlength:bigint,avglength:double,countnulls:bigint,numdistinctvalues:bigint,ndvbitvector:binary>), _col1 (type: struct<columntype:string,maxlength:bigint,avglength:double,countnulls:bigint,numdistinctvalues:bigint,ndvbitvector:binary>)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-4
+      Create Table
+        columns: key string, value string
+        name: default.x
+        input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+        output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+        serde name: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+        table properties:
+          transactional true
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: key, value
+          Column Types: string, string
+          Table: default.x
+
+  Stage: Stage-0
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+          Write Type: INSERT
+
+PREHOOK: query: CREATE TABLE x STORED AS ORC TBLPROPERTIES('transactional'='true') AS
+SELECT * FROM SRC x CLUSTER BY x.key
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@x
+POSTHOOK: query: CREATE TABLE x STORED AS ORC TBLPROPERTIES('transactional'='true') AS
+SELECT * FROM SRC x CLUSTER BY x.key
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@x
+POSTHOOK: Lineage: x.key SIMPLE [(src)x.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: x.value SIMPLE [(src)x.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: DROP TABLE x
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@x
+PREHOOK: Output: default@x
+POSTHOOK: query: DROP TABLE x
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@x
+POSTHOOK: Output: default@x
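
Reviewer note, not part of the patch: the fix hinges on the fact that the compute_stats Group By Operator in the plan above has no grouping keys, so it emits a single global row; if ReduceSinkDeDuplication merged its Reduce Output Operator into the CLUSTER BY Reduce Output Operator, the requested distribution would be lost. Below is a minimal, self-contained Java sketch of that empty-key guard (hypothetical class and variable names; the real check is the patch's isEmptyKeyAggregate, which inspects GroupByDesc#getKeys()):

import java.util.Collections;
import java.util.List;

public class EmptyKeyAggregateCheck {

  // Stand-in for the patched check: a group-by with an empty key list is a
  // global aggregate producing one row, so its ReduceSink must not be
  // deduplicated away against an upstream CLUSTER BY / DISTRIBUTE BY sink.
  static boolean isEmptyKeyAggregate(List<String> groupByKeys) {
    return groupByKeys.isEmpty();
  }

  public static void main(String[] args) {
    // compute_stats(col1, 'hll') over the whole table: no keys, skip dedup
    System.out.println(isEmptyKeyAggregate(Collections.emptyList())); // true
    // an ordinary GROUP BY key: dedup may still be considered
    System.out.println(isEmptyKeyAggregate(List.of("key")));          // false
  }
}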