Index: ql/src/test/results/clientpositive/groupby_sort_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_1.q.out	(revision 1407376)
+++ ql/src/test/results/clientpositive/groupby_sort_1.q.out	(working copy)
@@ -31,14 +31,14 @@
 POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key
--- matches the skewed key
+-- matches the sorted key
 -- adding an order by at the end to make the test results deterministic
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl1
 SELECT key, count(1) FROM T1 GROUP BY key
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key
--- matches the skewed key
+-- matches the sorted key
 -- adding an order by at the end to make the test results deterministic
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl1
@@ -234,12 +234,12 @@
 POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
-PREHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key
+PREHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl2
 SELECT key, val, count(1) FROM T1 GROUP BY key, val
 PREHOOK: type: QUERY
-POSTHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key
+POSTHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl2
 SELECT key, val, count(1) FROM T1 GROUP BY key, val
@@ -935,13 +935,13 @@
 POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
--- by a match to the skewed key
+-- by a match to the sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl3
 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
--- by a match to the skewed key
+-- by a match to the sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl3
 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
@@ -3772,7 +3772,7 @@
             Group By Operator
               aggregations:
                     expr: count(1)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: key
                     type: string
@@ -4018,13 +4018,13 @@
 7	1
 8	2
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys
+-- sorted keys
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl4
 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys
+-- sorted keys
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl4
 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
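Review note: the comment fixes in these hunks track what the tests actually exercise — a group-by on the table's sorted key, not its skewed key. For context, a minimal sketch of the kind of setup these golden files correspond to; the DDL shape and the assumption that hive.map.groupby.sorted is the switch under test are mine, not part of this patch:

    -- Hedged sketch, not from the patch: table bucketed and sorted on the group-by key.
    set hive.map.groupby.sorted=true;   -- assumed feature flag

    CREATE TABLE T1(key STRING, val STRING)
    CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;

    -- GROUP BY key matches the sort key, so the map-side Group By Operator can
    -- exploit the sort order (hence the bucketGroup: false -> true flips in these hunks).
    EXPLAIN EXTENDED
    INSERT OVERWRITE TABLE outputTbl1
    SELECT key, count(1) FROM T1 GROUP BY key;

The companion groupby_sort_skew_1.q.out below presumably runs the same queries with hive.groupby.skewindata enabled; again an assumption, since the .q files themselves are not included in this diff.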
@@ -4375,13 +4375,13 @@
 POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys followed by anything
+-- sorted keys followed by anything
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl5
 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys followed by anything
+-- sorted keys followed by anything
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl5
 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
@@ -5645,7 +5645,7 @@
             Group By Operator
               aggregations:
                     expr: count(1)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: key
                     type: string
@@ -6076,7 +6076,7 @@
             Group By Operator
               aggregations:
                     expr: count(1)
-              bucketGroup: false
+              bucketGroup: true
              keys:
                     expr: _col0
                     type: string
Index: ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out	(revision 1407376)
+++ ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out	(working copy)
@@ -31,14 +31,14 @@
 POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key
--- matches the skewed key
+-- matches the sorted key
 -- adding an order by at the end to make the test results deterministic
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl1
 SELECT key, count(1) FROM T1 GROUP BY key
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key
--- matches the skewed key
+-- matches the sorted key
 -- adding an order by at the end to make the test results deterministic
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl1
@@ -234,12 +234,12 @@
 POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
-PREHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key
+PREHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl2
 SELECT key, val, count(1) FROM T1 GROUP BY key, val
 PREHOOK: type: QUERY
-POSTHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key
+POSTHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl2
 SELECT key, val, count(1) FROM T1 GROUP BY key, val
@@ -1004,13 +1004,13 @@
 POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
--- by a match to the skewed key
+-- by a match to the sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl3
 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
--- by a match to the skewed key
+-- by a match to the sorted key
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl3
 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
@@ -4183,7 +4183,7 @@
             Group By Operator
               aggregations:
                     expr: count(1)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: key
                     type: string
@@ -4493,13 +4493,13 @@
 7	1
 8	2
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys
+-- sorted keys
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl4
 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys
+-- sorted keys
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl4
 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
@@ -4850,13 +4850,13 @@
 POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
 POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys followed by anything
+-- sorted keys followed by anything
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl5
 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
 PREHOOK: type: QUERY
 POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
--- skewed keys followed by anything
+-- sorted keys followed by anything
 EXPLAIN EXTENDED
 INSERT OVERWRITE TABLE outputTbl5
 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
@@ -6121,7 +6121,7 @@
             Group By Operator
               aggregations:
                     expr: count(1)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: key
                     type: string
@@ -6586,7 +6586,7 @@
             Group By Operator
               aggregations:
                     expr: count(1)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: _col0
                     type: string
Index: ql/src/test/results/clientpositive/groupby_sort_5.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_5.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_5.q.out	(working copy)
@@ -0,0 +1,627 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE outputTbl1(key STRING, val STRING, cnt INT)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE outputTbl1(key STRING, val STRING, cnt INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@outputTbl1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by, since the
+-- sorting columns and grouping columns match, and all the bucketing columns
+-- are part of sorting columns
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by, since the
+-- sorting columns and grouping columns match, and all the bucketing columns
+-- are part of sorting columns
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: val
+                    type: string
+              outputColumnNames: key, val
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                      expr: val
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+                        expr: _col2
+                        type: bigint
+                  outputColumnNames: _col0, _col1, _col2
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: string
+                          expr: UDFToInteger(_col2)
+                          type: int
+                    outputColumnNames: _col0, _col1, _col2
+                    File Output Operator
+                      compressed: false
+                      GlobalTableId: 1
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                          name: default.outputtbl1
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.outputtbl1
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+
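Review note: the tell in the plan above is the Group By Operator running in mode: final with no Reduce Operator Tree — the aggregation finishes entirely on the map side. Whether a table qualifies can be checked from its metastore storage information; a small sketch, with the output fields abbreviated from memory and therefore approximate:

    -- T1 here is CLUSTERED BY (val) SORTED BY (key, val) INTO 2 BUCKETS.
    DESCRIBE FORMATTED T1;
    -- Expected storage section, roughly:
    --   Num Buckets:      2
    --   Bucket Columns:   [val]
    --   Sort Columns:     [Order(col:key, order:1), Order(col:val, order:1)]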
+PREHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@outputtbl1
+POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@outputtbl1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM outputTbl1 ORDER BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM outputTbl1 ORDER BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	11	1
+2	12	1
+3	13	1
+7	17	1
+8	18	1
+8	28	1
+PREHOOK: query: DROP TABLE T1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: DROP TABLE T1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (val, key) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (val, key) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by, since the
+-- sorting columns and grouping columns match, and all the bucketing columns
+-- are part of sorting columns
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by, since the
+-- sorting columns and grouping columns match, and all the bucketing columns
+-- are part of sorting columns
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: val
+                    type: string
+              outputColumnNames: key, val
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                      expr: val
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+                        expr: _col2
+                        type: bigint
+                  outputColumnNames: _col0, _col1, _col2
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: string
+                          expr: UDFToInteger(_col2)
+                          type: int
+                    outputColumnNames: _col0, _col1, _col2
+                    File Output Operator
+                      compressed: false
+                      GlobalTableId: 1
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                          name: default.outputtbl1
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.outputtbl1
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+
+PREHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@outputtbl1
+POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@outputtbl1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM outputTbl1 ORDER BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM outputTbl1 ORDER BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	11	1
+2	12	1
+3	13	1
+7	17	1
+8	18	1
+8	28	1
+PREHOOK: query: DROP TABLE T1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: DROP TABLE T1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
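Review note: the next case in this file flips the subset condition. Restating the rule the three groupby_sort_5 cases pin down (my paraphrase of the test comments; the t_* names are illustrative only, not from the patch):

    -- Converted: grouping columns equal the sorting columns, and the
    -- bucketing columns {val} are a subset of the sorting columns {key, val}.
    CREATE TABLE t_ok(key STRING, val STRING)
    CLUSTERED BY (val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE;

    -- Also converted: bucketing columns {val, key} are still a subset of {key, val}.
    CREATE TABLE t_ok2(key STRING, val STRING)
    CLUSTERED BY (val, key) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE;

    -- Not converted (the case that follows): bucketing column val is not
    -- among the sorting columns {key}.
    CREATE TABLE t_not(key STRING, val STRING)
    CLUSTERED BY (val) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;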
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (val) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (val) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE outputTbl2(key STRING, cnt INT)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE outputTbl2(key STRING, cnt INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@outputTbl2
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should not be converted to a map-side group by: although the
+-- sorting columns and grouping columns match, not all the bucketing columns
+-- are part of the sorting columns. However, no hash map aggregation is required
+-- on the map side.
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl2
+SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should not be converted to a map-side group by: although the
+-- sorting columns and grouping columns match, not all the bucketing columns
+-- are part of the sorting columns. However, no hash map aggregation is required
+-- on the map side.
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl2
+SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: true
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: +
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+            outputColumnNames: _col0, _col1
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+                    expr: UDFToInteger(_col1)
+                    type: int
+              outputColumnNames: _col0, _col1
+              File Output Operator
+                compressed: false
+                GlobalTableId: 1
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.outputtbl2
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.outputtbl2
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+
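Review note: the conversion is refused above — a reduce stage remains — yet bucketGroup: true on the map-side Group By Operator records that rows arrive sorted on the group key, so the hash-mode aggregation can flush each group as soon as the key changes instead of holding a large in-memory hash table. A hypothetical way to see the contrast (the unsorted copy and its name are mine, not part of the test):

    -- On an unsorted copy of T1 the same query should show bucketGroup: false,
    -- i.e. a genuine hash aggregation on the map side.
    CREATE TABLE t1_unsorted AS SELECT * FROM T1;
    EXPLAIN
    INSERT OVERWRITE TABLE outputTbl2
    SELECT key, count(1) FROM t1_unsorted GROUP BY key;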
+PREHOOK: query: INSERT OVERWRITE TABLE outputTbl2
+SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@outputtbl2
+POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl2
+SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@outputtbl2
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl2.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM outputTbl2 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM outputTbl2 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl2.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1
+2	1
+3	1
+7	1
+8	2
+PREHOOK: query: DROP TABLE T1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: DROP TABLE T1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl2.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
Index: ql/src/test/results/clientpositive/groupby_sort_4.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_4.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_4.q.out	(working copy)
@@ -0,0 +1,330 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE outputTbl1(key STRING, cnt INT)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE outputTbl1(key STRING, cnt INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@outputTbl1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should not be converted to a map-side group by.
+-- However, there should be no hash-based aggregation on the map-side
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should not be converted to a map-side group by.
+-- However, there should be no hash-based aggregation on the map-side
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: true
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: +
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+            outputColumnNames: _col0, _col1
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+                    expr: UDFToInteger(_col1)
+                    type: int
+              outputColumnNames: _col0, _col1
+              File Output Operator
+                compressed: false
+                GlobalTableId: 1
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.outputtbl1
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.outputtbl1
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+
+PREHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@outputtbl1
+POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@outputtbl1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM outputTbl1 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM outputTbl1 ORDER BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1
+2	1
+3	1
+7	1
+8	2
+PREHOOK: query: CREATE TABLE outputTbl2(key STRING, val STRING, cnt INT)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE outputTbl2(key STRING, val STRING, cnt INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@outputTbl2
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should not be converted to a map-side group by.
+-- Hash-based aggregations should be performed on the map-side
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl2
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should not be converted to a map-side group by.
+-- Hash-based aggregations should be performed on the map-side
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl2
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: val
+                    type: string
+              outputColumnNames: key, val
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                      expr: val
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+                  sort order: ++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+                  tag: -1
+                  value expressions:
+                        expr: _col2
+                        type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+                expr: KEY._col1
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: string
+                  expr: _col2
+                  type: bigint
+            outputColumnNames: _col0, _col1, _col2
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+                    expr: _col1
+                    type: string
+                    expr: UDFToInteger(_col2)
+                    type: int
+              outputColumnNames: _col0, _col1, _col2
+              File Output Operator
+                compressed: false
+                GlobalTableId: 1
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.outputtbl2
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.outputtbl2
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+
+PREHOOK: query: INSERT OVERWRITE TABLE outputTbl2
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@outputtbl2
+POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl2
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@outputtbl2
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl2.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM outputTbl2 ORDER BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM outputTbl2 ORDER BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl2.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	11	1
+2	12	1
+3	13	1
+7	17	1
+8	18	1
+8	28	1
Index: ql/src/test/results/clientpositive/groupby_sort_3.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_3.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_3.q.out	(working copy)
@@ -0,0 +1,280 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
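Review note: groupby_sort_3 covers the prefix case. With T1 CLUSTERED BY (key) SORTED BY (key, val), both query shapes in this file are expected to convert, since the grouping keys are either the full sort order or a prefix of it that still contains the bucketing column — my restatement of the two test comments below:

    -- Expected to convert: grouping keys equal the full sort order.
    EXPLAIN
    INSERT OVERWRITE TABLE outputTbl1
    SELECT key, val, count(1) FROM T1 GROUP BY key, val;

    -- Also expected to convert: {key} is a prefix of the sort order and
    -- covers the bucketing column.
    EXPLAIN
    INSERT OVERWRITE TABLE outputTbl2
    SELECT key, count(1) FROM T1 GROUP BY key;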
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE outputTbl1(key string, val string, cnt int)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE outputTbl1(key string, val string, cnt int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@outputTbl1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: val
+                    type: string
+              outputColumnNames: key, val
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                      expr: val
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+                        expr: _col2
+                        type: bigint
+                  outputColumnNames: _col0, _col1, _col2
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: string
+                          expr: UDFToInteger(_col2)
+                          type: int
+                    outputColumnNames: _col0, _col1, _col2
+                    File Output Operator
+                      compressed: false
+                      GlobalTableId: 1
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                          name: default.outputtbl1
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.outputtbl1
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+
+PREHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@outputtbl1
+POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl1
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@outputtbl1
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM outputTbl1 ORDER BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM outputTbl1 ORDER BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	11	1
+2	12	1
+3	13	1
+7	17	1
+8	18	1
+8	28	1
+PREHOOK: query: CREATE TABLE outputTbl2(key string, cnt int)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE outputTbl2(key string, cnt int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@outputTbl2
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl2
+SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by
+EXPLAIN
+INSERT OVERWRITE TABLE outputTbl2
+SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ]
+POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+ expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE outputTbl2 +SELECT key, count(1) FROM T1 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@outputtbl2 +POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl2 +SELECT key, count(1) FROM T1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@outputtbl2 +POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: SELECT * FROM outputTbl2 ORDER BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM outputTbl2 ORDER BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/groupby_sort_2.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_sort_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby_sort_2.q.out (working copy) @@ -0,0 +1,166 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (val) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (val) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD 
DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: CREATE TABLE outputTbl1(val string, cnt int) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE outputTbl1(val string, cnt int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- The plan should not be converted to a map-side group by even though the group by key +-- matches the sorted key. Adding an order by at the end to make the test results deterministic +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should not be converted to a map-side group by even though the group by key +-- matches the sorted key. Adding an order by at the end to make the test results deterministic +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Select Operator + expressions: + expr: val + type: string + outputColumnNames: val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format:
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: SELECT * FROM outputTbl1 ORDER BY val +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM outputTbl1 ORDER BY val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +11 1 +12 1 +13 1 +17 1 +18 1 +28 1 Index: ql/src/test/queries/clientpositive/groupby_sort_1.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_1.q (revision 1407376) +++ ql/src/test/queries/clientpositive/groupby_sort_1.q (working copy) @@ -14,7 +14,7 @@ CREATE TABLE outputTbl1(key int, cnt int); -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 @@ -27,7 +27,7 @@ CREATE TABLE outputTbl2(key1 int, key2 string, cnt int); --- no map-side group by even if the group by key is a superset of skewed key +-- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val; @@ -60,7 +60,7 @@ CREATE TABLE outputTbl3(key1 int, key2 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; @@ -188,7 +188,7 @@ SELECT * FROM outputTbl1 ORDER BY key; -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; @@ -201,7 +201,7 @@ CREATE TABLE 
outputTbl5(key1 int, key2 int, key3 string, key4 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; Index: ql/src/test/queries/clientpositive/groupby_sort_5.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_5.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_sort_5.q (working copy) @@ -0,0 +1,75 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 10; +set hive.map.groupby.sorted=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +CREATE TABLE outputTbl1(key STRING, val STRING, cnt INT); + +-- The plan should be converted to a map-side group by, since the +-- sorting columns and grouping columns match, and all the bucketing columns +-- are part of sorting columns +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +SELECT * FROM outputTbl1 ORDER BY key, val; + +DROP TABLE T1; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (val, key) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +-- The plan should be converted to a map-side group by, since the +-- sorting columns and grouping columns match, and all the bucketing columns +-- are part of sorting columns +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +SELECT * FROM outputTbl1 ORDER BY key, val; + +DROP TABLE T1; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (val) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +CREATE TABLE outputTbl2(key STRING, cnt INT); + +-- The plan should not be converted to a map-side group by: although the +-- sorting columns and grouping columns match, not all the bucketing columns +-- are part of the sorting columns. However, no hash-based aggregation is required +-- on the map side, as illustrated below.
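+-- An illustrative walkthrough of the expected classification for this layout:
+-- groupByCols = [key], sortCols = [key], bucketCols = [val].
+-- The sort and group by columns match completely, but the sort columns do not
+-- contain the bucketing column val, so matchBucketSortCols should return
+-- PARTIAL_MATCH rather than COMPLETE_MATCH. A partial match only sets
+-- bucketGroup to true on the map-side group by operator: the mapper can
+-- aggregate the pre-sorted rows without a hash table, but a reduce stage
+-- is still planned.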
+EXPLAIN +INSERT OVERWRITE TABLE outputTbl2 +SELECT key, count(1) FROM T1 GROUP BY key; + +INSERT OVERWRITE TABLE outputTbl2 +SELECT key, count(1) FROM T1 GROUP BY key; + +SELECT * FROM outputTbl2 ORDER BY key; + +DROP TABLE T1; Index: ql/src/test/queries/clientpositive/groupby_sort_2.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_2.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_sort_2.q (working copy) @@ -0,0 +1,25 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 10; +set hive.map.groupby.sorted=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (val) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +CREATE TABLE outputTbl1(val string, cnt int); + +-- The plan should not be converted to a map-side group by even though the group by key +-- matches the sorted key. Adding an order by at the end to make the test results deterministic +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val; + +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val; + +SELECT * FROM outputTbl1 ORDER BY val; Index: ql/src/test/queries/clientpositive/groupby_sort_3.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_3.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_sort_3.q (working copy) @@ -0,0 +1,36 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 10; +set hive.map.groupby.sorted=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +CREATE TABLE outputTbl1(key string, val string, cnt int); + +-- The plan should be converted to a map-side group by +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +SELECT * FROM outputTbl1 ORDER BY key, val; + +CREATE TABLE outputTbl2(key string, cnt int); + +-- The plan should be converted to a map-side group by +EXPLAIN +INSERT OVERWRITE TABLE outputTbl2 +SELECT key, count(1) FROM T1 GROUP BY key; + +INSERT OVERWRITE TABLE outputTbl2 +SELECT key, count(1) FROM T1 GROUP BY key; + +SELECT * FROM outputTbl2 ORDER BY key; Index: ql/src/test/queries/clientpositive/groupby_sort_4.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_4.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_sort_4.q (working copy) @@ -0,0 +1,38 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 10; +set hive.map.groupby.sorted=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +CREATE TABLE outputTbl1(key STRING, cnt INT); + +-- The plan should
not be converted to a map-side group by. +-- However, there should be no hash-based aggregation on the map side +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, count(1) FROM T1 GROUP BY key; + +INSERT OVERWRITE TABLE outputTbl1 +SELECT key, count(1) FROM T1 GROUP BY key; + +SELECT * FROM outputTbl1 ORDER BY key; + +CREATE TABLE outputTbl2(key STRING, val STRING, cnt INT); + +-- The plan should not be converted to a map-side group by. +-- Hash-based aggregation should be performed on the map side +EXPLAIN +INSERT OVERWRITE TABLE outputTbl2 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +INSERT OVERWRITE TABLE outputTbl2 +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +SELECT * FROM outputTbl2 ORDER BY key, val; Index: ql/src/test/queries/clientpositive/groupby_sort_skew_1.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (revision 1407376) +++ ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (working copy) @@ -15,7 +15,7 @@ CREATE TABLE outputTbl1(key int, cnt int); -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 @@ -28,7 +28,7 @@ CREATE TABLE outputTbl2(key1 int, key2 string, cnt int); --- no map-side group by even if the group by key is a superset of skewed key +-- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val; @@ -61,7 +61,7 @@ CREATE TABLE outputTbl3(key1 int, key2 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; @@ -189,7 +189,7 @@ SELECT * FROM outputTbl1 ORDER BY key; -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; @@ -202,7 +202,7 @@ CREATE TABLE outputTbl5(key1 int, key2 int, key3 string, key4 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (revision 1407376) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (working copy) @@ -84,26 +84,26 @@ if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEGROUPBYSKEW)) { // process group-by pattern opRules.put(new RuleRegExp("R1", - GroupByOperator.getOperatorName() + "%" + - ReduceSinkOperator.getOperatorName() + "%" + - GroupByOperator.getOperatorName() + "%"), - getMapSortedGroupbyProc(pctx)); + GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + + GroupByOperator.getOperatorName() + "%"), + getMapSortedGroupbyProc(pctx)); } else {
// If hive.groupby.skewindata is set to true, the operator tree is as below opRules.put(new RuleRegExp("R2", - GroupByOperator.getOperatorName() + "%" + - ReduceSinkOperator.getOperatorName() + "%" + - GroupByOperator.getOperatorName() + "%" + - ReduceSinkOperator.getOperatorName() + "%" + - GroupByOperator.getOperatorName() + "%"), - getMapSortedGroupbySkewProc(pctx)); + GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + + GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + + GroupByOperator.getOperatorName() + "%"), + getMapSortedGroupbySkewProc(pctx)); } // The dispatcher fires the processor corresponding to the closest matching // rule and passes the context along Dispatcher disp = - new DefaultRuleDispatcher(getDefaultProc(), opRules, - new GroupByOptimizerContext(conf)); + new DefaultRuleDispatcher(getDefaultProc(), opRules, + new GroupByOptimizerContext(conf)); GraphWalker ogw = new DefaultGraphWalker(disp); // Create a list of topop nodes @@ -118,7 +118,7 @@ return new NodeProcessor() { @Override public Object process(Node nd, Stack stack, - NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { + NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { return null; } }; @@ -136,6 +136,10 @@ NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH }; + private enum ColumnOrderMatch { + NO_MATCH, PREFIX_COL1_MATCH, PREFIX_COL2_MATCH, COMPLETE_MATCH + }; + /** * SortGroupByProcessor. * @@ -150,8 +154,8 @@ // Check if the group by operator has already been processed protected boolean checkGroupByOperatorProcessed( - GroupByOptimizerContext groupBySortOptimizerContext, - GroupByOperator groupByOp) { + GroupByOptimizerContext groupBySortOptimizerContext, + GroupByOperator groupByOp) { // The group by operator has already been processed if (groupBySortOptimizerContext.getListGroupByOperatorsProcessed().contains(groupByOp)) { @@ -163,21 +167,19 @@ } protected void processGroupBy(GroupByOptimizerContext ctx, - Stack stack, - GroupByOperator groupByOp, - int depth) throws SemanticException { + Stack stack, + GroupByOperator groupByOp, + int depth) throws SemanticException { HiveConf hiveConf = ctx.getConf(); GroupByOptimizerSortMatch match = checkSortGroupBy(stack, groupByOp); boolean useMapperSort = - HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT); + HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT); - if (useMapperSort) { - if (match == GroupByOptimizerSortMatch.COMPLETE_MATCH) { - convertGroupByMapSideSortedGroupBy(groupByOp, depth); - } + if (useMapperSort && (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) { + convertGroupByMapSideSortedGroupBy(groupByOp, depth); } else if ((match == GroupByOptimizerSortMatch.PARTIAL_MATCH) || - (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) { + (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) { groupByOp.getConf().setBucketGroup(true); } } @@ -188,7 +190,7 @@ // GBY,RS,GBY... 
(top to bottom) GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 3); - GroupByOptimizerContext ctx = (GroupByOptimizerContext)procCtx; + GroupByOptimizerContext ctx = (GroupByOptimizerContext) procCtx; if (!checkGroupByOperatorProcessed(ctx, groupByOp)) { processGroupBy(ctx, stack, groupByOp, 2); @@ -199,8 +201,8 @@ // Should this group by be converted to a map-side group by, because the grouping keys for // the base table for the group by matches the skewed keys protected GroupByOptimizerSortMatch checkSortGroupBy(Stack stack, - GroupByOperator groupByOp) - throws SemanticException { + GroupByOperator groupByOp) + throws SemanticException { // if this is not a HASH groupby, return if (groupByOp.getConf().getMode() != GroupByDesc.Mode.HASH) { @@ -226,7 +228,7 @@ } // currOp now points to the top-most tablescan operator - TableScanOperator tableScanOp = (TableScanOperator)currOp; + TableScanOperator tableScanOp = (TableScanOperator) currOp; int stackPos = 0; assert stack.get(0) == tableScanOp; @@ -241,11 +243,11 @@ while (currOp != groupByOp) { Operator processOp = currOp; Set newConstantCols = new HashSet(); - currOp = (Operator)(stack.get(++stackPos)); + currOp = (Operator) (stack.get(++stackPos)); // Filters don't change the column names - so, no need to do anything for them if (processOp instanceof SelectOperator) { - SelectOperator selectOp = (SelectOperator)processOp; + SelectOperator selectOp = (SelectOperator) processOp; SelectDesc selectDesc = selectOp.getConf(); if (selectDesc.isSelStarNoCompute()) { @@ -264,7 +266,7 @@ ExprNodeDesc selectColList = selectDesc.getColList().get(pos); if (selectColList instanceof ExprNodeColumnDesc) { String newValue = - tableColsMapping.get(((ExprNodeColumnDesc) selectColList).getColumn()); + tableColsMapping.get(((ExprNodeColumnDesc) selectColList).getColumn()); tableColsMapping.put(outputColumnName, newValue); } else { @@ -287,7 +289,7 @@ // the sorting property is not obeyed for (ExprNodeDesc expr : groupByOp.getConf().getKeys()) { if (expr instanceof ExprNodeColumnDesc) { - String groupByKeyColumn = ((ExprNodeColumnDesc)expr).getColumn(); + String groupByKeyColumn = ((ExprNodeColumnDesc) expr).getColumn(); // ignore if it is a constant if (constantCols.contains(groupByKeyColumn)) { continue; @@ -303,7 +305,7 @@ } // Constants and nulls are OK else if ((expr instanceof ExprNodeConstantDesc) || - (expr instanceof ExprNodeNullDesc)) { + (expr instanceof ExprNodeNullDesc)) { continue; } else { return GroupByOptimizerSortMatch.NO_MATCH; @@ -312,17 +314,18 @@ if (!table.isPartitioned()) { List sortCols = Utilities.getColumnNamesFromSortCols(table.getSortCols()); - return matchSortColumns(groupByCols, sortCols); + List bucketCols = table.getBucketCols(); + return matchBucketSortCols(groupByCols, bucketCols, sortCols); } else { PrunedPartitionList partsList = null; try { partsList = pGraphContext.getOpToPartList().get(tableScanOp); if (partsList == null) { partsList = PartitionPruner.prune(table, - pGraphContext.getOpToPartPruner().get(tableScanOp), - pGraphContext.getConf(), - table.getTableName(), - pGraphContext.getPrunedPartitions()); + pGraphContext.getOpToPartPruner().get(tableScanOp), + pGraphContext.getConf(), + table.getTableName(), + pGraphContext.getPrunedPartitions()); pGraphContext.getOpToPartList().put(tableScanOp, partsList); } } catch (HiveException e) { @@ -333,7 +336,8 @@ GroupByOptimizerSortMatch currentMatch = GroupByOptimizerSortMatch.COMPLETE_MATCH; for (Partition part : partsList.getNotDeniedPartns()) { List 
sortCols = part.getSortColNames(); - GroupByOptimizerSortMatch match = matchSortColumns(groupByCols, sortCols); + List bucketCols = part.getBucketCols(); + GroupByOptimizerSortMatch match = matchBucketSortCols(groupByCols, bucketCols, sortCols); if (match == GroupByOptimizerSortMatch.NO_MATCH) { return match; } @@ -346,34 +350,103 @@ } } + /* + * Return how the two lists of columns passed in match. + * Return NO_MATCH if either of the lists is empty or null, or if there is a mismatch. + * For example: ([], []), ([], ["a"]), (["a"],["b"]) and (["a", "b"], ["a","c"]) return NO_MATCH + * + * Return COMPLETE_MATCH if both the lists are non-empty and are the same + * Return PREFIX_COL1_MATCH if list1 is a strict prefix of list2 and + * return PREFIX_COL2_MATCH if list2 is a strict prefix of list1 + * + * For example: (["a"], ["a"]), (["a"], ["a", "b"]) and (["a", "b"], ["a"]) return + * COMPLETE_MATCH, PREFIX_COL1_MATCH and PREFIX_COL2_MATCH respectively. + */ + private ColumnOrderMatch matchColumnOrder(List cols1, List cols2) { + int numCols1 = cols1 == null ? 0 : cols1.size(); + int numCols2 = cols2 == null ? 0 : cols2.size(); + + if (numCols1 == 0 || numCols2 == 0) { + return ColumnOrderMatch.NO_MATCH; + } + + for (int pos = 0; pos < Math.min(numCols1, numCols2); pos++) { + if (!cols1.get(pos).equals(cols2.get(pos))) { + return ColumnOrderMatch.NO_MATCH; + } + } + + return (numCols1 == numCols2) ? + ColumnOrderMatch.COMPLETE_MATCH : + ((numCols1 < numCols2) ? ColumnOrderMatch.PREFIX_COL1_MATCH : + ColumnOrderMatch.PREFIX_COL2_MATCH); + } + /** - * Given the group by keys, sort columns, this method + * Given the group by keys, bucket columns and sort columns, this method * determines if we can use sorted group by or not. - * We can use map-side sort group by group by columns match the sorted columns - * in exactly the same order. * * @param groupByCols + * @param bucketCols * @param sortCols * @return * @throws SemanticException */ - private GroupByOptimizerSortMatch matchSortColumns( - List groupByCols, - List sortCols) throws SemanticException { + private GroupByOptimizerSortMatch matchBucketSortCols( + List groupByCols, + List bucketCols, + List sortCols) throws SemanticException { - if (sortCols == null || sortCols.size() == 0) { + /* + * >> means superset of + * If the grouping columns are a,b,c and the sorting columns are a,b + * grouping columns >> sorting columns + * (or grouping columns are a superset of sorting columns) + * + * Similarly << means subset of + * + * No intersection between Sort Columns and BucketCols: + * + * 1. Sort Cols = Group By Cols ---> Partial Match + * 2. Group By Cols >> Sort By Cols --> No Match + * 3. Group By Cols << Sort By Cols --> Partial Match + * + * BucketCols <= SortCols (bucket columns are either the same as or a prefix of the sort columns) + * + * 1. Sort Cols = Group By Cols ---> Complete Match + * 2. Group By Cols >> Sort By Cols --> No Match + * 3. Group By Cols << Sort By Cols --> Complete Match if Group By Cols >= BucketCols + * --> Partial Match otherwise + * + * BucketCols >> SortCols (bucket columns are a superset of the sorting columns) + * + * 1. group by cols <= sort cols --> partial match + * 2.
group by cols >> sort cols --> no match + * + * One exception to this rule is: + * If GroupByCols == SortCols and all bucketing columns are part of sorting columns + * (in any order), it is a complete match + */ + ColumnOrderMatch bucketSortColsMatch = matchColumnOrder(bucketCols, sortCols); + ColumnOrderMatch sortGroupByColsMatch = matchColumnOrder(sortCols, groupByCols); + switch (sortGroupByColsMatch) { + case NO_MATCH: return GroupByOptimizerSortMatch.NO_MATCH; + case COMPLETE_MATCH: + return ((bucketSortColsMatch == ColumnOrderMatch.COMPLETE_MATCH) || + (bucketSortColsMatch == ColumnOrderMatch.PREFIX_COL1_MATCH) || + ((bucketCols != null) && !bucketCols.isEmpty() && sortCols.containsAll(bucketCols))) ? + GroupByOptimizerSortMatch.COMPLETE_MATCH : + GroupByOptimizerSortMatch.PARTIAL_MATCH; + case PREFIX_COL1_MATCH: + return GroupByOptimizerSortMatch.NO_MATCH; + case PREFIX_COL2_MATCH: + return ((bucketSortColsMatch == ColumnOrderMatch.NO_MATCH) || + (bucketCols.size() > groupByCols.size())) ? + GroupByOptimizerSortMatch.PARTIAL_MATCH : + GroupByOptimizerSortMatch.COMPLETE_MATCH; } - - int num = sortCols.size() < groupByCols.size() ? sortCols.size() : groupByCols.size(); - for (int i = 0; i < num; i++) { - if (!sortCols.get(i).equals(groupByCols.get(i))) { - return GroupByOptimizerSortMatch.NO_MATCH; - } - } - - return sortCols.size() == groupByCols.size() ? - GroupByOptimizerSortMatch.COMPLETE_MATCH : GroupByOptimizerSortMatch.PARTIAL_MATCH; + return GroupByOptimizerSortMatch.NO_MATCH; } // Convert the group by to a map-side group by @@ -401,7 +474,7 @@ Object... nodeOutputs) throws SemanticException { // GBY,RS,GBY,RS,GBY... (top to bottom) GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 5); - GroupByOptimizerContext ctx = (GroupByOptimizerContext)procCtx; + GroupByOptimizerContext ctx = (GroupByOptimizerContext) procCtx; if (!checkGroupByOperatorProcessed(ctx, groupByOp)) { processGroupBy(ctx, stack, groupByOp, 4); @@ -424,7 +497,7 @@ } public void setListGroupByOperatorsProcessed( - List listGroupByOperatorsProcessed) { + List listGroupByOperatorsProcessed) { this.listGroupByOperatorsProcessed = listGroupByOperatorsProcessed; }
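To make the matching rules above concrete, the following self-contained sketch (illustrative only: it re-implements the prefix comparison from matchColumnOrder outside the optimizer, and the class name is invented for the example) shows how the bucket, sort, and group by column lists from the new tests classify:

import java.util.Arrays;
import java.util.List;

public class ColumnOrderMatchSketch {
  enum Match { NO_MATCH, PREFIX_COL1_MATCH, PREFIX_COL2_MATCH, COMPLETE_MATCH }

  // Mirrors GroupByOptimizer.matchColumnOrder: two lists match only if one is
  // an ordered prefix of the other (or they are identical).
  static Match matchColumnOrder(List<String> cols1, List<String> cols2) {
    int numCols1 = (cols1 == null) ? 0 : cols1.size();
    int numCols2 = (cols2 == null) ? 0 : cols2.size();
    if (numCols1 == 0 || numCols2 == 0) {
      return Match.NO_MATCH; // an empty or null list never matches
    }
    for (int pos = 0; pos < Math.min(numCols1, numCols2); pos++) {
      if (!cols1.get(pos).equals(cols2.get(pos))) {
        return Match.NO_MATCH; // the lists diverge before either one ends
      }
    }
    return (numCols1 == numCols2) ? Match.COMPLETE_MATCH
        : ((numCols1 < numCols2) ? Match.PREFIX_COL1_MATCH : Match.PREFIX_COL2_MATCH);
  }

  public static void main(String[] args) {
    // groupby_sort_2.q: SORTED BY (val), GROUP BY val -- the sort and group by
    // columns agree, so this prints COMPLETE_MATCH ...
    System.out.println(matchColumnOrder(Arrays.asList("val"), Arrays.asList("val")));
    // ... but CLUSTERED BY (key) shares no prefix with SORTED BY (val), so the
    // bucket/sort comparison prints NO_MATCH, which downgrades the overall
    // result to PARTIAL_MATCH:
    System.out.println(matchColumnOrder(Arrays.asList("key"), Arrays.asList("val")));
    // groupby_sort_3.q: SORTED BY (key, val), GROUP BY key -- the group by
    // columns are a strict prefix of the sort columns, so this prints
    // PREFIX_COL2_MATCH:
    System.out.println(matchColumnOrder(Arrays.asList("key", "val"), Arrays.asList("key")));
  }
}

In the patch itself, checkSortGroupBy combines two such comparisons, matchColumnOrder(bucketCols, sortCols) and matchColumnOrder(sortCols, groupByCols), and matchBucketSortCols maps that pair to NO_MATCH, PARTIAL_MATCH, or COMPLETE_MATCH as tabulated in the comment above.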