Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1423271)
+++ conf/hive-default.xml.template (working copy)
@@ -507,6 +507,22 @@
+<property>
+  <name>hive.new.job.grouping.set.cardinality</name>
+  <value>30</value>
+  <description>
+    Whether a new map-reduce job should be launched for grouping sets/rollups/cubes.
+    For a query like: select a, b, c, count(1) from T group by a, b, c with rollup;
+    4 rows are created per input row: (a, b, c), (a, b, null), (a, null, null), (null, null, null).
+    This can lead to an explosion across the map-reduce boundary if the cardinality of T is very high
+    and map-side aggregation does not do a very good job.
+
+    This parameter decides whether Hive should add an additional map-reduce job. If the grouping set
+    cardinality (4 in the example above) is more than this value, a new MR job is added under the
+    assumption that the original group by will reduce the data size.
+  </description>
+</property>
+
 <property>
   <name>hive.join.emit.interval</name>
   <value>1000</value>
   <description>How many rows in the right-most join operand Hive should buffer before emitting the join result.</description>
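As an illustrative sketch (not part of the patch itself; T1 stands for any table with string columns a and b, as in the tests added below): a cube over two keys expands each input row into 4 grouping sets, so the default threshold of 30 leaves the plan as a single MR job, while lowering the threshold below 4 adds the second job:

    -- Single MR job: grouping-set cardinality 4 <= threshold (default 30).
    EXPLAIN SELECT a, b, count(*) FROM T1 GROUP BY a, b WITH CUBE;

    -- Two MR jobs: cardinality 4 > threshold 2. The first job aggregates
    -- on (a, b) alone; the second expands and finalizes the grouping sets.
    set hive.new.job.grouping.set.cardinality=2;
    EXPLAIN SELECT a, b, count(*) FROM T1 GROUP BY a, b WITH CUBE;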
Index: data/files/grouping_sets1.txt
===================================================================
--- data/files/grouping_sets1.txt (revision 0)
+++ data/files/grouping_sets1.txt (working copy)
@@ -0,0 +1,6 @@
+8 1 1
+5 1 2
+1 1 3
+2 2 4
+2 3 5
+3 2 8
Index: data/files/grouping_sets2.txt
===================================================================
--- data/files/grouping_sets2.txt (revision 0)
+++ data/files/grouping_sets2.txt (working copy)
@@ -0,0 +1,6 @@
+8 1 1
+1 2 2
+1 1 3
+2 2 4
+2 3 5
+2 2 8
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1423271)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -404,6 +404,7 @@
HIVEMAPAGGRHASHMINREDUCTION("hive.map.aggr.hash.min.reduction", (float) 0.5),
HIVEMULTIGROUPBYSINGLEREDUCER("hive.multigroupby.singlereducer", true),
HIVE_MAP_GROUPBY_SORT("hive.map.groupby.sorted", false),
+ HIVE_NEW_JOB_GROUPING_SET_CARDINALITY("hive.new.job.grouping.set.cardinality", 30),
// for hive udtf operator
HIVEUDTFAUTOPROGRESS("hive.udtf.auto.progress", false),
Index: ql/src/test/results/clientpositive/groupby_grouping_sets4.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets4.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets4.q.out (working copy)
@@ -0,0 +1,673 @@
+PREHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- This tests that cubes and rollups work fine inside sub-queries.
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This tests that cubes and rollups work fine inside sub-queries.
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) a) (. (TOK_TABLE_OR_COL subq2) a)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-3
+ Stage-3 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ $INTNAME1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+1 NULL 1 1 NULL 1
+1 NULL 1 1 1 1
+1 1 1 1 NULL 1
+1 1 1 1 1 1
+2 NULL 2 2 NULL 2
+2 NULL 2 2 2 1
+2 NULL 2 2 3 1
+2 2 1 2 NULL 2
+2 2 1 2 2 1
+2 2 1 2 3 1
+2 3 1 2 NULL 2
+2 3 1 2 2 1
+2 3 1 2 3 1
+PREHOOK: query: -- Since 4 grouping sets would be generated for each sub-query, an additional MR job should be created
+-- for each of them
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since 4 grouping sets would be generated for each sub-query, an additional MR job should be created
+-- for each of them
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) a) (. (TOK_TABLE_OR_COL subq2) a)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-3 depends on stages: Stage-2, Stage-5
+ Stage-4 is a root stage
+ Stage-5 depends on stages: Stage-4
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ $INTNAME1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+1 NULL 1 1 NULL 1
+1 NULL 1 1 1 1
+1 1 1 1 NULL 1
+1 1 1 1 1 1
+2 NULL 2 2 NULL 2
+2 NULL 2 2 2 1
+2 NULL 2 2 3 1
+2 2 1 2 NULL 2
+2 2 1 2 2 1
+2 2 1 2 3 1
+2 3 1 2 NULL 2
+2 3 1 2 2 1
+2 3 1 2 3 1
Index: ql/src/test/results/clientpositive/groupby_grouping_sets3.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets3.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets3.q.out (working copy)
@@ -0,0 +1,332 @@
+PREHOOK: query: -- In this test, 2 files are loaded into table T1. The data contains rows with the same value of a and b,
+-- with a different number of rows for a and b in each file. Since BucketizedHiveInputFormat is used,
+-- this tests that the aggregate function stores the partial aggregate state correctly even if an
+-- additional MR job is created for processing the grouping sets.
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- In this test, 2 files are loaded into table T1. The data contains rows with the same value of a and b,
+-- with a different number of rows for a and b in each file. Since BucketizedHiveInputFormat is used,
+-- this tests that the aggregate function stores the partial aggregate state correctly even if an
+-- additional MR job is created for processing the grouping sets.
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets2.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets2.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- The query below will execute in a single MR job, since 4 rows are generated per input row
+-- (cube of a,b will lead to (a,b), (a, null), (null, b) and (null, null)) and the
+-- default value of hive.new.job.grouping.set.cardinality is more than 4.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The query below will execute in a single MR job, since 4 rows are generated per input row
+-- (cube of a,b will lead to (a,b), (a, null), (null, b) and (null, null)) and the
+-- default value of hive.new.job.grouping.set.cardinality is more than 4.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: string
+ outputColumnNames: a, b, c
+ Group By Operator
+ aggregations:
+ expr: avg(c)
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: struct<count:bigint,sum:double>
+ expr: _col4
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: avg(VALUE._col0)
+ expr: count(VALUE._col1)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: double
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 3.8333333333333335 12
+NULL 1 2.0 5
+NULL 2 5.2 5
+NULL 3 5.0 2
+1 NULL 2.6666666666666665 3
+1 1 3.0 2
+1 2 2.0 1
+2 NULL 5.2 5
+2 2 5.333333333333333 3
+2 3 5.0 2
+3 NULL 8.0 1
+3 2 8.0 1
+5 NULL 2.0 1
+5 1 2.0 1
+8 NULL 1.0 2
+8 1 1.0 2
+PREHOOK: query: -- The query below will execute in 2 MR jobs, since hive.new.job.grouping.set.cardinality is set to 2.
+-- The partial aggregation state should be maintained correctly across MR jobs.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The query below will execute in 2 MR jobs, since hive.new.job.grouping.set.cardinality is set to 2.
+-- The partial aggregation state should be maintained correctly across MR jobs.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: string
+ outputColumnNames: a, b, c
+ Group By Operator
+ aggregations:
+ expr: avg(c)
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: struct<count:bigint,sum:double>
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: avg(VALUE._col0)
+ expr: count(VALUE._col1)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: struct<count:bigint,sum:double>
+ expr: _col4
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: avg(VALUE._col0)
+ expr: count(VALUE._col1)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: double
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 3.8333333333333335 12
+NULL 1 2.0 5
+NULL 2 5.2 5
+NULL 3 5.0 2
+1 NULL 2.6666666666666665 3
+1 1 3.0 2
+1 2 2.0 1
+2 NULL 5.2 5
+2 2 5.333333333333333 3
+2 3 5.0 2
+3 NULL 8.0 1
+3 2 8.0 1
+5 NULL 2.0 1
+5 1 2.0 1
+8 NULL 1.0 2
+8 1 1.0 2
Index: ql/src/test/results/clientpositive/groupby_grouping_sets2.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets2.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets2.q.out (working copy)
@@ -0,0 +1,520 @@
+PREHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 6
+NULL 1 2
+NULL 2 3
+NULL 3 1
+1 NULL 1
+1 1 1
+2 NULL 2
+2 2 1
+2 3 1
+3 NULL 1
+3 2 1
+5 NULL 1
+5 2 1
+8 NULL 1
+8 1 1
+PREHOOK: query: EXPLAIN
+SELECT a, b, sum(c) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT a, b, sum(c) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL c)))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: string
+ outputColumnNames: a, b, c
+ Group By Operator
+ aggregations:
+ expr: sum(c)
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: double
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: double
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: double
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, sum(c) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, sum(c) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 23.0
+NULL 1 4.0
+NULL 2 14.0
+NULL 3 5.0
+1 NULL 3.0
+1 1 3.0
+2 NULL 9.0
+2 2 4.0
+2 3 5.0
+3 NULL 8.0
+3 2 8.0
+5 NULL 2.0
+5 2 2.0
+8 NULL 1.0
+8 1 1.0
+PREHOOK: query: CREATE TABLE T2(a STRING, b STRING, c int, d int)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(a STRING, b STRING, c int, d int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: INSERT OVERWRITE TABLE T2
+SELECT a, b, c, c from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t2
+POSTHOOK: query: INSERT OVERWRITE TABLE T2
+SELECT a, b, c, c from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t2
+POSTHOOK: Lineage: t2.a SIMPLE [(t1)t1.FieldSchema(name:a, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.b SIMPLE [(t1)t1.FieldSchema(name:b, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.c EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.d EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+PREHOOK: query: EXPLAIN
+SELECT a, b, sum(c+d) from T2 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT a, b, sum(c+d) from T2 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t2.a SIMPLE [(t1)t1.FieldSchema(name:a, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.b SIMPLE [(t1)t1.FieldSchema(name:b, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.c EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.d EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION sum (+ (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: int
+ expr: d
+ type: int
+ outputColumnNames: a, b, c, d
+ Group By Operator
+ aggregations:
+ expr: sum((c + d))
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, sum(c+d) from T2 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, sum(c+d) from T2 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t2.a SIMPLE [(t1)t1.FieldSchema(name:a, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.b SIMPLE [(t1)t1.FieldSchema(name:b, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.c EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.d EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+NULL NULL 46
+NULL 1 8
+NULL 2 28
+NULL 3 10
+1 NULL 6
+1 1 6
+2 NULL 18
+2 2 8
+2 3 10
+3 NULL 16
+3 2 16
+5 NULL 4
+5 2 4
+8 NULL 2
+8 1 2
Index: ql/src/test/results/clientpositive/groupby_grouping_sets5.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets5.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets5.q.out (working copy)
@@ -0,0 +1,433 @@
+PREHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- This tests that cubes and rollups work fine where the source is a sub-query
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This tests that cubes and rollups work fine where the source is a sub-query
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 6
+NULL 1 2
+NULL 2 3
+NULL 3 1
+1 NULL 1
+1 1 1
+2 NULL 2
+2 2 1
+2 3 1
+3 NULL 1
+3 2 1
+5 NULL 1
+5 2 1
+8 NULL 1
+8 1 1
+PREHOOK: query: -- Since 4 grouping sets would be generated for the cube, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since 4 grouping sets would be generated for the cube, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-3 depends on stages: Stage-2
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 6
+NULL 1 2
+NULL 2 3
+NULL 3 1
+1 NULL 1
+1 1 1
+2 NULL 2
+2 2 1
+2 3 1
+3 NULL 1
+3 2 1
+5 NULL 1
+5 2 1
+8 NULL 1
+8 1 1
Index: ql/src/test/results/clientnegative/groupby_grouping_sets6.q.out
===================================================================
--- ql/src/test/results/clientnegative/groupby_grouping_sets6.q.out (revision 0)
+++ ql/src/test/results/clientnegative/groupby_grouping_sets6.q.out (working copy)
@@ -0,0 +1,6 @@
+PREHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+FAILED: SemanticException [Error 10216]: An additional MR job is introduced since the cardinality of grouping sets is more than hive.new.job.grouping.set.cardinality. This functionality is not supported with distincts. Either set hive.new.job.grouping.set.cardinality to a high number (higher than the number of grouping set keys in the query), or rewrite the query to not use distincts.
Index: ql/src/test/results/clientnegative/groupby_grouping_sets7.q.out
===================================================================
--- ql/src/test/results/clientnegative/groupby_grouping_sets7.q.out (revision 0)
+++ ql/src/test/results/clientnegative/groupby_grouping_sets7.q.out (working copy)
@@ -0,0 +1,6 @@
+PREHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+FAILED: SemanticException [Error 10215]: An additional MR job is introduced since the cardinality of grouping sets is more than hive.new.job.grouping.set.cardinality. There is no need to handle skew separately. set hive.groupby.skewindata to false.
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets3.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets3.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets3.q (working copy)
@@ -0,0 +1,26 @@
+-- In this test, 2 files are loaded into table T1. The data contains rows with the same value of a and b,
+-- with a different number of rows for a and b in each file. Since BucketizedHiveInputFormat is used,
+-- this tests that the aggregate function stores the partial aggregate state correctly even if an
+-- additional MR job is created for processing the grouping sets.
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets1.txt' INTO TABLE T1;
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets2.txt' INTO TABLE T1;
+
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
+-- The query below will execute in a single MR job, since 4 rows are generated per input row
+-- (cube of a,b will lead to (a,b), (a, null), (null, b) and (null, null)) and the
+-- default value of hive.new.job.grouping.set.cardinality is more than 4.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+
+set hive.new.job.grouping.set.cardinality=2;
+
+-- The query below will execute in 2 MR jobs, since hive.new.job.grouping.set.cardinality is set to 2.
+-- The partial aggregation state should be maintained correctly across MR jobs.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+
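To see why the partial aggregation state matters here, the following is a conceptual HiveQL hand-rewrite of what the two-job plan computes for the (a, null) grouping set; it is for illustration only, not what the planner emits (the planner ships avg's partial state as the struct-typed values visible in the plans above):

    -- Job 1 (mode: partials): per-(a, b) partial state for avg and count.
    -- Job 2 (mode: final): merge the partials per grouping set; avg must be
    -- carried as (sum, count) across the job boundary, never pre-divided.
    SELECT a, sum(s) / sum(cnt) AS avg_c, sum(n) AS cnt_all
    FROM (SELECT a, b, sum(c) AS s, count(c) AS cnt, count(*) AS n
          FROM T1 GROUP BY a, b) partials
    GROUP BY a;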
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets4.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets4.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets4.q (working copy)
@@ -0,0 +1,39 @@
+set hive.merge.mapfiles = false;
+set hive.merge.mapredfiles = false;
+-- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1;
+
+-- This tests that cubes and rollups work fine inside sub-queries.
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
+set hive.new.job.grouping.set.cardinality=2;
+
+-- Since 4 grouping sets would be generated for each sub-query, an additional MR job should be created
+-- for each of them
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets5.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets5.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets5.q (working copy)
@@ -0,0 +1,25 @@
+set hive.merge.mapfiles = false;
+set hive.merge.mapredfiles = false;
+-- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1;
+
+-- This tests that cubes and rollups work fine where the source is a sub-query
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
+
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
+
+set hive.new.job.grouping.set.cardinality=2;
+
+-- Since 4 grouping sets would be generated for the cube, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
+
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets2.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets2.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets2.q (working copy)
@@ -0,0 +1,23 @@
+set hive.new.job.grouping.set.cardinality=2;
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1;
+
+-- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) from T1 group by a, b with cube;
+SELECT a, b, count(*) from T1 group by a, b with cube;
+
+EXPLAIN
+SELECT a, b, sum(c) from T1 group by a, b with cube;
+SELECT a, b, sum(c) from T1 group by a, b with cube;
+
+CREATE TABLE T2(a STRING, b STRING, c int, d int);
+
+INSERT OVERWRITE TABLE T2
+SELECT a, b, c, c from T1;
+
+EXPLAIN
+SELECT a, b, sum(c+d) from T2 group by a, b with cube;
+SELECT a, b, sum(c+d) from T2 group by a, b with cube;
Index: ql/src/test/queries/clientnegative/groupby_grouping_sets6.q
===================================================================
--- ql/src/test/queries/clientnegative/groupby_grouping_sets6.q (revision 0)
+++ ql/src/test/queries/clientnegative/groupby_grouping_sets6.q (working copy)
@@ -0,0 +1,8 @@
+set hive.new.job.grouping.set.cardinality=2;
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+-- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+-- This is not allowed with distincts.
+SELECT a, b, count(distinct c) from T1 group by a, b with cube;
+
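For reference, a sketch of the workaround named in the expected error message (same T1; a cube over two keys has a grouping-set cardinality of 4):

    -- Keep the threshold above the grouping-set cardinality so the query
    -- stays in a single MR job, where distinct aggregates are supported.
    set hive.new.job.grouping.set.cardinality=30;
    SELECT a, b, count(distinct c) from T1 group by a, b with cube;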
Index: ql/src/test/queries/clientnegative/groupby_grouping_sets7.q
===================================================================
--- ql/src/test/queries/clientnegative/groupby_grouping_sets7.q (revision 0)
+++ ql/src/test/queries/clientnegative/groupby_grouping_sets7.q (working copy)
@@ -0,0 +1,10 @@
+set hive.new.job.grouping.set.cardinality=2;
+set hive.map.aggr=true;
+set hive.groupby.skewindata=true;
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+-- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+-- This is not allowed with map-side aggregation and skew
+SELECT a, b, count(1) from T1 group by a, b with cube;
+
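Again for reference, a sketch of the workaround spelled out in the expected error message (same T1, keeping hive.new.job.grouping.set.cardinality=2):

    -- The additional grouping-sets job already spreads the expanded rows
    -- across reducers, so separate skew handling is redundant.
    set hive.groupby.skewindata=false;
    SELECT a, b, count(1) from T1 group by a, b with cube;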
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (revision 1423271)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (working copy)
@@ -214,9 +214,6 @@
HiveConf.ConfVars.HIVESENDHEARTBEAT);
countAfterReport = 0;
groupingSetsPresent = conf.isGroupingSetsPresent();
- groupingSets = conf.getListGroupingSets();
- groupingSetsPosition = conf.getGroupingSetPosition();
-
ObjectInspector rowInspector = inputObjInspectors[0];
// init keyFields
@@ -236,6 +233,8 @@
// Initialize the constants for the grouping sets, so that they can be re-used for
// each row
if (groupingSetsPresent) {
+ groupingSets = conf.getListGroupingSets();
+ groupingSetsPosition = conf.getGroupingSetPosition();
newKeysGroupingSets = new ArrayList