Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1428149)
+++ conf/hive-default.xml.template (working copy)
@@ -526,6 +526,22 @@
+<property>
+  <name>hive.new.job.grouping.set.cardinality</name>
+  <value>30</value>
+  <description>
+    Whether a new map-reduce job should be launched for grouping sets/rollups/cubes.
+    For a query like: select a, b, c, count(1) from T group by a, b, c with rollup;
+    4 rows are created per row: (a, b, c), (a, b, null), (a, null, null), (null, null, null).
+    This can lead to an explosion across the map-reduce boundary if the cardinality of T is very high,
+    and map-side aggregation does not do a very good job.
+
+    This parameter decides whether Hive should add an additional map-reduce job. If the grouping set
+    cardinality (4 in the example above) is more than this value, a new MR job is added under the
+    assumption that the original group by will reduce the data size.
+  </description>
+</property>
+
 <property>
   <name>hive.join.emit.interval</name>
   <value>1000</value>
   <description>How many rows in the right-most join operand Hive should buffer before emitting the join result.</description>
 </property>
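
Illustrative note, not part of the patch: a minimal HiveQL sketch of how this parameter
behaves, assuming a table T(a, b, c) as in the description above. Each input row expands
into 4 grouping-set rows, so a threshold below 4 adds the second MR job:

    -- 4 grouping sets: (a,b,c), (a,b,null), (a,null,null), (null,null,null)
    set hive.new.job.grouping.set.cardinality = 2;   -- 4 > 2: the plan gains an extra MR job
    EXPLAIN SELECT a, b, c, count(1) FROM T GROUP BY a, b, c WITH ROLLUP;

    set hive.new.job.grouping.set.cardinality = 30;  -- 4 <= 30: single MR job, as before
    EXPLAIN SELECT a, b, c, count(1) FROM T GROUP BY a, b, c WITH ROLLUP;
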
Index: data/files/grouping_sets1.txt
===================================================================
--- data/files/grouping_sets1.txt (revision 0)
+++ data/files/grouping_sets1.txt (working copy)
@@ -0,0 +1,6 @@
+8 1 1
+5 1 2
+1 1 3
+2 2 4
+2 3 5
+3 2 8
Index: data/files/grouping_sets2.txt
===================================================================
--- data/files/grouping_sets2.txt (revision 0)
+++ data/files/grouping_sets2.txt (working copy)
@@ -0,0 +1,6 @@
+8 1 1
+1 2 2
+1 1 3
+2 2 4
+2 3 5
+2 2 8
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1428149)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -407,6 +407,7 @@
HIVEMULTIGROUPBYSINGLEREDUCER("hive.multigroupby.singlereducer", true),
HIVE_MAP_GROUPBY_SORT("hive.map.groupby.sorted", false),
HIVE_GROUPBY_ORDERBY_POSITION_ALIAS("hive.groupby.orderby.position.alias", false),
+ HIVE_NEW_JOB_GROUPING_SET_CARDINALITY("hive.new.job.grouping.set.cardinality", 30),
// for hive udtf operator
HIVEUDTFAUTOPROGRESS("hive.udtf.auto.progress", false),
Index: ql/src/test/results/clientpositive/groupby_grouping_sets4.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets4.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets4.q.out (working copy)
@@ -0,0 +1,673 @@
+PREHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- This tests that cubes and rollups work fine inside sub-queries.
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This tests that cubes and rollups work fine inside sub-queries.
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) a) (. (TOK_TABLE_OR_COL subq2) a)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-3
+ Stage-3 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ $INTNAME1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+1 NULL 1 1 NULL 1
+1 NULL 1 1 1 1
+1 1 1 1 NULL 1
+1 1 1 1 1 1
+2 NULL 2 2 NULL 2
+2 NULL 2 2 2 1
+2 NULL 2 2 3 1
+2 2 1 2 NULL 2
+2 2 1 2 2 1
+2 2 1 2 3 1
+2 3 1 2 NULL 2
+2 3 1 2 2 1
+2 3 1 2 3 1
+PREHOOK: query: -- Since 4 grouping sets would be generated for each sub-query, an additional MR job should be created
+-- for each of them
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since 4 grouping sets would be generated for each sub-query, an additional MR job should be created
+-- for each of them
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_WHERE (< (TOK_TABLE_OR_COL a) 3)) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) a) (. (TOK_TABLE_OR_COL subq2) a)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-3 depends on stages: Stage-2, Stage-5
+ Stage-4 is a root stage
+ Stage-5 depends on stages: Stage-4
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ $INTNAME1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (a < 3.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+1 NULL 1 1 NULL 1
+1 NULL 1 1 1 1
+1 1 1 1 NULL 1
+1 1 1 1 1 1
+2 NULL 2 2 NULL 2
+2 NULL 2 2 2 1
+2 NULL 2 2 3 1
+2 2 1 2 NULL 2
+2 2 1 2 2 1
+2 2 1 2 3 1
+2 3 1 2 NULL 2
+2 3 1 2 2 1
+2 3 1 2 3 1
Index: ql/src/test/results/clientpositive/groupby_grouping_sets3.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets3.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets3.q.out (working copy)
@@ -0,0 +1,332 @@
+PREHOOK: query: -- In this test, 2 files are loaded into table T1. The data contains rows with the same value of a and b,
+-- with a different number of rows for a and b in each file. Since BucketizedHiveInputFormat is used,
+-- this tests that the aggregate function stores the partial aggregate state correctly even if an
+-- additional MR job is created for processing the grouping sets.
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- In this test, 2 files are loaded into table T1. The data contains rows with the same value of a and b,
+-- with a different number of rows for a and b in each file. Since BucketizedHiveInputFormat is used,
+-- this tests that the aggregate function stores the partial aggregate state correctly even if an
+-- additional MR job is created for processing the grouping sets.
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets2.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets2.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- The query below will execute in a single MR job, since 4 rows are generated per input row
+-- (cube of a,b will lead to (a,b), (a, null), (null, b) and (null, null)), and
+-- hive.new.job.grouping.set.cardinality is more than 4.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The query below will execute in a single MR job, since 4 rows are generated per input row
+-- (cube of a,b will lead to (a,b), (a, null), (null, b) and (null, null)), and
+-- hive.new.job.grouping.set.cardinality is more than 4.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: string
+ outputColumnNames: a, b, c
+ Group By Operator
+ aggregations:
+ expr: avg(c)
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: struct
+ expr: _col4
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: avg(VALUE._col0)
+ expr: count(VALUE._col1)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: double
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 3.8333333333333335 12
+NULL 1 2.0 5
+NULL 2 5.2 5
+NULL 3 5.0 2
+1 NULL 2.6666666666666665 3
+1 1 3.0 2
+1 2 2.0 1
+2 NULL 5.2 5
+2 2 5.333333333333333 3
+2 3 5.0 2
+3 NULL 8.0 1
+3 2 8.0 1
+5 NULL 2.0 1
+5 1 2.0 1
+8 NULL 1.0 2
+8 1 1.0 2
+PREHOOK: query: -- The query below will execute in 2 MR jobs, since hive.new.job.grouping.set.cardinality is set to 2.
+-- The partial aggregation state should be maintained correctly across MR jobs.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The query below will execute in 2 MR jobs, since hive.new.job.grouping.set.cardinality is set to 2.
+-- The partial aggregation state should be maintained correctly across MR jobs.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_TABLE_OR_COL c))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: string
+ outputColumnNames: a, b, c
+ Group By Operator
+ aggregations:
+ expr: avg(c)
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: struct
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: avg(VALUE._col0)
+ expr: count(VALUE._col1)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: struct
+ expr: _col4
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: avg(VALUE._col0)
+ expr: count(VALUE._col1)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: double
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, avg(c), count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 3.8333333333333335 12
+NULL 1 2.0 5
+NULL 2 5.2 5
+NULL 3 5.0 2
+1 NULL 2.6666666666666665 3
+1 1 3.0 2
+1 2 2.0 1
+2 NULL 5.2 5
+2 2 5.333333333333333 3
+2 3 5.0 2
+3 NULL 8.0 1
+3 2 8.0 1
+5 NULL 2.0 1
+5 1 2.0 1
+8 NULL 1.0 2
+8 1 1.0 2
Index: ql/src/test/results/clientpositive/groupby_grouping_sets2.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets2.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets2.q.out (working copy)
@@ -0,0 +1,520 @@
+PREHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, count(*) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, count(*) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 6
+NULL 1 2
+NULL 2 3
+NULL 3 1
+1 NULL 1
+1 1 1
+2 NULL 2
+2 2 1
+2 3 1
+3 NULL 1
+3 2 1
+5 NULL 1
+5 2 1
+8 NULL 1
+8 1 1
+PREHOOK: query: EXPLAIN
+SELECT a, b, sum(c) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT a, b, sum(c) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL c)))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: string
+ outputColumnNames: a, b, c
+ Group By Operator
+ aggregations:
+ expr: sum(c)
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: double
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: double
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: double
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, sum(c) from T1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, sum(c) from T1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 23.0
+NULL 1 4.0
+NULL 2 14.0
+NULL 3 5.0
+1 NULL 3.0
+1 1 3.0
+2 NULL 9.0
+2 2 4.0
+2 3 5.0
+3 NULL 8.0
+3 2 8.0
+5 NULL 2.0
+5 2 2.0
+8 NULL 1.0
+8 1 1.0
+PREHOOK: query: CREATE TABLE T2(a STRING, b STRING, c int, d int)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(a STRING, b STRING, c int, d int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: INSERT OVERWRITE TABLE T2
+SELECT a, b, c, c from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t2
+POSTHOOK: query: INSERT OVERWRITE TABLE T2
+SELECT a, b, c, c from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t2
+POSTHOOK: Lineage: t2.a SIMPLE [(t1)t1.FieldSchema(name:a, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.b SIMPLE [(t1)t1.FieldSchema(name:b, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.c EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.d EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+PREHOOK: query: EXPLAIN
+SELECT a, b, sum(c+d) from T2 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT a, b, sum(c+d) from T2 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t2.a SIMPLE [(t1)t1.FieldSchema(name:a, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.b SIMPLE [(t1)t1.FieldSchema(name:b, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.c EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.d EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION sum (+ (TOK_TABLE_OR_COL c) (TOK_TABLE_OR_COL d))))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ expr: c
+ type: int
+ expr: d
+ type: int
+ outputColumnNames: a, b, c, d
+ Group By Operator
+ aggregations:
+ expr: sum((c + d))
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, sum(c+d) from T2 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, sum(c+d) from T2 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t2.a SIMPLE [(t1)t1.FieldSchema(name:a, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.b SIMPLE [(t1)t1.FieldSchema(name:b, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.c EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.d EXPRESSION [(t1)t1.FieldSchema(name:c, type:string, comment:null), ]
+NULL NULL 46
+NULL 1 8
+NULL 2 28
+NULL 3 10
+1 NULL 6
+1 1 6
+2 NULL 18
+2 2 8
+2 3 10
+3 NULL 16
+3 2 16
+5 NULL 4
+5 2 4
+8 NULL 2
+8 1 2
Index: ql/src/test/results/clientpositive/groupby_grouping_sets5.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_grouping_sets5.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_grouping_sets5.q.out (working copy)
@@ -0,0 +1,433 @@
+PREHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- This tests that cubes and rollups work fine where the source is a sub-query
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This tests that cubes and rollups work fine where the source is a sub-query
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: '0'
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 6
+NULL 1 2
+NULL 2 3
+NULL 3 1
+1 NULL 1
+1 1 1
+2 NULL 2
+2 2 1
+2 3 1
+3 NULL 1
+3 2 1
+5 NULL 1
+5 2 1
+8 NULL 1
+8 1 1
+PREHOOK: query: -- Since 4 grouping sets would be generated for the cube, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since 4 grouping sets would be generated for the cube, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL a)) (TOK_SELEXPR (TOK_TABLE_OR_COL b)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_CUBE_GROUPBY (TOK_TABLE_OR_COL a) (TOK_TABLE_OR_COL b))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-3 depends on stages: Stage-2
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ outputColumnNames: a, b
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: a
+ type: string
+ expr: b
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: '0'
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ sort order: +++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col3
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ expr: KEY._col2
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+NULL NULL 6
+NULL 1 2
+NULL 2 3
+NULL 3 1
+1 NULL 1
+1 1 1
+2 NULL 2
+2 2 1
+2 3 1
+3 NULL 1
+3 2 1
+5 NULL 1
+5 2 1
+8 NULL 1
+8 1 1
Index: ql/src/test/results/clientnegative/groupby_grouping_sets6.q.out
===================================================================
--- ql/src/test/results/clientnegative/groupby_grouping_sets6.q.out (revision 0)
+++ ql/src/test/results/clientnegative/groupby_grouping_sets6.q.out (working copy)
@@ -0,0 +1,6 @@
+PREHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+FAILED: SemanticException [Error 10226]: An additional MR job is introduced since the cardinality of grouping sets is more than hive.new.job.grouping.set.cardinality. This functionality is not supported with distincts. Either set hive.new.job.grouping.set.cardinality to a high number (higher than the number of rows per input row due to grouping sets in the query), or rewrite the query to not use distincts. The number of rows per input row due to grouping sets is 4
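
Illustrative note, not part of the patch: the source groupby_grouping_sets6.q is not shown
in this excerpt, but a query of the following shape would raise Error 10226, since the cube
expands each input row into 4 grouping-set rows while the threshold is set below 4, and the
resulting extra MR job cannot be combined with distinct aggregates:

    set hive.new.job.grouping.set.cardinality = 2;
    -- count(distinct c) plus a below-threshold cardinality triggers Error 10226
    SELECT a, b, count(distinct c) FROM T1 GROUP BY a, b WITH CUBE;
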
Index: ql/src/test/results/clientnegative/groupby_grouping_sets7.q.out
===================================================================
--- ql/src/test/results/clientnegative/groupby_grouping_sets7.q.out (revision 0)
+++ ql/src/test/results/clientnegative/groupby_grouping_sets7.q.out (working copy)
@@ -0,0 +1,6 @@
+PREHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+FAILED: SemanticException [Error 10225]: An additional MR job is introduced since the number of rows created per input row due to grouping sets is more than hive.new.job.grouping.set.cardinality. There is no need to handle skew separately. set hive.groupby.skewindata to false. The number of rows per input row due to grouping sets is 4
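
Illustrative note, not part of the patch (groupby_grouping_sets7.q is likewise not shown):
Error 10225 is raised when a below-threshold cardinality would add an MR job while
hive.groupby.skewindata is enabled, e.g. a sketch such as:

    set hive.groupby.skewindata = true;
    set hive.new.job.grouping.set.cardinality = 2;
    -- skew handling plus the extra grouping-sets job triggers Error 10225
    SELECT a, b, count(1) FROM T1 GROUP BY a, b WITH CUBE;
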
Index: ql/src/test/results/compiler/plan/groupby2.q.xml
===================================================================
--- ql/src/test/results/compiler/plan/groupby2.q.xml (revision 1428149)
+++ ql/src/test/results/compiler/plan/groupby2.q.xml (working copy)
@@ -734,7 +734,7 @@
-
+
0.9
@@ -1739,6 +1739,9 @@
+
+
+
0.9
Index: ql/src/test/results/compiler/plan/groupby4.q.xml
===================================================================
--- ql/src/test/results/compiler/plan/groupby4.q.xml (revision 1428149)
+++ ql/src/test/results/compiler/plan/groupby4.q.xml (working copy)
@@ -449,7 +449,7 @@
-
+
0.9
@@ -1197,6 +1197,9 @@
+
+
+
0.9
Index: ql/src/test/results/compiler/plan/groupby6.q.xml
===================================================================
--- ql/src/test/results/compiler/plan/groupby6.q.xml (revision 1428149)
+++ ql/src/test/results/compiler/plan/groupby6.q.xml (working copy)
@@ -449,7 +449,7 @@
-
+
0.9
@@ -1197,6 +1197,9 @@
+
+
+
0.9
Index: ql/src/test/results/compiler/plan/groupby1.q.xml
===================================================================
--- ql/src/test/results/compiler/plan/groupby1.q.xml (revision 1428149)
+++ ql/src/test/results/compiler/plan/groupby1.q.xml (working copy)
@@ -657,7 +657,7 @@
-
+
0.9
@@ -1498,6 +1498,9 @@
+
+
+
0.9
Index: ql/src/test/results/compiler/plan/groupby3.q.xml
===================================================================
--- ql/src/test/results/compiler/plan/groupby3.q.xml (revision 1428149)
+++ ql/src/test/results/compiler/plan/groupby3.q.xml (working copy)
@@ -923,7 +923,7 @@
-
+
0.9
@@ -2058,6 +2058,9 @@
+
+
+
0.9
Index: ql/src/test/results/compiler/plan/groupby5.q.xml
===================================================================
--- ql/src/test/results/compiler/plan/groupby5.q.xml (revision 1428149)
+++ ql/src/test/results/compiler/plan/groupby5.q.xml (working copy)
@@ -504,7 +504,7 @@
-
+
0.9
@@ -1366,6 +1366,9 @@
+
+
+
0.9
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets3.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets3.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets3.q (working copy)
@@ -0,0 +1,27 @@
+-- In this test, 2 files are loaded into table T1. The data contains rows with the same value of a and b,
+-- with a different number of rows for a and b in each file. Since BucketizedHiveInputFormat is used,
+-- this tests that the aggregate function stores the partial aggregate state correctly even if an
+-- additional MR job is created for processing the grouping sets.
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets1.txt' INTO TABLE T1;
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets2.txt' INTO TABLE T1;
+
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.new.job.grouping.set.cardinality = 30;
+
+-- The query below will execute in a single MR job, since 4 rows are generated per input row
+-- (the cube of a, b leads to (a, b), (a, null), (null, b) and (null, null)), and
+-- hive.new.job.grouping.set.cardinality is more than 4.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+
+set hive.new.job.grouping.set.cardinality=2;
+
+-- The query below will execute in 2 MR jobs, since hive.new.job.grouping.set.cardinality is set to 2.
+-- The partial aggregation state should be maintained correctly across MR jobs.
+EXPLAIN
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+SELECT a, b, avg(c), count(*) from T1 group by a, b with cube;
+
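As background for the comments in the test above: in HiveQL, a cube over (a, b) is shorthand for the four explicit grouping sets, so the queries in this test could equivalently be written with GROUPING SETS. A sketch using the same T1 (the empty set () stands for the grand total):

    EXPLAIN
    SELECT a, b, avg(c), count(*) FROM T1
    GROUP BY a, b GROUPING SETS ((a, b), a, b, ());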
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets4.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets4.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets4.q (working copy)
@@ -0,0 +1,39 @@
+set hive.merge.mapfiles = false;
+set hive.merge.mapredfiles = false;
+-- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1;
+
+-- This tests that cubes and rollups work fine inside sub-queries.
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
+set hive.new.job.grouping.set.cardinality=2;
+
+-- Since 4 grouping sets would be generated for each sub-query, an additional MR job should be created
+-- for each of them.
+EXPLAIN
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
+SELECT * FROM
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
+join
+(SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq2
+on subq1.a = subq2.a;
+
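A point worth noting about the threshold: it is compared against each GROUP BY operator's own grouping-set count, so the two sub-queries above are judged independently. A hypothetical variation, not part of this patch's tests, with the threshold at 3: the cube sub-query (4 grouping sets) would still exceed it and get the additional MR job, while a rollup sub-query would not, since a rollup over (a, b) only produces the 3 sets (a, b), (a) and ():

    set hive.new.job.grouping.set.cardinality=3;

    EXPLAIN
    SELECT * FROM
    (SELECT a, b, count(*) from T1 where a < 3 group by a, b with cube) subq1
    join
    (SELECT a, b, count(*) from T1 where a < 3 group by a, b with rollup) subq2
    on subq1.a = subq2.a;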
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets5.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets5.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets5.q (working copy)
@@ -0,0 +1,25 @@
+set hive.merge.mapfiles = false;
+set hive.merge.mapredfiles = false;
+-- Set merging to false above to make the explain more readable
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1;
+
+-- This tests that cubes and rollups work fine when the source is a sub-query
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
+
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
+
+set hive.new.job.grouping.set.cardinality=2;
+
+-- Since 4 grouping sets would be generated for the cube, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
+
+SELECT a, b, count(*) FROM
+(SELECT a, b, count(1) from T1 group by a, b) subq1 group by a, b with cube;
Index: ql/src/test/queries/clientpositive/groupby_grouping_sets2.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_grouping_sets2.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_grouping_sets2.q (working copy)
@@ -0,0 +1,23 @@
+set hive.new.job.grouping.set.cardinality=2;
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/grouping_sets.txt' INTO TABLE T1;
+
+-- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+EXPLAIN
+SELECT a, b, count(*) from T1 group by a, b with cube;
+SELECT a, b, count(*) from T1 group by a, b with cube;
+
+EXPLAIN
+SELECT a, b, sum(c) from T1 group by a, b with cube;
+SELECT a, b, sum(c) from T1 group by a, b with cube;
+
+CREATE TABLE T2(a STRING, b STRING, c INT, d INT);
+
+INSERT OVERWRITE TABLE T2
+SELECT a, b, c, c from T1;
+
+EXPLAIN
+SELECT a, b, sum(c+d) from T2 group by a, b with cube;
+SELECT a, b, sum(c+d) from T2 group by a, b with cube;
Index: ql/src/test/queries/clientnegative/groupby_grouping_sets6.q
===================================================================
--- ql/src/test/queries/clientnegative/groupby_grouping_sets6.q (revision 0)
+++ ql/src/test/queries/clientnegative/groupby_grouping_sets6.q (working copy)
@@ -0,0 +1,8 @@
+set hive.new.job.grouping.set.cardinality=2;
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+-- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+-- This is not allowed when distinct aggregates are present.
+SELECT a, b, count(distinct c) from T1 group by a, b with cube;
+
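The restriction only bites when the additional MR job would actually be inserted: with the default threshold of 30, the 4 grouping sets of the cube do not exceed it, so no extra job is added and the same distinct query should compile. A sketch of that workaround (an inference from the config semantics, not a test in this patch):

    set hive.new.job.grouping.set.cardinality=30;

    SELECT a, b, count(distinct c) from T1 group by a, b with cube;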
Index: ql/src/test/queries/clientnegative/groupby_grouping_sets7.q
===================================================================
--- ql/src/test/queries/clientnegative/groupby_grouping_sets7.q (revision 0)
+++ ql/src/test/queries/clientnegative/groupby_grouping_sets7.q (working copy)
@@ -0,0 +1,10 @@
+set hive.new.job.grouping.set.cardinality=2;
+set hive.map.aggr=true;
+set hive.groupby.skewindata=true;
+
+CREATE TABLE T1(a STRING, b STRING, c STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
+
+-- Since 4 grouping sets would be generated for the query below, an additional MR job should be created
+-- This is not allowed when map-side aggregation and skewed-data handling are both enabled.
+SELECT a, b, count(1) from T1 group by a, b with cube;
+
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (revision 1428149)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (working copy)
@@ -214,9 +214,6 @@
HiveConf.ConfVars.HIVESENDHEARTBEAT);
countAfterReport = 0;
groupingSetsPresent = conf.isGroupingSetsPresent();
- groupingSets = conf.getListGroupingSets();
- groupingSetsPosition = conf.getGroupingSetPosition();
-
ObjectInspector rowInspector = inputObjInspectors[0];
// init keyFields
@@ -236,6 +233,8 @@
// Initialize the constants for the grouping sets, so that they can be re-used for
// each row
if (groupingSetsPresent) {
+ groupingSets = conf.getListGroupingSets();
+ groupingSetsPosition = conf.getGroupingSetPosition();
newKeysGroupingSets = new ArrayList