Index: ql/src/test/results/clientpositive/groupby_sort_1.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_sort_1.q.out (revision 1405081) +++ ql/src/test/results/clientpositive/groupby_sort_1.q.out (working copy) @@ -31,14 +31,14 @@ POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM T1 GROUP BY key PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 @@ -234,12 +234,12 @@ POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] -PREHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key +PREHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val PREHOOK: type: QUERY -POSTHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key +POSTHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val @@ -274,7 +274,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -935,13 +935,13 @@ POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key @@ -1231,7 +1231,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -3344,7 +3344,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -3772,7 +3772,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -4018,13 +4018,13 @@ 7 1 8 2 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val @@ -4375,13 +4375,13 @@ POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 @@ -5645,7 +5645,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -6076,7 +6076,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: _col0 type: string Index: ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (revision 1405081) +++ ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (working copy) @@ -31,14 +31,14 @@ POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM T1 GROUP BY key PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 @@ -234,12 +234,12 @@ POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] -PREHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key +PREHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val PREHOOK: type: QUERY -POSTHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key +POSTHOOK: query: -- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val @@ -275,7 +275,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -1004,13 +1004,13 @@ POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key @@ -1301,7 +1301,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -3686,7 +3686,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -4183,7 +4183,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -4493,13 +4493,13 @@ 7 1 8 2 PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val @@ -4850,13 +4850,13 @@ POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 PREHOOK: type: QUERY POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 @@ -6121,7 +6121,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: key type: string @@ -6586,7 +6586,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: false + bucketGroup: true keys: expr: _col0 type: string Index: ql/src/test/results/clientpositive/groupby_sort_2.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_sort_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby_sort_2.q.out (working copy) @@ -0,0 +1,168 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (val) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (val) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: CREATE TABLE outputTbl1(val string, cnt int) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE outputTbl1(val string, cnt int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- The plan should not be converted to a map-side group by even though the group by key +-- matches the sorted key. Both the bucketing and sorting keys should match. +-- addind a order by at the end to make the test results deterministic +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should not be converted to a map-side group by even though the group by key +-- matches the sorted key. Both the bucketing and sorting keys should match. +-- addind a order by at the end to make the test results deterministic +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Select Operator + expressions: + expr: val + type: string + outputColumnNames: val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: SELECT * FROM outputTbl1 ORDER BY val +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM outputTbl1 ORDER BY val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.cnt EXPRESSION [(t1)t1.null, ] +POSTHOOK: Lineage: outputtbl1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +11 1 +12 1 +13 1 +17 1 +18 1 +28 1 Index: ql/src/test/queries/clientpositive/groupby_sort_1.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_1.q (revision 1405081) +++ ql/src/test/queries/clientpositive/groupby_sort_1.q (working copy) @@ -14,7 +14,7 @@ CREATE TABLE outputTbl1(key int, cnt int); -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 @@ -27,7 +27,7 @@ CREATE TABLE outputTbl2(key1 int, key2 string, cnt int); --- no map-side group by even if the group by key is a superset of skewed key +-- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val; @@ -60,7 +60,7 @@ CREATE TABLE outputTbl3(key1 int, key2 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; @@ -188,7 +188,7 @@ SELECT * FROM outputTbl1 ORDER BY key; -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; @@ -201,7 +201,7 @@ CREATE TABLE outputTbl5(key1 int, key2 int, key3 string, key4 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; Index: ql/src/test/queries/clientpositive/groupby_sort_2.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_2.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_sort_2.q (working copy) @@ -0,0 +1,26 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 10; +set hive.map.groupby.sorted=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (val) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +CREATE TABLE outputTbl1(val string, cnt int); + +-- The plan should not be converted to a map-side group by even though the group by key +-- matches the sorted key. Both the bucketing and sorting keys should match. +-- addind a order by at the end to make the test results deterministic +EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val; + +INSERT OVERWRITE TABLE outputTbl1 +SELECT val, count(1) FROM T1 GROUP BY val; + +SELECT * FROM outputTbl1 ORDER BY val; Index: ql/src/test/queries/clientpositive/groupby_sort_skew_1.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (revision 1405081) +++ ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (working copy) @@ -15,7 +15,7 @@ CREATE TABLE outputTbl1(key int, cnt int); -- The plan should be converted to a map-side group by if the group by key --- matches the skewed key +-- matches the sorted key -- addind a order by at the end to make the test results deterministic EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 @@ -28,7 +28,7 @@ CREATE TABLE outputTbl2(key1 int, key2 string, cnt int); --- no map-side group by even if the group by key is a superset of skewed key +-- no map-side group by even if the group by key is a superset of sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val; @@ -61,7 +61,7 @@ CREATE TABLE outputTbl3(key1 int, key2 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant followed --- by a match to the skewed key +-- by a match to the sorted key EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; @@ -189,7 +189,7 @@ SELECT * FROM outputTbl1 ORDER BY key; -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys +-- sorted keys EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; @@ -202,7 +202,7 @@ CREATE TABLE outputTbl5(key1 int, key2 int, key3 string, key4 int, cnt int); -- The plan should be converted to a map-side group by if the group by key contains a constant in between the --- skewed keys followed by anything +-- sorted keys followed by anything EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (revision 1405081) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (working copy) @@ -171,10 +171,8 @@ boolean useMapperSort = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT); - if (useMapperSort) { - if (match == GroupByOptimizerSortMatch.COMPLETE_MATCH) { - convertGroupByMapSideSortedGroupBy(groupByOp, depth); - } + if (useMapperSort && (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) { + convertGroupByMapSideSortedGroupBy(groupByOp, depth); } else if ((match == GroupByOptimizerSortMatch.PARTIAL_MATCH) || (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) { @@ -312,7 +310,8 @@ if (!table.isPartitioned()) { List sortCols = Utilities.getColumnNamesFromSortCols(table.getSortCols()); - return matchSortColumns(groupByCols, sortCols); + List bucketCols = table.getBucketCols(); + return matchBucketSortCols(groupByCols, bucketCols, sortCols); } else { PrunedPartitionList partsList = null; try { @@ -333,7 +332,8 @@ GroupByOptimizerSortMatch currentMatch = GroupByOptimizerSortMatch.COMPLETE_MATCH; for (Partition part : partsList.getNotDeniedPartns()) { List sortCols = part.getSortColNames(); - GroupByOptimizerSortMatch match = matchSortColumns(groupByCols, sortCols); + List bucketCols = part.getBucketCols(); + GroupByOptimizerSortMatch match = matchBucketSortCols(groupByCols, bucketCols, sortCols); if (match == GroupByOptimizerSortMatch.NO_MATCH) { return match; } @@ -353,26 +353,44 @@ * in exactly the same order. * * @param groupByCols + * @param bucketCols * @param sortCols * @return * @throws SemanticException */ - private GroupByOptimizerSortMatch matchSortColumns( + private GroupByOptimizerSortMatch matchBucketSortCols( List groupByCols, + List bucketCols, List sortCols) throws SemanticException { if (sortCols == null || sortCols.size() == 0) { return GroupByOptimizerSortMatch.NO_MATCH; } - int num = sortCols.size() < groupByCols.size() ? sortCols.size() : groupByCols.size(); + // map-side group cannot be performed if the bucketing columns don't match sorting columns + boolean bucketSortColsMatch = true; + int numBucketCols = (bucketCols == null) ? 0 : bucketCols.size(); + int numSortCols = sortCols.size(); + if (numBucketCols == numSortCols) { + for (int pos = 0; pos < numBucketCols; pos++) { + if (!sortCols.get(pos).equals(bucketCols.get(pos))) { + bucketSortColsMatch = false; + break; + } + } + } + else { + bucketSortColsMatch = false; + } + + int num = sortCols.size() < groupByCols.size() ? numSortCols : groupByCols.size(); for (int i = 0; i < num; i++) { if (!sortCols.get(i).equals(groupByCols.get(i))) { return GroupByOptimizerSortMatch.NO_MATCH; } } - return sortCols.size() == groupByCols.size() ? + return (sortCols.size() == groupByCols.size()) && bucketSortColsMatch ? GroupByOptimizerSortMatch.COMPLETE_MATCH : GroupByOptimizerSortMatch.PARTIAL_MATCH; }