Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template	(revision 1384204)
+++ conf/hive-default.xml.template	(working copy)
@@ -432,7 +432,17 @@
   job plan. If the multi group by query has common group by keys, it will be
   optimized to generate single M/R job.</description>
 </property>
+<property>
+  <name>hive.map.groupby.sorted</name>
+  <value>false</value>
+  <description>If the bucketing/sorting properties of the table match the grouping key, whether to
+    perform the group by in the mapper by using BucketizedHiveInputFormat. The only downside to this
+    is that it limits the number of mappers to the number of files.
+  </description>
+</property>
+
+
 <property>
   <name>hive.join.emit.interval</name>
   <value>1000</value>
   <description>How many rows in the right-most join operand Hive should buffer before emitting the join result.</description>
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java	(revision 1384204)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java	(working copy)
@@ -385,6 +385,7 @@
     HIVEMAPAGGRMEMORYTHRESHOLD("hive.map.aggr.hash.force.flush.memory.threshold", (float) 0.9),
     HIVEMAPAGGRHASHMINREDUCTION("hive.map.aggr.hash.min.reduction", (float) 0.5),
     HIVEMULTIGROUPBYSINGLEREDUCER("hive.multigroupby.singlereducer", true),
+    HIVE_MAP_GROUPBY_SORT("hive.map.groupby.sorted", false),
 
     // for hive udtf operator
     HIVEUDTFAUTOPROGRESS("hive.udtf.auto.progress", false),
Index: ql/src/test/results/clientpositive/groupby_sort_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_1.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_1.q.out	(working copy)
@@ -0,0 +1,3186 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: true
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  File Output Operator
+                    compressed: false
+                    GlobalTableId: 0
+#### A masked pattern was here ####
+                    NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                        properties:
+                          columns _col0,_col1
+                          columns.types string:bigint
+                          escape.delim \
+                          serialization.format 1
+                    TotalFiles: 1
+                    GatherStats: false
+                    MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1
+2	1
+3	1
+7	1
+8	2
+PREHOOK: query: -- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by even if the group by +-- key is a superset of skewed key +EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A 
masked pattern was here #### +POSTHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 +2 12 1 +3 13 1 +7 17 1 +8 18 1 +8 28 1 +PREHOOK: query: -- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + 
columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +PREHOOK: type: QUERY +POSTHOOK: query: -- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL k)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + 
SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the skewed key +EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the skewed key +EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY 1 (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: 1 + type: int + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: 
+ input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 1 +1 2 1 +1 3 1 +1 7 1 +1 8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the skewed key +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the skewed key +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map 
Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a function followed +-- by a match to the skewed key +EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if 
the group by key contains a function followed +-- by a match to the skewed key +EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: (key + 1) + type: double + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:double:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + 
+ +PREHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2.0 11 1 +2 3.0 12 1 +3 4.0 13 1 +7 8.0 17 1 +8 9.0 18 1 +8 9.0 28 1 +PREHOOK: query: -- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +PREHOOK: type: QUERY +POSTHOOK: query: -- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL cnt)))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col2 + Group By Operator + aggregations: + expr: sum(_col2) + bucketGroup: false + keys: + expr: (_col0 + _col0) + type: double + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: double + sort order: + + Map-reduce partition columns: + expr: _col0 + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val 
+ columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +2.0 1 +4.0 1 +6.0 1 +14.0 1 +16.0 2 +PREHOOK: query: -- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 
(TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:subq1-subquery1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + null-subquery2:subq1-subquery2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + 
numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 +1 11 1 +2 12 1 +2 12 1 +3 13 1 +3 13 1 +7 17 1 +7 17 1 +8 18 1 +8 18 1 +8 28 1 +8 28 1 +PREHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) key) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-2 depends on stages: Stage-3 + Stage-0 is a root stage + 
+STAGE PLANS: + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:subq1-subquery2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: (key + key) + type: double + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: double + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: double + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + null-subquery1:subq1-subquery1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + 
Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1.0 11 1 +2.0 12 1 +3.0 13 1 +7.0 17 1 +8.0 18 1 +8.0 28 1 +2.0 11 1 +4.0 12 1 +6.0 13 1 +14.0 17 1 +16.0 18 1 +16.0 28 1 +PREHOOK: query: -- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + subq2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 
handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4,_col5 + columns.types string:string:bigint:string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 1 11 1 +2 12 1 2 12 1 +3 13 1 3 13 1 +7 17 1 7 17 1 +8 18 1 8 18 1 +8 18 1 8 28 1 +8 28 1 8 18 1 +8 28 1 8 28 1 +PREHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL val) (TOK_TABLE_OR_COL key)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + subq2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: val + type: string + expr: key + type: string + outputColumnNames: val, key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: val + type: string + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: 1 + value expressions: + expr: _col0 + type: 
string + expr: _col1 + type: string + expr: _col2 + type: bigint + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + 
NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4,_col5 + columns.types string:string:bigint:string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t2 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern 
was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE 
[(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, 
count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) 2))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + expr: 2 + type: int + mode: final + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: int + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:int:string:int:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types 
string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 2 1 +2 1 12 2 1 +3 1 13 2 1 +7 1 17 2 1 +8 1 18 2 1 +8 1 28 2 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 2)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) (+ (TOK_TABLE_OR_COL key) 2)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + 
outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + expr: (key + 2) + type: double + mode: final + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:int:string:double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 3.0 1 +2 1 12 4.0 1 +3 1 13 5.0 1 +7 1 17 9.0 1 +8 1 18 10.0 1 +8 1 28 10.0 1 +PREHOOK: query: -- no map-side group by if the group by key contains a function in between the sorted keys +EXPLAIN EXTENDED SELECT key, key + 1, val, 2, 
count(1) FROM T2 GROUP BY key, key + 1, val, 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- no map-side group by if the group by key contains a function in between the sorted keys +EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val) 2))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: (key + 1) + type: double + expr: val + type: string + expr: 2 + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: int + sort order: ++++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: int + tag: -1 + value expressions: + expr: _col4 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + Reduce Operator Tree: + 
Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: double + expr: KEY._col2 + type: string + expr: KEY._col3 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: int + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:double:string:int:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2.0 11 2 1 +2 3.0 12 2 1 +3 4.0 13 2 1 +7 8.0 17 2 1 +8 9.0 18 2 1 +8 9.0 28 2 1 Index: ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (working copy) @@ -0,0 +1,3521 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key +-- matches the sorted key +EXPLAIN EXTENDED SELECT 
key, count(1) FROM T1 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key +-- matches the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A 
masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by even if the group by +-- key is a superset of sorted key +EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by even if the group by +-- key is a superset of sorted key +EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + 
serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 +2 12 1 +3 13 1 +7 17 1 +8 18 1 +8 28 1 +PREHOOK: query: -- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + 
serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +PREHOOK: type: QUERY +POSTHOOK: query: -- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL k)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: 
+ columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY 1 (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + 
expr: count(1) + bucketGroup: true + keys: + expr: 1 + type: int + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 1 +1 2 1 +1 3 1 +1 7 1 +1 8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), 
] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: 
t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a function followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a function followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: (key + 1) + type: double + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:double:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types
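
The plan around this point establishes that expressions over the sorted column do not block the rewrite either: GROUP BY key, key + 1, val on T1 still compiles to a map-only job with mode: final in the mapper. One detail worth noticing in the keys list: key is a STRING, so key + 1 is evaluated under numeric promotion and comes out as a DOUBLE, which is why the result rows below print 2.0, 3.0, and so on. A sketch of the same pattern, with the SET line again assumed from the test setup:

  SET hive.map.groupby.sorted=true;
  EXPLAIN EXTENDED
  SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
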
string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2.0 11 1 +2 3.0 12 1 +3 4.0 13 1 +7 8.0 17 1 +8 9.0 18 1 +8 9.0 28 1 +PREHOOK: query: -- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +PREHOOK: type: QUERY +POSTHOOK: query: -- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL cnt)))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col2 + Group By Operator + aggregations: + expr: sum(_col2) + bucketGroup: false + keys: + expr: (_col0 + _col0) + type: double + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: double + sort order: + + 
Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: double + sort order: + + Map-reduce partition columns: + expr: _col0 + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A 
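
The two-job tail of this plan is the fallback path: the inner GROUP BY key, val lines up with T1's sort order and finishes in the mapper, but the outer GROUP BY key + key aggregates on an expression, so it cannot use the sort order and goes through the ordinary shuffled aggregation (hash aggregation in the map, partitioning on rand(), partial results, then a final merge keyed on the expression; the rand() partitioning is consistent with hive.groupby.skewindata being set in this test, though that is an inference from the plan). The query under test, annotated:

  SELECT key + key, sum(cnt)
  FROM (SELECT key, val, count(1) AS cnt   -- map-side: (key, val) starts with the sort key
        FROM T1 GROUP BY key, val) subq1
  GROUP BY key + key;                      -- shuffled: an expression, not a sorted column
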
masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key + key, sum(cnt) from +(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1 +group by key + key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +2.0 1 +4.0 1 +6.0 1 +14.0 1 +16.0 2 +PREHOOK: query: -- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:subq1-subquery1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked 
pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + null-subquery2:subq1-subquery2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key, val, count(1) FROM T1 GROUP BY key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 +1 11 1 +2 12 1 +2 12 1 +3 13 1 +3 13 1 +7 17 1 +7 17 1 +8 18 1 +8 18 1 +8 28 1 +8 28 1 +PREHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) key) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-4 depends on stages: Stage-3 + Stage-2 depends on stages: Stage-4 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:subq1-subquery2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: (key + key) + type: double + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: double + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### 
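
In the mixed union that follows, only the branch whose grouping key lines up with the sort order collapses into the map side; the GROUP BY key + key, val branch keeps its shuffle (Stage-3 and Stage-4 here), and the two branches are stitched together by the union in a later stage. Schematically:

  SELECT * FROM (
    SELECT key, val, count(1) FROM T1 GROUP BY key, val   -- finished in the mapper
      UNION ALL
    SELECT key + key AS key, val, count(1)
    FROM T1 GROUP BY key + key, val                       -- needs reduce stages
  ) subq1;
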
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + expr: KEY._col1 + type: string + mode: partials + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: double + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: double + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + expr: KEY._col1 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, 
_col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + null-subquery1:subq1-subquery1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10003 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double,string,bigint + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM ( +SELECT key, val, count(1) FROM T1 GROUP BY key, val + UNION ALL +SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1.0 11 1 +2.0 12 1 +3.0 13 1 +7.0 17 1 +8.0 18 1 +8.0 28 1 +2.0 11 1 +4.0 12 1 +6.0 13 1 +14.0 17 1 +16.0 18 1 +16.0 28 1 +PREHOOK: query: -- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + subq2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 
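
Worth calling out in this join plan: both subqueries group on (key, val), both are finished in their mappers (mode: final, no separate aggregation jobs), so the whole query fits in a single map/reduce job. The mappers emit already-aggregated rows tagged 0 and 1 (Needs Tagging: true), and the lone reduce phase only has to perform the inner join:

  SELECT * FROM
    (SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1   -- map-side
  JOIN
    (SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2   -- map-side
  ON subq1.key = subq2.key;   -- the only shuffle is for the join itself
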
handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4,_col5 + columns.types string:string:bigint:string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 1 11 1 +2 12 1 2 12 1 +3 13 1 3 13 1 +7 17 1 7 17 1 +8 18 1 8 18 1 +8 18 1 8 28 1 +8 28 1 8 18 1 +8 28 1 8 28 1 +PREHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1 +JOIN +(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL val) (TOK_TABLE_OR_COL key)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-3 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + subq2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: val + type: string + expr: key + type: string + outputColumnNames: val, key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: val + type: string + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: partials + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern 
was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10003 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was 
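
This second join plan shows the asymmetric case: subq1 groups on (key, val), which matches T1's sort order and is aggregated in its mappers, while subq2 groups on (val, key), whose leading column is not the sort key, so it runs as its own shuffled aggregation first (Stage-2 and Stage-3) and only its materialized output (the $INTNAME intermediate) feeds the join job:

  SELECT * FROM
    (SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1   -- map-side
  JOIN
    (SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2   -- aggregated in earlier stages
  ON subq1.key = subq2.key;
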
here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + expr: _col3 + type: string + expr: _col4 + type: string + expr: _col5 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4,_col5 + columns.types string:string:bigint:string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t2 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- no mapside sort group by if the group by is a 
prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
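
The plan that follows is the negative case: T2 is sorted on (key, val), and GROUP BY key alone is only a prefix of that, so the full map-side conversion is not applied and the query keeps its shuffle stages. Note bucketGroup: true on the mapper's Group By Operator, though: that flag appears to mark the weaker optimization in which the hash aggregation can flush its table as the sorted prefix changes (a reading of the plan, not something this diff states). A sketch of the contrast, with the SET line assumed from the test setup:

  SET hive.map.groupby.sorted=true;
  -- T2 is bucketed and sorted on (key, val) into 2 buckets
  EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key;            -- still two M/R stages
  EXPLAIN EXTENDED SELECT key, val, count(1) FROM T2 GROUP BY key, val;  -- expected map-only (exact match)
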
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX
TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE 
[(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) 2))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: true + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + expr: 2 + type: int + mode: final + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: int + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:int:string:int:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 2 1 +2 1 12 2 1 +3 1 13 2 1 +7 1 17 2 1 +8 1 18 2 1 +8 1 28 2 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 2)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) (+ (TOK_TABLE_OR_COL key) 2)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + expr: (key + 2) + type: double + mode: final + 
outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: double + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:int:string:double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 3.0 1 +2 1 12 4.0 1 +3 1 13 5.0 1 +7 1 17 9.0 1 +8 1 18 10.0 1 +8 1 28 10.0 1 +PREHOOK: query: -- no map-side group by if the group by key contains a function in between the skewed keys +EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- no map-side group by if the group by key contains a function in between the skewed keys +EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) 
FROM T2 GROUP BY key, key + 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val) 2))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: (key + 1) + type: double + expr: val + type: string + expr: 2 + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: int + sort order: ++++ + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col4 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: double + expr: KEY._col2 + type: string + expr: KEY._col3 + type: int + mode: partials + outputColumnNames: _col0, _col1, _col2, 
_col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string,double,string,int,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: int + sort order: ++++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: int + tag: -1 + value expressions: + expr: _col4 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string,double,string,int,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string,double,string,int,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: double + expr: KEY._col2 + type: string + expr: KEY._col3 + type: int + mode: final + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: string + expr: _col3 + type: int + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:double:string:int:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2.0 11 2 1 +2 3.0 12 2 1 +3 4.0 13 2 1 +7 8.0 17 2 1 +8 9.0 18 2 1 
+8 9.0 28 2 1
Index: ql/src/test/queries/clientpositive/groupby_sort_1.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_sort_1.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_sort_1.q (working copy)
@@ -0,0 +1,138 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 10;
+set hive.map.groupby.sorted=true;
+
+CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1;
+
+-- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key;
+SELECT key, count(1) FROM T1 GROUP BY key;
+
+-- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+
+-- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+
+-- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+
+-- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+-- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+-- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+-- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+-- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key;
+
+
+CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1;
+
+-- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key;
+SELECT key, count(1) FROM T2 GROUP BY key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+
+-- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;
+SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;

Property changes on: ql/src/test/queries/clientpositive/groupby_sort_1.q
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: ql/src/test/queries/clientpositive/groupby_sort_skew_1.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (working copy)
@@ -0,0 +1,139 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 10;
+set hive.map.groupby.sorted=true;
+set hive.groupby.skewindata=true;
+
+CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1;
+
+-- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key;
+SELECT key, count(1) FROM T1 GROUP BY key;
+
+-- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+
+-- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+
+-- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+
+-- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+-- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+-- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+  UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+-- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+-- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key;
+
+
+CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1;
+
+-- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key;
+SELECT key, count(1) FROM T2 GROUP BY key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+
+-- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;
+SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;

Property changes on: ql/src/test/queries/clientpositive/groupby_sort_skew_1.q
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupBySortOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupBySortOptimizer.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupBySortOptimizer.java (working copy)
@@ -0,0 +1,395 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * This transformation does group by optimization. If the grouping key is a superset
+ * of the bucketing and sorting keys of the underlying table in the same order, the
+ * group by can be performed on the map-side completely.
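+ *
+ * For example (the case exercised by the accompanying groupby_sort_1.q test),
+ * for a table created as
+ *   CREATE TABLE T1(key STRING, val STRING)
+ *   CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+ * and with hive.map.groupby.sorted set to true, a query such as
+ *   SELECT key, count(1) FROM T1 GROUP BY key
+ * is evaluated entirely in the mapper, using BucketizedHiveInputFormat so that
+ * each mapper reads a complete file.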
+ */
+public class GroupBySortOptimizer implements Transform {
+
+  private static final Log LOG = LogFactory.getLog(GroupBySortOptimizer.class
+      .getName());
+
+  public GroupBySortOptimizer() {
+  }
+
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    HiveConf conf = pctx.getConf();
+
+    if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
+      // process group-by pattern: GBY-RS-GBY
+      opRules.put(new RuleRegExp("R1",
+          GroupByOperator.getOperatorName() + "%" +
+          ReduceSinkOperator.getOperatorName() + "%" +
+          GroupByOperator.getOperatorName() + "%"),
+          getMapSortedGroupbyProc(pctx));
+    } else {
+      // If hive.groupby.skewindata is set to true, the operator tree is GBY-RS-GBY-RS-GBY
+      opRules.put(new RuleRegExp("R2",
+          GroupByOperator.getOperatorName() + "%" +
+          ReduceSinkOperator.getOperatorName() + "%" +
+          GroupByOperator.getOperatorName() + "%" +
+          ReduceSinkOperator.getOperatorName() + "%" +
+          GroupByOperator.getOperatorName() + "%"),
+          getMapSortedGroupbySkewProc(pctx));
+    }
+
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp =
+        new DefaultRuleDispatcher(getDefaultProc(), opRules, new GroupBySortOptimizerContext());
+    GraphWalker ogw = new DefaultGraphWalker(disp);
+
+    // Create a list of topop nodes
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(pctx.getTopOps().values());
+    ogw.startWalking(topNodes, null);
+
+    return pctx;
+  }
+
+  private NodeProcessor getDefaultProc() {
+    return new NodeProcessor() {
+      @Override
+      public Object process(Node nd, Stack<Node> stack,
+          NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
+        return null;
+      }
+    };
+  }
+
+  private NodeProcessor getMapSortedGroupbyProc(ParseContext pctx) {
+    return new SortGroupByProcessor(pctx);
+  }
+
+  private NodeProcessor getMapSortedGroupbySkewProc(ParseContext pctx) {
+    return new SortGroupBySkewProcessor(pctx);
+  }
+
+  /**
+   * SortGroupByProcessor.
+   *
+   */
+  public class SortGroupByProcessor implements NodeProcessor {
+
+    protected ParseContext pGraphContext;
+
+    public SortGroupByProcessor(ParseContext pGraphContext) {
+      this.pGraphContext = pGraphContext;
+    }
+
+    // Check if the group by operator has already been processed;
+    // if not, mark it as processed and return true
+    protected boolean checkGroupByOperatorProcessed(
+        GroupBySortOptimizerContext groupBySortOptimizerContext,
+        GroupByOperator groupByOp) {
+
+      // The group by operator has already been processed
+      if (groupBySortOptimizerContext.getListGroupByOperatorsProcessed().contains(groupByOp)) {
+        return false;
+      }
+
+      groupBySortOptimizerContext.getListGroupByOperatorsProcessed().add(groupByOp);
+      return true;
+    }
+
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+      // GBY,RS,GBY... (top to bottom)
+      GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 3);
+
+      if (checkGroupByOperatorProcessed((GroupBySortOptimizerContext) procCtx, groupByOp) &&
+          (checkSortGroupBy(groupByOp))) {
+        convertGroupByMapSideSortedGroupBy(groupByOp, 2);
+      }
+      return null;
+    }
+
+    // Should this group by be converted to a map-side group by, because the grouping keys
+    // for the base table of the group by match the table's sorting keys?
+    protected boolean checkSortGroupBy(GroupByOperator groupByOp)
+        throws SemanticException {
+
+      // if this is not a HASH groupby, return
+      if (groupByOp.getConf().getMode() != GroupByDesc.Mode.HASH) {
+        return false;
+      }
+
+      // Check all the operators in the stack. Currently, only SELECTs and FILTERs
+      // are allowed. An interface, 'supportMapSideGroupBy', has been added for this purpose
+      Operator<? extends OperatorDesc> currOp = groupByOp;
+      currOp = currOp.getParentOperators().get(0);
+
+      while (true) {
+        if (currOp.getParentOperators() == null) {
+          break;
+        }
+
+        if ((currOp.getParentOperators().size() > 1) ||
+            (!currOp.supportMapSideGroupBy())) {
+          return false;
+        }
+
+        currOp = currOp.getParentOperators().get(0);
+      }
+
+      // currOp now points to the top-most tablescan operator
+      TableScanOperator tableScanOp = (TableScanOperator) currOp;
+
+      // Create a mapping from the group by columns to the table columns
+      Map<String, String> tableColsMapping = new HashMap<String, String>();
+      Table table = pGraphContext.getTopToTable().get(currOp);
+      for (FieldSchema col : table.getAllCols()) {
+        tableColsMapping.put(col.getName(), col.getName());
+      }
+
+      while (currOp != groupByOp) {
+        Operator<? extends OperatorDesc> processOp = currOp;
+        currOp = currOp.getChildOperators().get(0);
+
+        // Filters don't change the column names - so, no need to do anything for them
+        if (processOp instanceof SelectOperator) {
+          SelectOperator selectOp = (SelectOperator) processOp;
+          SelectDesc selectDesc = selectOp.getConf();
+
+          if (selectDesc.isSelStarNoCompute()) {
+            continue;
+          }
+
+          // Only columns and constants can be selected
+          for (int pos = 0; pos < selectDesc.getColList().size(); pos++) {
+            ExprNodeDesc selectCol = selectDesc.getColList().get(pos);
+            if ((selectCol instanceof ExprNodeConstantDesc) ||
+                (selectCol instanceof ExprNodeNullDesc)) {
+              continue;
+            }
+
+            if (selectCol instanceof ExprNodeColumnDesc) {
+              String newValue =
+                  tableColsMapping.get(((ExprNodeColumnDesc) selectCol).getColumn());
+              tableColsMapping.put(selectDesc.getOutputColumnNames().get(pos), newValue);
+            } else {
+              return false;
+            }
+          }
+        }
+      }
+
+      // compute groupby columns from groupby keys
+      List<String> groupByCols = new ArrayList<String>();
+      // If the group by expression is anything other than a list of columns,
+      // the sorting property is not obeyed
+      for (ExprNodeDesc expr : groupByOp.getConf().getKeys()) {
+        if (expr instanceof ExprNodeColumnDesc) {
+          groupByCols.add(tableColsMapping.get(((ExprNodeColumnDesc) expr).getColumn()));
+        }
+        // Constants and nulls are OK
+        else if ((expr instanceof ExprNodeConstantDesc) ||
+            (expr instanceof ExprNodeNullDesc)) {
+          continue;
+        } else {
+          break;
+        }
+      }
+
+      if (groupByCols.size() == 0) {
+        return false;
+      }
+
+      if (!table.isPartitioned()) {
+        List<String> sortCols = Utilities.getColumnNamesFromSortCols(table.getSortCols());
+        if (!matchSortColumns(groupByCols, sortCols)) {
+          return false;
+        }
+      } else {
+        PrunedPartitionList partsList = null;
+        try {
+          partsList = pGraphContext.getOpToPartList().get(tableScanOp);
+          if (partsList == null) {
+            partsList = PartitionPruner.prune(table,
+                pGraphContext.getOpToPartPruner().get(tableScanOp),
+                pGraphContext.getConf(),
+                table.getTableName(),
+                pGraphContext.getPrunedPartitions());
+            pGraphContext.getOpToPartList().put(tableScanOp, partsList);
+          }
+        } catch (HiveException e) {
+          LOG.error(StringUtils.stringifyException(e));
+          throw new SemanticException(e.getMessage(), e);
+        }
+
+        for (Partition part : partsList.getNotDeniedPartns()) {
+          List<String> sortCols = part.getSortColNames();
+          if (!matchSortColumns(groupByCols, sortCols)) {
+            return false;
+          }
+        }
+      }
+
+      return true;
+    }
+
+    /**
+     * Given the group by keys and the table's sort columns, this method
+     * determines whether the map-side sorted group by can be used:
+     * it can be used if the sort columns are a prefix of the group by columns.
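+     *
+     * For example (mirroring the tests above), sort columns (key) with group by
+     * columns (key, val) match, whereas sort columns (key, val) with the single
+     * group by column (key) do not - the "group by is a prefix of the sorted key"
+     * case for table T2.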
+     *
+     * @param groupByCols
+     * @param sortCols
+     * @return
+     * @throws SemanticException
+     */
+    private boolean matchSortColumns(
+        List<String> groupByCols,
+        List<String> sortCols) throws SemanticException {
+
+      if (sortCols == null || sortCols.size() == 0) {
+        return false;
+      }
+
+      if (sortCols.size() <= groupByCols.size()) {
+        int num = sortCols.size();
+        for (int i = 0; i < num; i++) {
+          if (!sortCols.get(i).equals(groupByCols.get(i))) {
+            return false;
+          }
+        }
+
+        return true;
+      }
+
+      return false;
+    }
+
+    // Convert the group by to a map-side group by.
+    // The chain of 'depth' operators below the group by (the reduce sink(s) and
+    // the reduce-side group by(s)) is removed from the tree.
+    protected void convertGroupByMapSideSortedGroupBy(GroupByOperator groupByOp, int depth) {
+      Operator<? extends OperatorDesc> currOp = groupByOp;
+      for (int i = 0; i < depth; i++) {
+        if (currOp.getChildOperators().size() > 1) {
+          return;
+        }
+        currOp = currOp.getChildOperators().get(0);
+      }
+
+      groupByOp.setChildOperators(currOp.getChildOperators());
+
+      List<Operator<? extends OperatorDesc>> parentOps =
+          new ArrayList<Operator<? extends OperatorDesc>>();
+      parentOps.add(groupByOp);
+
+      for (Operator<? extends OperatorDesc> op : currOp.getChildOperators()) {
+        op.setParentOperators(parentOps);
+      }
+
+      // Use bucketized hive input format - that makes sure that one mapper reads the entire file
+      groupByOp.setUseBucketizedHiveInputFormat(true);
+      groupByOp.getConf().setMode(GroupByDesc.Mode.FINAL);
+    }
+  }
+
+  /**
+   * SortGroupBySkewProcessor.
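+   * Same as SortGroupByProcessor, but it matches the five-operator pattern
+   * GBY-RS-GBY-RS-GBY generated when hive.groupby.skewindata is true, and
+   * removes the four operators below the map-side group by.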
+   *
+   */
+  public class SortGroupBySkewProcessor extends SortGroupByProcessor {
+    public SortGroupBySkewProcessor(ParseContext pGraphContext) {
+      super(pGraphContext);
+    }
+
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+      // GBY,RS,GBY,RS,GBY... (top to bottom)
+      GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 5);
+
+      if (checkGroupByOperatorProcessed((GroupBySortOptimizerContext) procCtx, groupByOp) &&
+          (checkSortGroupBy(groupByOp))) {
+        convertGroupByMapSideSortedGroupBy(groupByOp, 4);
+      }
+      return null;
+    }
+  }
+
+  public class GroupBySortOptimizerContext implements NodeProcessorCtx {
+    List<GroupByOperator> listGroupByOperatorsProcessed;
+
+    public GroupBySortOptimizerContext() {
+      listGroupByOperatorsProcessed = new ArrayList<GroupByOperator>();
+    }
+
+    public List<GroupByOperator> getListGroupByOperatorsProcessed() {
+      return listGroupByOperatorsProcessed;
+    }
+
+    public void setListGroupByOperatorsProcessed(
+        List<GroupByOperator> listGroupByOperatorsProcessed) {
+      this.listGroupByOperatorsProcessed = listGroupByOperatorsProcessed;
+    }
+  }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (working copy)
@@ -285,8 +285,8 @@
       bucketMJCxt.setMapJoinBigTableAlias(currMapJoinOp.getConf().getBigTableAlias());
       bucketMJCxt.setBucketMatcherClass(org.apache.hadoop.hive.ql.exec.DefaultBucketMatcher.class);
       bucketMJCxt.setBigTablePartSpecToFileMapping(
-          currMapJoinOp.getConf().getBigTablePartSpecToFileMapping());
-      plan.setSmbJoin(currMapJoinOp instanceof SMBMapJoinOperator);
+          currMapJoinOp.getConf().getBigTablePartSpecToFileMapping());
+      plan.setUseBucketizedHiveInputFormat(currMapJoinOp instanceof SMBMapJoinOperator);
       }
     }
   }
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (working copy)
@@ -183,9 +183,9 @@
         table);
     if (topOp == null || (!(topOp instanceof TableScanOperator))) {
       // this is in a sub-query.
-      // In future, we need to infer subq's columns propery. For example
+      // In future, we need to infer subq's columns property. For example
       // "select key, count(1)
-      // from (from clustergroupbyselect key, value where ds='210') group by key, 3;",
+      // from (from clustergroupby select key, value where ds='210') group by key, 3;",
      // even though the group by op is in a subquery, it can be changed to
      // bucket groupby.
      return;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy)
@@ -61,6 +61,9 @@
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGROUPBY)) {
       transformations.add(new GroupByOptimizer());
     }
+    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT)) {
+      transformations.add(new GroupBySortOptimizer());
+    }
     transformations.add(new SamplePruner());
     transformations.add(new MapJoinProcessor());
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) {
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (working copy)
@@ -321,7 +321,7 @@
       inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
     }
 
-    if (getWork().isSmbJoin()) {
+    if (getWork().isUseBucketizedHiveInputFormat()) {
       inpFormat = BucketizedHiveInputFormat.class.getName();
     }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy)
@@ -160,4 +160,9 @@
   public OperatorType getType() {
     return OperatorType.FILTER;
   }
+
+  @Override
+  public boolean supportMapSideGroupBy() {
+    return true;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy)
@@ -100,4 +100,9 @@
   public OperatorType getType() {
     return OperatorType.SELECT;
   }
+
+  @Override
+  public boolean supportMapSideGroupBy() {
+    return true;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy)
@@ -103,6 +103,8 @@
     seqId = 0;
   }
 
+  private boolean useBucketizedHiveInputFormat;
+
   public Operator() {
     id = String.valueOf(seqId++);
   }
@@ -1359,4 +1361,16 @@
     return ret;
   }
+
+  public boolean supportMapSideGroupBy() {
+    return false;
+  }
+
+  public boolean isUseBucketizedHiveInputFormat() {
+    return useBucketizedHiveInputFormat;
+  }
+
+  public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) {
+    this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (working copy)
@@ -89,7 +89,7 @@
   // used to indicate the input is sorted, and so a BinarySearchRecordReader should be used
   private boolean inputFormatSorted = false;
 
-  private transient boolean smbJoin;
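+  // When set, ExecDriver runs this job with BucketizedHiveInputFormat
+  // (see the ExecDriver change above), so a single mapper reads each file in full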
+  private transient boolean useBucketizedHiveInputFormat;
 
   public MapredWork() {
     aliasToPartnInfo = new LinkedHashMap<String, PartitionDesc>();
@@ -488,11 +488,11 @@
     return returnList;
   }
 
-  public boolean isSmbJoin() {
-    return smbJoin;
+  public boolean isUseBucketizedHiveInputFormat() {
+    return useBucketizedHiveInputFormat;
   }
 
-  public void setSmbJoin(boolean smbJoin) {
-    this.smbJoin = smbJoin;
+  public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) {
+    this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat;
   }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy)
@@ -70,8 +70,8 @@
 import org.apache.hadoop.hive.ql.exec.RecordWriter;
 import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
 import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
 import org.apache.hadoop.hive.ql.exec.StatsTask;
-import org.apache.hadoop.hive.ql.exec.SelectOperator;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
 import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.TaskFactory;
@@ -7203,6 +7203,12 @@
       setKeyDescTaskTree(rootTask);
     }
 
+    // If a task contains an operator which requires BucketizedHiveInputFormat,
+    // set that flag on the task's plan
+    for (Task<? extends Serializable> rootTask : rootTasks) {
+      setInputFormat(rootTask);
+    }
+
     PhysicalContext physicalContext = new PhysicalContext(conf, getParseContext(), ctx,
         rootTasks, fetchTask);
     PhysicalOptimizer physicalOptimizer = new PhysicalOptimizer(
@@ -7383,7 +7389,44 @@
     }
   }
 
+  private void setInputFormat(MapredWork work, Operator<? extends OperatorDesc> op) {
+    if (op.isUseBucketizedHiveInputFormat()) {
+      work.setUseBucketizedHiveInputFormat(true);
+      return;
+    }
+
+    if (op.getChildOperators() != null) {
+      for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) {
+        setInputFormat(work, childOp);
+      }
+    }
+  }
+
+  // loop over all the tasks recursively
+  private void setInputFormat(Task<? extends Serializable> task) {
+    if (task instanceof ExecDriver) {
+      MapredWork work = (MapredWork) task.getWork();
+      HashMap<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
+      if (!opMap.isEmpty()) {
+        for (Operator<? extends OperatorDesc> op : opMap.values()) {
+          setInputFormat(work, op);
+        }
+      }
+    } else if (task instanceof ConditionalTask) {
+      List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
+      for (Task<? extends Serializable> tsk : listTasks) {
+        setInputFormat(tsk);
+      }
+    }
+
+    if (task.getChildTasks() != null) {
+      for (Task<? extends Serializable> childTask : task.getChildTasks()) {
+        setInputFormat(childTask);
+      }
+    }
+  }
+
+  // loop over all the tasks recursively
   private void setKeyDescTaskTree(Task<? extends Serializable> task) {
     if (task instanceof ExecDriver) {