Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template	(revision 1387855)
+++ conf/hive-default.xml.template	(working copy)
@@ -453,7 +453,17 @@
   job plan. If the multi group by query has common group by keys, it will be
   optimized to generate single M/R job.</description>
 </property>
 
+
+<property>
+  <name>hive.map.groupby.sorted</name>
+  <value>false</value>
+  <description>If the bucketing/sorting properties of the table exactly match the grouping key, whether to
+    perform the group by in the mapper by using BucketizedHiveInputFormat. The only downside to this
+    is that it limits the number of mappers to the number of files.
+  </description>
+</property>
+
 <property>
   <name>hive.join.emit.interval</name>
   <value>1000</value>
   <description>How many rows in the right-most join operand Hive should buffer before emitting the join result.</description>
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java	(revision 1387855)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java	(working copy)
@@ -385,6 +385,7 @@
     HIVEMAPAGGRMEMORYTHRESHOLD("hive.map.aggr.hash.force.flush.memory.threshold", (float) 0.9),
     HIVEMAPAGGRHASHMINREDUCTION("hive.map.aggr.hash.min.reduction", (float) 0.5),
     HIVEMULTIGROUPBYSINGLEREDUCER("hive.multigroupby.singlereducer", true),
+    HIVE_MAP_GROUPBY_SORT("hive.map.groupby.sorted", false),
 
     // for hive udtf operator
     HIVEUDTFAUTOPROGRESS("hive.udtf.auto.progress", false),
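For context, a minimal HiveQL session that exercises the new flag; the table name sorted_src is illustrative and not part of the patch:

-- Illustrative only: a table whose bucketing/sorting columns match the
-- GROUP BY key, mirroring the T1 setup in the new test below.
CREATE TABLE sorted_src(key STRING, val STRING)
CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;

SET hive.map.groupby.sorted=true;

-- With the flag on, this group by can be completed in the mappers
-- (mode: final, no reduce stage), at the cost of one mapper per file.
EXPLAIN EXTENDED SELECT key, count(1) FROM sorted_src GROUP BY key;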
Index: ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out
===================================================================
--- ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out	(revision 1387855)
+++ ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out	(working copy)
@@ -294,7 +294,7 @@
             Group By Operator
               aggregations:
                     expr: sum(_count_of_l_shipdate)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: l_shipdate
                     type: string
@@ -1136,7 +1136,7 @@
             Group By Operator
               aggregations:
                     expr: sum(_count_of_l_shipdate)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: l_shipdate
                     type: string
@@ -1301,7 +1301,7 @@
             Group By Operator
               aggregations:
                     expr: sum(_count_of_key)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: key
                     type: int
@@ -1384,7 +1384,7 @@
             Group By Operator
               aggregations:
                     expr: sum(_count_of_key)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: key
                     type: int
@@ -3909,7 +3909,7 @@
             Group By Operator
               aggregations:
                     expr: sum(_count_of_key)
-              bucketGroup: false
+              bucketGroup: true
               keys:
                     expr: key
                     type: int
Index: ql/src/test/results/clientpositive/groupby_sort_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_1.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_1.q.out	(working copy)
@@ -0,0 +1,3442 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  File Output Operator
+                    compressed: false
+                    GlobalTableId: 0
+#### A masked pattern was here ####
+                    NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                        properties:
+                          columns _col0,_col1
+                          columns.types string:bigint
+                          escape.delim \
+                          serialization.format 1
+                    TotalFiles: 1
+                    GatherStats: false
+                    MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1
+2	1
+3	1
+7	1
+8	2
+PREHOOK: query: -- no map-side group by even if the group by key is a superset of the sorted key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no map-side group by even if the group by key is a superset of the sorted key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: val
+                    type: string
+              outputColumnNames: key, val
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                      expr: val
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+                  sort order: ++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: string
+                  tag: -1
+                  value expressions:
+                        expr: _col2
+                        type: bigint
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+                expr: KEY._col1
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: string
+                  expr: _col2
+                  type: bigint
+            outputColumnNames: _col0, _col1, _col2
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    columns _col0,_col1,_col2
+                    columns.types string:string:bigint
+                    escape.delim \
+                    serialization.format 1
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	11	1
+2	12	1
+3	13	1
+7	17	1
+8	18	1
+8	28	1
+PREHOOK: query: -- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        subq1:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: _col0
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: string
+                outputColumnNames: _col0
+                Group By Operator
+                  aggregations:
+                        expr: count(1)
+                  bucketGroup: false
+                  keys:
+                        expr: _col0
+                        type: string
+                  mode: final
+                  outputColumnNames: _col0, _col1
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: bigint
+                    outputColumnNames: _col0, _col1
+                    File Output Operator
+                      compressed: false
+                      GlobalTableId: 0
+#### A masked pattern was here ####
+                      NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          properties:
+                            columns _col0,_col1
+                            columns.types string:bigint
+                            escape.delim \
+                            serialization.format 1
+                      TotalFiles: 1
+                      GatherStats: false
+                      MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1
+2	1
+3	1
+7	1
+8	2
+PREHOOK: query: -- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+PREHOOK: type: QUERY
+POSTHOOK: query: -- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL k))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        subq1:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: _col0
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: string
+                outputColumnNames: _col0
+                Group By Operator
+                  aggregations:
+                        expr: count(1)
+                  bucketGroup: false
+                  keys:
+                        expr: _col0
+                        type: string
+                  mode: final
+                  outputColumnNames: _col0, _col1
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: bigint
+                    outputColumnNames: _col0, _col1
+                    File Output Operator
+                      compressed: false
+                      GlobalTableId: 0
+#### A masked pattern was here ####
+                      NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          properties:
+                            columns _col0,_col1
+                            columns.types string:bigint
+                            escape.delim \
+                            serialization.format 1
+                      TotalFiles: 1
+                      GatherStats: false
+                      MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1
+2	1
+3	1
+7	1
+8	2
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY 1 (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: 1
+                      type: int
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: int
+                        expr: _col1
+                        type: string
+                        expr: _col2
+                        type: bigint
+                  outputColumnNames: _col0, _col1, _col2
+                  File Output Operator
+                    compressed: false
+                    GlobalTableId: 0
+#### A masked pattern was here ####
+                    NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                        properties:
+                          columns _col0,_col1,_col2
+                          columns.types int:string:bigint
+                          escape.delim \
+                          serialization.format 1
+                    TotalFiles: 1
+                    GatherStats: false
+                    MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1	1
+1	2	1
+1	3	1
+1	7	1
+1	8	2
+PREHOOK: query: -- no map-side group by if the group by key contains a constant followed by another column
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no map-side group by if the group by key contains a constant followed by another column
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: val
+                    type: string
+              outputColumnNames: key, val
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                      expr: 1
+                      type: int
+                      expr: val
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: int
+                        expr: _col2
+                        type: string
+                  sort order: +++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: int
+                        expr: _col2
+                        type: string
+                  tag: -1
+                  value expressions:
+                        expr: _col3
+                        type: bigint
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+                expr: KEY._col1
+                type: int
+                expr: KEY._col2
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: int
+                  expr: _col2
+                  type: string
+                  expr: _col3
+                  type: bigint
+            outputColumnNames: _col0, _col1, _col2, _col3
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    columns _col0,_col1,_col2,_col3
+                    columns.types string:int:string:bigint
+                    escape.delim \
+                    serialization.format 1
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1	11	1
+2	1	12	1
+3	1	13	1
+7	1	17	1
+8	1	18	1
+8	1	28	1
+PREHOOK: query: -- no map-side group by if the group by key contains a function
+EXPLAIN EXTENDED SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no map-side group by if the group by key contains a function
+EXPLAIN EXTENDED SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                      expr: (key + 1)
+                      type: double
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: double
+                  sort order: ++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: double
+                  tag: -1
+                  value expressions:
+                        expr: _col2
+                        type: bigint
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+                expr: KEY._col1
+                type: double
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: double
+                  expr: _col2
+                  type: bigint
+            outputColumnNames: _col0, _col1, _col2
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    columns _col0,_col1,_col2
+                    columns.types string:double:bigint
+                    escape.delim \
+                    serialization.format 1
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	2.0	1
+2	3.0	1
+3	4.0	1
+7	8.0	1
+8	9.0	2
+PREHOOK: query: -- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1
+group by key + key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1
+group by key + key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL cnt)))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        subq1:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: bigint
+                    outputColumnNames: _col0, _col1
+                    Group By Operator
+                      aggregations:
+                            expr: sum(_col1)
+                      bucketGroup: false
+                      keys:
+                            expr: (_col0 + _col0)
+                            type: double
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Reduce Output Operator
+                        key expressions:
+                              expr: _col0
+                              type: double
+                        sort order: +
+                        Map-reduce partition columns:
+                              expr: _col0
+                              type: double
+                        tag: -1
+                        value expressions:
+                              expr: _col1
+                              type: bigint
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: sum(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: double
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: double
+                  expr: _col1
+                  type: bigint
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    columns _col0,_col1
+                    columns.types double:bigint
+                    escape.delim \
+                    serialization.format 1
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT key + key, sum(cnt) from
+(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1
+group by key + key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key + key, sum(cnt) from
+(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1
+group by key + key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+2.0	1
+4.0	1
+6.0	1
+14.0	1
+16.0	2
+PREHOOK: query: -- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key, count(1) FROM T1 GROUP BY key
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key, count(1) FROM T1 GROUP BY key
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        null-subquery1:subq1-subquery1:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  Union
+                    Select Operator
+                      expressions:
+                            expr: _col0
+                            type: string
+                            expr: _col1
+                            type: bigint
+                      outputColumnNames: _col0, _col1
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 0
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1
+                              columns.types string:bigint
+                              escape.delim \
+                              serialization.format 1
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+        null-subquery2:subq1-subquery2:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  Union
+                    Select Operator
+                      expressions:
+                            expr: _col0
+                            type: string
+                            expr: _col1
+                            type: bigint
+                      outputColumnNames: _col0, _col1
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 0
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1
+                              columns.types string:bigint
+                              escape.delim \
+                              serialization.format 1
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key, count(1) FROM T1 GROUP BY key
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key, count(1) FROM T1 GROUP BY key
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1	1
+1	1
+2	1
+2	1
+3	1
+3	1
+7	1
+7	1
+8	2
+8	2
+PREHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key + key as key, count(1) FROM T1 GROUP BY key + key
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key + key as key, count(1) FROM T1 GROUP BY key + key
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) key) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-3 is a root stage
+  Stage-2 depends on stages: Stage-3
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-3
+    Map Reduce
+      Alias -> Map Operator Tree:
+        null-subquery2:subq1-subquery2:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: (key + key)
+                      type: double
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: double
+                  sort order: +
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: double
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: double
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: double
+                  expr: _col1
+                  type: bigint
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  properties:
+                    columns _col0,_col1
+                    columns.types double,bigint
+                    escape.delim \
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-2
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          TableScan
+            GatherStats: false
+            Union
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: double
+                      expr: _col1
+                      type: bigint
+                outputColumnNames: _col0, _col1
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+#### A masked pattern was here ####
+                  NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      properties:
+                        columns _col0,_col1
+                        columns.types double:bigint
+                        escape.delim \
+                        serialization.format 1
+                  TotalFiles: 1
+                  GatherStats: false
+                  MultiFileSpray: false
+        null-subquery1:subq1-subquery1:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  Union
+                    Select Operator
+                      expressions:
+                            expr: _col0
+                            type: double
+                            expr: _col1
+                            type: bigint
+                      outputColumnNames: _col0, _col1
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 0
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1
+                              columns.types double:bigint
+                              escape.delim \
+                              serialization.format 1
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -mr-10002
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              columns _col0,_col1
+              columns.types double,bigint
+              escape.delim \
+          
+              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+              properties:
+                columns _col0,_col1
+                columns.types double,bigint
+                escape.delim \
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key + key as key, count(1) FROM T1 GROUP BY key + key
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM (
+SELECT key, count(1) FROM T1 GROUP BY key
+  UNION ALL
+SELECT key + key as key, count(1) FROM T1 GROUP BY key + key
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1.0	1
+2.0	1
+3.0	1
+7.0	1
+8.0	2
+2.0	1
+4.0	1
+6.0	1
+14.0	1
+16.0	2
+PREHOOK: query: -- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, count(1) FROM T1 GROUP BY key) subq1
+JOIN
+(SELECT key, count(1) FROM T1 GROUP BY key) subq2
+ON subq1.key = subq2.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, count(1) FROM T1 GROUP BY key) subq1
+JOIN
+(SELECT key, count(1) FROM T1 GROUP BY key) subq2
+ON subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        subq1:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  Reduce Output Operator
+                    key expressions:
+                          expr: _col0
+                          type: string
+                    sort order: +
+                    Map-reduce partition columns:
+                          expr: _col0
+                          type: string
+                    tag: 0
+                    value expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: bigint
+        subq2:t1 
+          TableScan
+            alias: t1
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(1)
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: final
+                outputColumnNames: _col0, _col1
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: string
+                        expr: _col1
+                        type: bigint
+                  outputColumnNames: _col0, _col1
+                  Reduce Output Operator
+                    key expressions:
+                          expr: _col0
+                          type: string
+                    sort order: +
+                    Map-reduce partition columns:
+                          expr: _col0
+                          type: string
+                    tag: 1
+                    value expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: bigint
+      Needs Tagging: true
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: t1
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              SORTBUCKETCOLSPREFIX TRUE
+              bucket_count 2
+              bucket_field_name key
+              columns key,val
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.t1
+              numFiles 1
+              numPartitions 0
+              numRows 6
+              rawDataSize 24
+              serialization.ddl struct t1 { string key, string val}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 30
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,val
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.t1
+                numFiles 1
+                numPartitions 0
+                numRows 6
+                rawDataSize 24
+                serialization.ddl struct t1 { string key, string val}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 30
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.t1
+            name: default.t1
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          condition expressions:
+            0 {VALUE._col0} {VALUE._col1}
+            1 {VALUE._col0} {VALUE._col1}
+          handleSkewJoin: false
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+                  expr: _col2
+                  type: string
+                  expr: _col3
+                  type: bigint
+            outputColumnNames: _col0, _col1, _col2, _col3
+            File Output Operator
compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:bigint:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 1 1 +2 1 2 1 +3 1 3 1 +7 1 7 1 +8 2 8 2 +PREHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + subq2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: 
string + expr: _col1 + type: string + expr: _col2 + type: bigint + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: string + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:bigint:string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t2 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, 
comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### 
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) 2))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + expr: 2 + type: int + mode: final + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: int + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:int:string:int:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + 
serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 2 1 +2 1 12 2 1 +3 1 13 2 1 +7 1 17 2 1 +8 1 18 2 1 +8 1 28 2 1 +PREHOOK: query: -- constants from sub-queries should work fine +EXPLAIN EXTENDED +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +PREHOOK: type: QUERY +POSTHOOK: query: -- constants from sub-queries should work fine +EXPLAIN EXTENDED +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 constant) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL constant) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq:t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + outputColumnNames: _col0, _col1, _col2 + 
Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- multiple levels of constants from 
sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +PREHOOK: type: QUERY +POSTHOOK: query: -- multiple levels of constants from sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 constant) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant) constant2) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2 constant3)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant3)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL constant3) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq:t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: _col0, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col3 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col3, _col2 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + expr: _col3 + type: int + expr: _col2 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input 
format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2 11 1 +2 2 12 1 +3 2 13 1 +7 2 17 1 +8 2 18 1 +8 2 28 1 +PREHOOK: query: CREATE TABLE DEST1(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE DEST1(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@DEST1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: CREATE TABLE DEST2(key INT, val STRING, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE DEST2(key INT, val STRING, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@DEST2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, 
comment:null), ] +PREHOOK: query: EXPLAIN +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: _col1 + type: string + expr: UDFToInteger(_col2) + type: int + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: true + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: true + GlobalTableId: 1 + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-4 + Stats-Aggr Operator + + +PREHOOK: query: FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest1 +PREHOOK: Output: default@dest2 +POSTHOOK: query: FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest1 +POSTHOOK: Output: default@dest2 +POSTHOOK: Lineage: dest1.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest2.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val SIMPLE [(t2)t2.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: select * from DEST1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: query: select * from DEST1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest2.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val SIMPLE [(t2)t2.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: select * from DEST2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: query: select * from DEST2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: Lineage: 
dest1.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest2.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val SIMPLE [(t2)t2.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 +2 12 1 +3 13 1 +7 17 1 +8 18 1 +8 28 1 Index: ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (revision 0) +++ ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (working copy) @@ -0,0 +1,4124 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key +-- matches the skewed key +EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key +-- matches the skewed key +EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: 
string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key +EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val +PREHOOK: type: QUERY +POSTHOOK: query: -- no map-side group by even if the group by key is a superset of skewed key +EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL 
key) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: partials + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 +2 12 1 +3 13 1 +7 17 1 +8 18 1 +8 28 1 +PREHOOK: query: -- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: 
_col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +PREHOOK: type: QUERY +POSTHOOK: query: -- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k)) (TOK_SELEXPR 
(TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL k)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +PREHOOK: 
type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the sorted key +EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY 1 (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: 1 + type: int + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT 
1, key, count(1) FROM T1 GROUP BY 1, key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 1 +1 2 1 +1 3 1 +1 7 1 +1 8 2 +PREHOOK: query: -- no map-side group by if the group by key contains a constant followed by another column +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- no map-side group by if the group by key contains a constant followed by another column +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + sort order: +++ + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col3 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: int + expr: KEY._col2 + type: string + mode: partials + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string,int,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + sort order: +++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + tag: -1 + value expressions: + expr: _col3 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string,int,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string,int,string,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: int + expr: KEY._col2 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- no map-side group by if the group by key 
contains a function +EXPLAIN EXTENDED SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1 +PREHOOK: type: QUERY +POSTHOOK: query: -- no map-side group by if the group by key contains a function +EXPLAIN EXTENDED SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: (key + 1) + type: double + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: double + sort order: ++ + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: double + mode: partials + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,double,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: double + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: double + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,double,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,double,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: double + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: double + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2.0 1 +2 3.0 1 +3 4.0 1 +7 8.0 1 +8 9.0 2 +PREHOOK: query: -- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key +PREHOOK: type: QUERY +POSTHOOK: query: -- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY 
(TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL cnt)))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: (_col0 + _col0) + type: double + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: double + sort order: + + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: double + sort order: + + Map-reduce partition columns: + expr: _col0 + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key + key, sum(cnt) from +(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key + key, sum(cnt) from +(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +2.0 1 +4.0 1 +6.0 1 +14.0 1 +16.0 2 +PREHOOK: query: -- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) 
(TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:subq1-subquery1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + null-subquery2:subq1-subquery2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + 
SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +1 1 +2 1 +2 1 +3 1 +3 1 +7 1 +7 1 +8 2 +8 2 +PREHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) key) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-4 depends on stages: Stage-3 + Stage-2 depends on stages: Stage-4 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:subq1-subquery2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: (key + key) + type: double + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: double + sort order: + + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked 
pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: double + sort order: + + Map-reduce partition columns: + expr: _col0 + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: double + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + null-subquery1:subq1-subquery1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: double + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types double:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10003 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types double,bigint + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + 
columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1.0 1 +2.0 1 +3.0 1 +7.0 1 +8.0 2 +2.0 1 +4.0 1 +6.0 1 +14.0 1 +16.0 2 +PREHOOK: query: -- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + subq2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + 
compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:bigint:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 1 1 +2 1 2 1 +3 1 3 1 +7 1 7 1 +8 2 8 2 +PREHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-3 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + subq2:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: partials + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern 
was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + subq1:t1 + TableScan + alias: t1 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10003 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string,string,bigint + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: t1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string 
key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t1 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t1 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t1 + name: default.t1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: string + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:bigint:string:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t2 +POSTHOOK: query: -- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted 
key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + 
Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 
(TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, 
comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) 2))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + expr: 2 + type: int + mode: final + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: int + expr: _col4 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3,_col4 + columns.types string:int:string:int:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 2 1 +2 1 12 2 1 +3 1 13 2 1 +7 1 17 2 1 +8 1 18 2 1 +8 1 28 2 1 +PREHOOK: query: -- constants from sub-queries should work fine +EXPLAIN EXTENDED +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +PREHOOK: type: QUERY +POSTHOOK: query: -- constants from sub-queries should work fine +EXPLAIN EXTENDED +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 constant) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL constant) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq:t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + expr: val + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + mode: final + 
outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 11 1 +2 1 12 1 +3 1 13 1 +7 1 17 1 +8 1 18 1 +8 1 28 1 +PREHOOK: query: -- multiple levels of constants from sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +PREHOOK: type: QUERY +POSTHOOK: query: -- multiple levels of constants from sub-queries should work
fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 constant) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant) constant2) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2 constant3)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant3)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL constant3) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq:t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: _col0, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col3 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col3, _col2 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + expr: _col3 + type: int + expr: _col2 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + 
numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2 11 1 +2 2 12 1 +3 2 13 1 +7 2 17 1 +8 2 18 1 +8 2 28 1 +PREHOOK: query: -- multiple levels of constants from sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +PREHOOK: type: QUERY +POSTHOOK: query: -- multiple levels of constants from sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 constant) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant) constant2) (TOK_SELEXPR 
(TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2 constant3)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant3)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL constant3) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq:t2 + TableScan + alias: t2 + GatherStats: false + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: _col0, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col3 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col3, _col2 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + expr: _col3 + type: int + expr: _col2 + type: string + mode: final + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: t2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,val + columns.types string:string +#### A masked pattern was here #### + name default.t2 + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 24 + serialization.ddl struct t2 { string key, string val} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 30 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.t2 + name: default.t2 + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select key, constant3, val, count(1) from +( +SELECT key, 
constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 2 11 1 +2 2 12 1 +3 2 13 1 +7 2 17 1 +8 2 18 1 +8 2 28 1 +PREHOOK: query: CREATE TABLE DEST1(key INT, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE DEST1(key INT, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@DEST1 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: CREATE TABLE DEST2(key INT, val STRING, cnt INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE DEST2(key INT, val STRING, cnt INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@DEST2 +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: EXPLAIN +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + Stage-4 depends on stages: 
Stage-0 + Stage-1 depends on stages: Stage-3 + Stage-5 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + t2 + TableScan + alias: t2 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: key, val + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: val + type: string + mode: final + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: _col1 + type: string + expr: UDFToInteger(_col2) + type: int + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: true + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: true + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: true + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-4 + Stats-Aggr Operator + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-5 + Stats-Aggr Operator + + +PREHOOK: query: FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +PREHOOK: Output: default@dest1 +PREHOOK: Output: default@dest2 +POSTHOOK: query: FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@dest1 +POSTHOOK: Output: default@dest2 +POSTHOOK: Lineage: dest1.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest2.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val SIMPLE [(t2)t2.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: select * from DEST1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: query: select * from DEST1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest2.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val SIMPLE [(t2)t2.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: select * from DEST2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: query: select * from DEST2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.cnt EXPRESSION [(t2)t2.null, ] +POSTHOOK: Lineage: dest2.key EXPRESSION [(t2)t2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val SIMPLE [(t2)t2.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), 
] +POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ] +1 11 1 +2 12 1 +3 13 1 +7 17 1 +8 18 1 +8 28 1 Index: ql/src/test/results/clientpositive/bucket_groupby.q.out =================================================================== --- ql/src/test/results/clientpositive/bucket_groupby.q.out (revision 1387855) +++ ql/src/test/results/clientpositive/bucket_groupby.q.out (working copy) @@ -60,7 +60,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: true + bucketGroup: false keys: expr: key type: string @@ -185,7 +185,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: true + bucketGroup: false keys: expr: key type: string @@ -289,7 +289,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: true + bucketGroup: false keys: expr: length(key) type: int @@ -384,7 +384,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: true + bucketGroup: false keys: expr: abs(length(key)) type: int @@ -481,7 +481,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: true + bucketGroup: false keys: expr: key type: string @@ -700,7 +700,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: true + bucketGroup: false keys: expr: key type: string @@ -1102,7 +1102,7 @@ Group By Operator aggregations: expr: count(1) - bucketGroup: true + bucketGroup: false keys: expr: key type: string Index: ql/src/test/results/clientpositive/metadataonly1.q.out =================================================================== --- ql/src/test/results/clientpositive/metadataonly1.q.out (revision 1387855) +++ ql/src/test/results/clientpositive/metadataonly1.q.out (working copy) @@ -30,7 +30,7 @@ Group By Operator aggregations: expr: max(ds) - bucketGroup: false + bucketGroup: true mode: hash outputColumnNames: _col0 Reduce Output Operator Index: ql/src/test/queries/clientpositive/groupby_sort_1.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_1.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_sort_1.q (working copy) @@ -0,0 +1,174 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 10; +set hive.map.groupby.sorted=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +-- The plan should be converted to a map-side group by if the group by key +-- matches the skewed key +EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key; +SELECT key, count(1) FROM T1 GROUP BY key; + +-- no map-side group by even if the group by key is a superset of skewed key +EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val; +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +-- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key; +SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key; + +-- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k; +SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k; + +-- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the skewed key +EXPLAIN 
EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; +SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; + +-- no map-side group by if the group by key contains a constant followed by another column +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val; +SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val; + +-- no map-side group by if the group by key contains a function +EXPLAIN EXTENDED SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1; +SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1; + +-- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key; + +SELECT key + key, sum(cnt) from +(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key; + +-- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1; + +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1; + +-- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1; + +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1; + +-- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key; + +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key; + +-- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key; + + +CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1; + +-- no mapside sort group by if the group by is a prefix of the sorted key +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key; +SELECT key, count(1) FROM T2 GROUP BY key; + +-- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; +SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; + +-- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- skewed keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; +SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; + +-- constants from sub-queries should work fine +EXPLAIN EXTENDED +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val; + +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val; + +-- multiple levels of constants 
from sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val; + +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val; + +set hive.map.aggr=true; +set hive.multigroupby.singlereducer=false; +set mapred.reduce.tasks=31; + +CREATE TABLE DEST1(key INT, cnt INT); +CREATE TABLE DEST2(key INT, val STRING, cnt INT); + +SET hive.exec.compress.intermediate=true; +SET hive.exec.compress.output=true; + +EXPLAIN +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val; + +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val; + +select * from DEST1; +select * from DEST2; Index: ql/src/test/queries/clientpositive/groupby_sort_skew_1.q =================================================================== --- ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (revision 0) +++ ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (working copy) @@ -0,0 +1,191 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 10; +set hive.map.groupby.sorted=true; +set hive.groupby.skewindata=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T1 select key, val from T1; + +-- The plan should be converted to a map-side group by if the group by key +-- matches the skewed key +EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key; +SELECT key, count(1) FROM T1 GROUP BY key; + +-- no map-side group by even if the group by key is a superset of skewed key +EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val; +SELECT key, val, count(1) FROM T1 GROUP BY key, val; + +-- It should work for sub-queries +EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key; +SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key; + +-- It should work for sub-queries with column aliases +EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k; +SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k; + +-- The plan should be converted to a map-side group by if the group by key contains a constant followed +-- by a match to the skewed key +EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; +SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; + +-- no map-side group by if the group by key contains a constant followed by another column +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val; +SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val; + +-- no map-side group by if the group by key contains a function +EXPLAIN EXTENDED SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1; +SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1; + +-- it should not matter what follows the group by +-- test various cases + +-- group by followed by another group by +EXPLAIN EXTENDED +SELECT key + key, sum(cnt) from +(SELECT key, 
count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key; + +SELECT key + key, sum(cnt) from +(SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 +group by key + key; + +-- group by followed by a union +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1; + +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key, count(1) FROM T1 GROUP BY key +) subq1; + +-- group by followed by a union where one of the sub-queries is map-side group by +EXPLAIN EXTENDED +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1; + +SELECT * FROM ( +SELECT key, count(1) FROM T1 GROUP BY key + UNION ALL +SELECT key + key as key, count(1) FROM T1 GROUP BY key + key +) subq1; + +-- group by followed by a join +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key; + +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, count(1) FROM T1 GROUP BY key) subq2 +ON subq1.key = subq2.key; + +-- group by followed by a join where one of the sub-queries can be performed in the mapper +EXPLAIN EXTENDED +SELECT * FROM +(SELECT key, count(1) FROM T1 GROUP BY key) subq1 +JOIN +(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 +ON subq1.key = subq2.key; + + +CREATE TABLE T2(key STRING, val STRING) +CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE; + +-- perform an insert to make sure there are 2 files +INSERT OVERWRITE TABLE T2 select key, val from T1; + +-- no map-side sort group by if the group by key is a prefix of the sorted keys +EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key; +SELECT key, count(1) FROM T2 GROUP BY key; + +-- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys +EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; +SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; + +-- The plan should be converted to a map-side group by if the group by key contains a constant in between the +-- sorted keys followed by anything +EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; +SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; + +-- constants from sub-queries should work fine +EXPLAIN EXTENDED +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val; + +SELECT key, constant, val, count(1) from +(SELECT key, 1 as constant, val from T2)subq +group by key, constant, val; + +-- multiple levels of constants from sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val; + +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val; + +-- multiple levels of constants from sub-queries should work fine +EXPLAIN EXTENDED +select key, constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val; + +select key, 
constant3, val, count(1) from +( +SELECT key, constant as constant2, val, 2 as constant3 from +(SELECT key, 1 as constant, val from T2)subq +)subq2 +group by key, constant3, val; + +set hive.map.aggr=true; +set hive.multigroupby.singlereducer=false; +set mapred.reduce.tasks=31; + +CREATE TABLE DEST1(key INT, cnt INT); +CREATE TABLE DEST2(key INT, val STRING, cnt INT); + +SET hive.exec.compress.intermediate=true; +SET hive.exec.compress.output=true; + +EXPLAIN +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val; + +FROM T2 +INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key +INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val; + +select * from DEST1; +select * from DEST2; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (working copy) @@ -285,8 +285,8 @@ bucketMJCxt.setMapJoinBigTableAlias(currMapJoinOp.getConf().getBigTableAlias()); bucketMJCxt.setBucketMatcherClass(org.apache.hadoop.hive.ql.exec.DefaultBucketMatcher.class); bucketMJCxt.setBigTablePartSpecToFileMapping( - currMapJoinOp.getConf().getBigTablePartSpecToFileMapping()); - plan.setSmbJoin(currMapJoinOp instanceof SMBMapJoinOperator); + currMapJoinOp.getConf().getBigTablePartSpecToFileMapping()); + plan.setUseBucketizedHiveInputFormat(currMapJoinOp instanceof SMBMapJoinOperator); } } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (working copy) @@ -19,8 +19,9 @@ package org.apache.hadoop.hive.ql.optimizer; import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; @@ -28,11 +29,13 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.ql.exec.GroupByOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; -import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; @@ -53,15 +56,16 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import 
org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.util.StringUtils; /** - *this transformation does bucket group by optimization. + * This transformation does group by optimization. If the grouping key is a superset + * of the bucketing and sorting keys of the underlying table in the same order, the + * group by can be performed on the map-side completely. */ public class GroupByOptimizer implements Transform { @@ -75,19 +79,31 @@ public ParseContext transform(ParseContext pctx) throws SemanticException { Map opRules = new LinkedHashMap(); - GroupByOptProcCtx groupByOptimizeCtx = new GroupByOptProcCtx(); + HiveConf conf = pctx.getConf(); - // process group-by pattern - opRules.put(new RuleRegExp("R1", - GroupByOperator.getOperatorName() + "%" - + ReduceSinkOperator.getOperatorName() + "%" - + GroupByOperator.getOperatorName() + "%"), - getMapAggreSortedGroupbyProc(pctx)); + if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEGROUPBYSKEW)) { + // process group-by pattern + opRules.put(new RuleRegExp("R1", + GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + + GroupByOperator.getOperatorName() + "%"), + getMapSortedGroupbyProc(pctx)); + } else { + // If hive.groupby.skewindata is set to true, the operator tree is as below + opRules.put(new RuleRegExp("R2", + GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + + GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + + GroupByOperator.getOperatorName() + "%"), + getMapSortedGroupbySkewProc(pctx)); + } // The dispatcher fires the processor corresponding to the closest matching // rule and passes the context along - Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, - groupByOptimizeCtx); + Dispatcher disp = + new DefaultRuleDispatcher(getDefaultProc(), opRules, + new GroupByOptimizerContext(conf)); GraphWalker ogw = new DefaultGraphWalker(disp); // Create a list of topop nodes @@ -102,212 +118,322 @@ return new NodeProcessor() { @Override public Object process(Node nd, Stack stack, - NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { + NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { return null; } }; } - private NodeProcessor getMapAggreSortedGroupbyProc(ParseContext pctx) { - return new BucketGroupByProcessor(pctx); + private NodeProcessor getMapSortedGroupbyProc(ParseContext pctx) { + return new SortGroupByProcessor(pctx); } + private NodeProcessor getMapSortedGroupbySkewProc(ParseContext pctx) { + return new SortGroupBySkewProcessor(pctx); + } + + public enum GroupByOptimizerSortMatch { + NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH + }; + /** - * BucketGroupByProcessor. + * SortGroupByProcessor. 
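+ * (Editor's note, added for illustration and not part of the original patch: for a table
+ * sorted by (key), GROUP BY key is a COMPLETE_MATCH, GROUP BY key, val is a PARTIAL_MATCH,
+ * and GROUP BY val is a NO_MATCH. Only a COMPLETE_MATCH triggers the full map-side rewrite
+ * when hive.map.groupby.sorted is set; a PARTIAL_MATCH merely sets bucketGroup, i.e.
+ * sort-based rather than hash-based aggregation in the mapper.)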
* */ - public class BucketGroupByProcessor implements NodeProcessor { protected ParseContext pGraphContext; - public BucketGroupByProcessor(ParseContext pGraphContext) { + public class SortGroupByProcessor implements NodeProcessor { protected ParseContext pGraphContext; + public SortGroupByProcessor(ParseContext pGraphContext) { this.pGraphContext = pGraphContext; } + // Check if the group by operator has already been processed + protected boolean checkGroupByOperatorProcessed( + GroupByOptimizerContext groupBySortOptimizerContext, + GroupByOperator groupByOp) { + + // The group by operator has already been processed + if (groupBySortOptimizerContext.getListGroupByOperatorsProcessed().contains(groupByOp)) { + return true; + } + + groupBySortOptimizerContext.getListGroupByOperatorsProcessed().add(groupByOp); + return false; + } + + protected void processGroupBy(GroupByOptimizerContext ctx, + Stack stack, + GroupByOperator groupByOp, + int depth) throws SemanticException { + HiveConf hiveConf = ctx.getConf(); + GroupByOptimizerSortMatch match = checkSortGroupBy(stack, groupByOp); + boolean useMapperSort = + HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT); + + if (useMapperSort) { + if (match == GroupByOptimizerSortMatch.COMPLETE_MATCH) { + convertGroupByMapSideSortedGroupBy(groupByOp, depth); + } + } + else if ((match == GroupByOptimizerSortMatch.PARTIAL_MATCH) || + (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) { + groupByOp.getConf().setBucketGroup(true); + } + } + @Override public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { // GBY,RS,GBY... (top to bottom) - GroupByOperator op = (GroupByOperator) stack.get(stack.size() - 3); - checkBucketGroupBy(op); + GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 3); + + GroupByOptimizerContext ctx = (GroupByOptimizerContext)procCtx; + + if (!checkGroupByOperatorProcessed(ctx, groupByOp)) { + processGroupBy(ctx, stack, groupByOp, 2); + } return null; } - private void checkBucketGroupBy(GroupByOperator curr) - throws SemanticException { + // Should this group by be converted to a map-side group by? It can be if the grouping keys + // match the sorted keys of the group by's base table + protected GroupByOptimizerSortMatch checkSortGroupBy(Stack stack, + GroupByOperator groupByOp) + throws SemanticException { // if this is not a HASH groupby, return - if (curr.getConf().getMode() != GroupByDesc.Mode.HASH) { - return; + if (groupByOp.getConf().getMode() != GroupByDesc.Mode.HASH) { + return GroupByOptimizerSortMatch.NO_MATCH; } - Set tblNames = pGraphContext.getGroupOpToInputTables().get(curr); - if (tblNames == null || tblNames.size() == 0) { - return; + // Check all the operators in the stack. Currently, only SELECTs and FILTERs + // are allowed. 
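+ // (Editor's illustration, assuming only the operators shown in this patch: a chain such as
+ // TS -> FIL -> SEL -> GBY passes this walk, because FilterOperator and SelectOperator
+ // override supportMapSideGroupBy() to return true, while any plan containing, say, a join
+ // fails it, since the Operator base class defaults supportMapSideGroupBy() to false.)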
+ // An interface, supportMapSideGroupBy(), has been added for this purpose. + Operator currOp = groupByOp; + currOp = currOp.getParentOperators().get(0); + + while (true) { + if (currOp.getParentOperators() == null) { + break; + } + + if ((currOp.getParentOperators().size() > 1) || + (!currOp.supportMapSideGroupBy())) { + return GroupByOptimizerSortMatch.NO_MATCH; + } + + currOp = currOp.getParentOperators().get(0); } - boolean bucketGroupBy = true; - GroupByDesc desc = curr.getConf(); - List groupByKeys = new LinkedList(); - groupByKeys.addAll(desc.getKeys()); + // currOp now points to the top-most tablescan operator + TableScanOperator tableScanOp = (TableScanOperator)currOp; + int stackPos = 0; + assert stack.get(0) == tableScanOp; + + // Create a mapping from the group by columns to the table columns + Map tableColsMapping = new HashMap(); + Set constantCols = new HashSet(); + Table table = pGraphContext.getTopToTable().get(currOp); + for (FieldSchema col : table.getAllCols()) { + tableColsMapping.put(col.getName(), col.getName()); + } + + while (currOp != groupByOp) { + Operator processOp = currOp; + Set newConstantCols = new HashSet(); + currOp = (Operator)(stack.get(++stackPos)); + + // Filters don't change the column names - so, no need to do anything for them + if (processOp instanceof SelectOperator) { + SelectOperator selectOp = (SelectOperator)processOp; + SelectDesc selectDesc = selectOp.getConf(); + + if (selectDesc.isSelStarNoCompute()) { + continue; + } + + // Only columns and constants can be selected + for (int pos = 0; pos < selectDesc.getColList().size(); pos++) { + String outputColumnName = selectDesc.getOutputColumnNames().get(pos); + if (constantCols.contains(outputColumnName)) { + tableColsMapping.remove(outputColumnName); + newConstantCols.add(outputColumnName); + continue; + } + + ExprNodeDesc selectColList = selectDesc.getColList().get(pos); + if (selectColList instanceof ExprNodeColumnDesc) { + String newValue = + tableColsMapping.get(((ExprNodeColumnDesc) selectColList).getColumn()); + tableColsMapping.put(outputColumnName, newValue); + } + else { + tableColsMapping.remove(outputColumnName); + if ((selectColList instanceof ExprNodeConstantDesc) || + (selectColList instanceof ExprNodeNullDesc)) { + newConstantCols.add(outputColumnName); + } + } + } + + constantCols = newConstantCols; + } + } + + boolean sortGroupBy = true; // compute groupby columns from groupby keys List groupByCols = new ArrayList(); - while (groupByKeys.size() > 0) { - ExprNodeDesc node = groupByKeys.remove(0); - if (node instanceof ExprNodeColumnDesc) { - groupByCols.addAll(node.getCols()); - } else if ((node instanceof ExprNodeConstantDesc) - || (node instanceof ExprNodeNullDesc)) { - // nothing - } else if (node instanceof ExprNodeFieldDesc) { - groupByKeys.add(0, ((ExprNodeFieldDesc) node).getDesc()); + // If the group by expression is anything other than a list of columns, + // the sorting property is not obeyed + for (ExprNodeDesc expr : groupByOp.getConf().getKeys()) { + if (expr instanceof ExprNodeColumnDesc) { + String groupByKeyColumn = ((ExprNodeColumnDesc)expr).getColumn(); + // ignore if it is a constant + if (constantCols.contains(groupByKeyColumn)) { + continue; + } + else { + if (tableColsMapping.containsKey(groupByKeyColumn)) { + groupByCols.add(tableColsMapping.get(groupByKeyColumn)); + } + else { + return GroupByOptimizerSortMatch.NO_MATCH; + } + } + } + // Constants and nulls are OK + else if ((expr instanceof ExprNodeConstantDesc) || + (expr instanceof ExprNodeNullDesc)) { continue; - 
} else if (node instanceof ExprNodeGenericFuncDesc) { - ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node); - GenericUDF udf = udfNode.getGenericUDF(); - if (!FunctionRegistry.isDeterministic(udf)) { - return; - } - groupByKeys.addAll(0, udfNode.getChildExprs()); } else { - return; + return GroupByOptimizerSortMatch.NO_MATCH; } } - if (groupByCols.size() == 0) { - return; - } + if (!table.isPartitioned()) { + List sortCols = Utilities.getColumnNamesFromSortCols(table.getSortCols()); + return matchSortColumns(groupByCols, sortCols); + } else { + PrunedPartitionList partsList = null; + try { + partsList = pGraphContext.getOpToPartList().get(tableScanOp); + if (partsList == null) { + partsList = PartitionPruner.prune(table, + pGraphContext.getOpToPartPruner().get(tableScanOp), + pGraphContext.getConf(), + table.getTableName(), + pGraphContext.getPrunedPartitions()); + pGraphContext.getOpToPartList().put(tableScanOp, partsList); + } + } catch (HiveException e) { + LOG.error(StringUtils.stringifyException(e)); + throw new SemanticException(e.getMessage(), e); + } - for (String table : tblNames) { - Operator topOp = pGraphContext.getTopOps().get( - table); - if (topOp == null || (!(topOp instanceof TableScanOperator))) { - // this is in a sub-query. - // In future, we need to infer subq's columns propery. For example - // "select key, count(1) - // from (from clustergroupbyselect key, value where ds='210') group by key, 3;", - // even though the group by op is in a subquery, it can be changed to - // bucket groupby. - return; - } - TableScanOperator ts = (TableScanOperator) topOp; - Table destTable = pGraphContext.getTopToTable().get(ts); - if (destTable == null) { - return; - } - if (!destTable.isPartitioned()) { - List bucketCols = destTable.getBucketCols(); - List sortCols = Utilities - .getColumnNamesFromSortCols(destTable.getSortCols()); - bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols, - sortCols); - if (!bucketGroupBy) { - return; + GroupByOptimizerSortMatch currentMatch = GroupByOptimizerSortMatch.COMPLETE_MATCH; + for (Partition part : partsList.getNotDeniedPartns()) { + List sortCols = part.getSortColNames(); + GroupByOptimizerSortMatch match = matchSortColumns(groupByCols, sortCols); + if (match == GroupByOptimizerSortMatch.NO_MATCH) { + return match; } - } else { - PrunedPartitionList partsList = null; - try { - partsList = pGraphContext.getOpToPartList().get(ts); - if (partsList == null) { - partsList = PartitionPruner.prune(destTable, pGraphContext - .getOpToPartPruner().get(ts), pGraphContext.getConf(), table, - pGraphContext.getPrunedPartitions()); - pGraphContext.getOpToPartList().put(ts, partsList); - } - } catch (HiveException e) { - // Has to use full name to make sure it does not conflict with - // org.apache.commons.lang.StringUtils - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - throw new SemanticException(e.getMessage(), e); + + if (match == GroupByOptimizerSortMatch.PARTIAL_MATCH) { + currentMatch = match; } - List parts = new ArrayList(); - parts.addAll(partsList.getConfirmedPartns()); - parts.addAll(partsList.getUnknownPartns()); - for (Partition part : parts) { - List bucketCols = part.getBucketCols(); - List sortCols = part.getSortColNames(); - bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols, - sortCols); - if (!bucketGroupBy) { - return; - } - } } + return currentMatch; } - - curr.getConf().setBucketGroup(bucketGroupBy); } /** - * Given the group by keys, bucket columns, sort column, 
this method + * determines if we can use sorted group by or not. + * We can use map-side sort group by if the group by columns match the sorted columns + * in exactly the same order. * - * We use bucket columns only when the sorted column set is empty and if all - * group by columns are contained in bucket columns. - * - * If we can can not determine by looking at bucketed columns and the table - * has sort columns, we resort to sort columns. We can use bucket group by - * if the groupby column set is an exact prefix match of sort columns. - * * @param groupByCols - * @param bucketCols * @param sortCols * @return * @throws SemanticException */ - private boolean matchBucketOrSortedColumns(List groupByCols, - List bucketCols, List sortCols) throws SemanticException { - boolean ret = false; + private GroupByOptimizerSortMatch matchSortColumns( + List groupByCols, + List sortCols) throws SemanticException { if (sortCols == null || sortCols.size() == 0) { - ret = matchBucketColumns(groupByCols, bucketCols); + return GroupByOptimizerSortMatch.NO_MATCH; } - if (!ret && sortCols != null && sortCols.size() >= groupByCols.size()) { - // check sort columns, if groupByCols is a prefix subset of sort - // columns, we will use sorted group by. For example, if data is sorted - // by column a, b, c, and a query wants to group by b,a, we will use - // sorted group by. But if the query wants to groupby b,c, then sorted - // group by can not be used. - int num = groupByCols.size(); - for (int i = 0; i < num; i++) { - if (sortCols.indexOf(groupByCols.get(i)) > (num - 1)) { - return false; - } + int num = sortCols.size() < groupByCols.size() ? sortCols.size() : groupByCols.size(); + for (int i = 0; i < num; i++) { + if (!sortCols.get(i).equals(groupByCols.get(i))) { + return GroupByOptimizerSortMatch.NO_MATCH; } - return true; } - return ret; + return sortCols.size() == groupByCols.size() ? + GroupByOptimizerSortMatch.COMPLETE_MATCH : GroupByOptimizerSortMatch.PARTIAL_MATCH; } - /* - * All group by columns should be contained in the bucket column set. And - * the number of group by columns should be equal to number of bucket - * columns. - */ - private boolean matchBucketColumns(List grpCols, - List tblBucketCols) throws SemanticException { - - if (tblBucketCols == null || tblBucketCols.size() == 0 - || grpCols.size() == 0 || grpCols.size() != tblBucketCols.size()) { - return false; + // Convert the group by to a map-side group by + // The operators up to the specified depth are removed from the tree. + protected void convertGroupByMapSideSortedGroupBy(GroupByOperator groupByOp, int depth) { + if (groupByOp.removeChildren(depth)) { + // Use bucketized hive input format - that makes sure that one mapper reads the entire file + groupByOp.setUseBucketizedHiveInputFormat(true); + groupByOp.getConf().setMode(GroupByDesc.Mode.FINAL); } - - for (int i = 0; i < grpCols.size(); i++) { - String tblCol = grpCols.get(i); - if (!tblBucketCols.contains(tblCol)) { - return false; - } - } - return true; } } /** - * GroupByOptProcCtx. + * SortGroupBySkewProcessor. * */ - public class GroupByOptProcCtx implements NodeProcessorCtx { + public class SortGroupBySkewProcessor extends SortGroupByProcessor { + public SortGroupBySkewProcessor(ParseContext pGraphContext) { + super(pGraphContext); + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + // GBY,RS,GBY,RS,GBY... 
(top to bottom) + GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 5); + GroupByOptimizerContext ctx = (GroupByOptimizerContext)procCtx; + + if (!checkGroupByOperatorProcessed(ctx, groupByOp)) { + processGroupBy(ctx, stack, groupByOp, 4); + } + return null; + } } + + public class GroupByOptimizerContext implements NodeProcessorCtx { + List listGroupByOperatorsProcessed; + HiveConf conf; + + public GroupByOptimizerContext(HiveConf conf) { + this.conf = conf; + listGroupByOperatorsProcessed = new ArrayList(); + } + + public List getListGroupByOperatorsProcessed() { + return listGroupByOperatorsProcessed; + } + + public void setListGroupByOperatorsProcessed( + List listGroupByOperatorsProcessed) { + this.listGroupByOperatorsProcessed = listGroupByOperatorsProcessed; + } + + public HiveConf getConf() { + return conf; + } + + public void setConf(HiveConf conf) { + this.conf = conf; + } + } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy) @@ -61,7 +61,8 @@ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX)) { transformations.add(new RewriteGBUsingIndex()); } - if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGROUPBY)) { + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGROUPBY) || + HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT)) { transformations.add(new GroupByOptimizer()); } transformations.add(new SamplePruner()); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (working copy) @@ -321,7 +321,7 @@ inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName(); } - if (getWork().isSmbJoin()) { + if (getWork().isUseBucketizedHiveInputFormat()) { inpFormat = BucketizedHiveInputFormat.class.getName(); } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy) @@ -165,4 +165,9 @@ public boolean supportSkewJoinOptimization() { return true; } + + @Override + public boolean supportMapSideGroupBy() { + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy) @@ -105,4 +105,9 @@ public boolean supportSkewJoinOptimization() { return true; } + + @Override + public boolean supportMapSideGroupBy() { + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (working copy) @@ -120,8 +120,6 @@ // Used by hash distinct aggregations when hashGrpKeyNotRedKey is true protected 
transient HashSet keysCurrentGroup; - transient boolean bucketGroup; - transient boolean firstRow; transient long totalMemory; transient boolean hashAggr; @@ -329,9 +327,8 @@ objectInspectors.add(roi); } - bucketGroup = conf.getBucketGroup(); aggregationsParametersLastInvoke = new Object[conf.getAggregators().size()][]; - if (conf.getMode() != GroupByDesc.Mode.HASH || bucketGroup) { + if (conf.getMode() != GroupByDesc.Mode.HASH || conf.getBucketGroup()) { aggregations = newAggregations(); hashAggr = false; } else { @@ -808,7 +805,6 @@ boolean keysAreEqual = (currentKeys != null && newKeys != null)? newKeys.equals(currentKeys) : false; - // Forward the current keys if needed for sort-based aggregation if (currentKeys != null && !keysAreEqual) { forward(currentKeys.getKeyArray(), aggregations); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy) @@ -103,6 +103,8 @@ seqId = 0; } + private boolean useBucketizedHiveInputFormat; + public Operator() { id = String.valueOf(seqId++); } @@ -708,6 +710,31 @@ } } + // Remove the chain of child operators up to the given depth. + // Return true if the removal was successful, false otherwise + public boolean removeChildren(int depth) { + Operator currOp = this; + for (int i = 0; i < depth; i++) { + // If there is more than one child at any level, don't do anything + if ((currOp.getChildOperators() == null) || + (currOp.getChildOperators().size() > 1)) { + return false; + } + currOp = currOp.getChildOperators().get(0); + } + + setChildOperators(currOp.getChildOperators()); + + List> parentOps = + new ArrayList>(); + parentOps.add(this); + + for (Operator op : currOp.getChildOperators()) { + op.setParentOperators(parentOps); + } + return true; + } + /** * Replace one parent with another at the same position. 
Children of the new + parent are not updated @@ -1376,4 +1403,16 @@ return ret; } + + public boolean supportMapSideGroupBy() { + return false; + } + + public boolean isUseBucketizedHiveInputFormat() { + return useBucketizedHiveInputFormat; + } + + public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) { + this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (working copy) @@ -52,6 +52,8 @@ private Mode mode; private boolean groupKeyNotReductionKey; + + // no hash aggregations for group by private boolean bucketGroup; private ArrayList keys; @@ -177,8 +179,8 @@ return bucketGroup; } - public void setBucketGroup(boolean dataSorted) { - bucketGroup = dataSorted; + public void setBucketGroup(boolean bucketGroup) { + this.bucketGroup = bucketGroup; } /** Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (working copy) @@ -89,7 +89,7 @@ // used to indicate the input is sorted, and so a BinarySearchRecordReader should be used private boolean inputFormatSorted = false; - private transient boolean smbJoin; + private transient boolean useBucketizedHiveInputFormat; public MapredWork() { aliasToPartnInfo = new LinkedHashMap(); @@ -488,11 +488,11 @@ return returnList; } - public boolean isSmbJoin() { - return smbJoin; + public boolean isUseBucketizedHiveInputFormat() { + return useBucketizedHiveInputFormat; } - public void setSmbJoin(boolean smbJoin) { - this.smbJoin = smbJoin; + public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) { + this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat; } } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1387855) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -7211,6 +7211,12 @@ setKeyDescTaskTree(rootTask); } + // If a task contains an operator which instructs BucketizedHiveInputFormat + // to be used, do so + for (Task rootTask : rootTasks) { + setInputFormat(rootTask); + } + PhysicalContext physicalContext = new PhysicalContext(conf, getParseContext(), ctx, rootTasks, fetchTask); PhysicalOptimizer physicalOptimizer = new PhysicalOptimizer( @@ -7391,7 +7397,44 @@ } } + private void setInputFormat(MapredWork work, Operator op) { + if (op.isUseBucketizedHiveInputFormat()) { + work.setUseBucketizedHiveInputFormat(true); + return; + } + + if (op.getChildOperators() != null) { + for (Operator childOp : op.getChildOperators()) { + setInputFormat(work, childOp); + } + } + } + // loop over all the tasks recursively + private void setInputFormat(Task task) { + if (task instanceof ExecDriver) { + MapredWork work = (MapredWork) task.getWork(); + HashMap> opMap = work.getAliasToWork(); + if (!opMap.isEmpty()) { + for (Operator op : opMap.values()) { + setInputFormat(work, op); + } + } + } else if (task instanceof ConditionalTask) { + List> listTasks = ((ConditionalTask) 
task).getListTasks(); + for (Task tsk : listTasks) { + setInputFormat(tsk); + } + } + + if (task.getChildTasks() != null) { + for (Task childTask : task.getChildTasks()) { + setInputFormat(childTask); + } + } + } + + // loop over all the tasks recursively private void setKeyDescTaskTree(Task task) { if (task instanceof ExecDriver) {
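+ // (Editor's end-to-end sketch, not part of the original patch: with
+ //   set hive.map.groupby.sorted=true;
+ //   SELECT key, count(1) FROM T1 GROUP BY key;    -- T1 clustered and sorted by key
+ // checkSortGroupBy returns COMPLETE_MATCH, removeChildren() collapses the GBY-RS-GBY
+ // chain, the remaining map-side group by runs in mode FINAL, and ExecDriver switches
+ // the job to BucketizedHiveInputFormat so that each mapper reads an entire file.)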