Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1384204)
+++ conf/hive-default.xml.template (working copy)
@@ -432,7 +432,17 @@
   job plan. If the multi group by query has common group by keys, it will be
   optimized to generate single M/R job.</description>
 </property>
 
+<property>
+  <name>hive.map.groupby.sorted</name>
+  <value>false</value>
+  <description>If the bucketing/sorting properties of the table match the grouping key, whether to
+  perform the group by in the mapper by using BucketizedHiveInputFormat. The only downside to this
+  is that it limits the number of mappers to the number of files.
+  </description>
+</property>
+
 <property>
   <name>hive.join.emit.interval</name>
   <value>1000</value>
   <description>How many rows in the right-most join operand Hive should buffer before emitting the join result.</description>
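
Illustrative usage, not part of the patch: a minimal HiveQL sketch of how the new flag would be
exercised, mirroring the groupby_sort_1.q test added below. The table definition and data file are
taken from that test; the SET statement is simply the usual session-level way to toggle a conf value.

  -- Illustration only; mirrors the new groupby_sort_1.q test.
  CREATE TABLE T1(key STRING, val STRING)
  CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;

  LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;

  -- With the flag on, the group by on the sorted/bucketed key is expected to run
  -- entirely in the mappers (Group By Operator with mode: final, no reduce phase),
  -- at the cost of limiting the number of mappers to the number of files.
  SET hive.map.groupby.sorted=true;
  SELECT key, count(1) FROM T1 GROUP BY key;
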
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1384204)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -385,6 +385,7 @@
HIVEMAPAGGRMEMORYTHRESHOLD("hive.map.aggr.hash.force.flush.memory.threshold", (float) 0.9),
HIVEMAPAGGRHASHMINREDUCTION("hive.map.aggr.hash.min.reduction", (float) 0.5),
HIVEMULTIGROUPBYSINGLEREDUCER("hive.multigroupby.singlereducer", true),
+ HIVE_MAP_GROUPBY_SORT("hive.map.groupby.sorted", false),
// for hive udtf operator
HIVEUDTFAUTOPROGRESS("hive.udtf.auto.progress", false),
Index: ql/src/test/results/clientpositive/groupby_sort_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_1.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_1.q.out (working copy)
@@ -0,0 +1,3186 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 11 1
+2 12 1
+3 13 1
+7 17 1
+8 18 1
+8 28 1
+PREHOOK: query: -- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+PREHOOK: type: QUERY
+POSTHOOK: query: -- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL k))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY 1 (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: 1
+ type: int
+ expr: key
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 1
+1 2 1
+1 3 1
+1 7 1
+1 8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:int:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 1
+2 1 12 1
+3 1 13 1
+7 1 17 1
+8 1 18 1
+8 1 28 1
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: (key + 1)
+ type: double
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:double:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 2.0 11 1
+2 3.0 12 1
+3 4.0 13 1
+7 8.0 17 1
+8 9.0 18 1
+8 9.0 28 1
+PREHOOK: query: -- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL cnt)))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col2
+ Group By Operator
+ aggregations:
+ expr: sum(_col2)
+ bucketGroup: false
+ keys:
+ expr: (_col0 + _col0)
+ type: double
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: double
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: double
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types double:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+2.0 1
+4.0 1
+6.0 1
+14.0 1
+16.0 2
+PREHOOK: query: -- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery1:subq1-subquery1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ null-subquery2:subq1-subquery2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 11 1
+1 11 1
+2 12 1
+2 12 1
+3 13 1
+3 13 1
+7 17 1
+7 17 1
+8 18 1
+8 18 1
+8 28 1
+8 28 1
+PREHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) key) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-3 is a root stage
+ Stage-2 depends on stages: Stage-3
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery2:subq1-subquery2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: (key + key)
+ type: double
+ expr: val
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: double
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ GatherStats: false
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ null-subquery1:subq1-subquery1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1.0 11 1
+2.0 12 1
+3.0 13 1
+7.0 17 1
+8.0 18 1
+8.0 28 1
+2.0 11 1
+4.0 12 1
+6.0 13 1
+14.0 17 1
+16.0 18 1
+16.0 28 1
+PREHOOK: query: -- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ subq2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Needs Tagging: true
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4,_col5
+ columns.types string:string:bigint:string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 11 1 1 11 1
+2 12 1 2 12 1
+3 13 1 3 13 1
+7 17 1 7 17 1
+8 18 1 8 18 1
+8 18 1 8 28 1
+8 28 1 8 18 1
+8 28 1 8 28 1
+PREHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL val) (TOK_TABLE_OR_COL key)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: val
+ type: string
+ expr: key
+ type: string
+ outputColumnNames: val, key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: val
+ type: string
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Needs Tagging: true
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4,_col5
+ columns.types string:string:bigint:string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t2
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t2
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:int:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 1
+2 1 12 1
+3 1 13 1
+7 1 17 1
+8 1 18 1
+8 1 28 1
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) 2)))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ expr: 2
+ type: int
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string:int:string:int:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 2 1
+2 1 12 2 1
+3 1 13 2 1
+7 1 17 2 1
+8 1 18 2 1
+8 1 28 2 1
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 2)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) (+ (TOK_TABLE_OR_COL key) 2))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ expr: (key + 2)
+ type: double
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: double
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string:int:string:double:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 3.0 1
+2 1 12 4.0 1
+3 1 13 5.0 1
+7 1 17 9.0 1
+8 1 18 10.0 1
+8 1 28 10.0 1
+PREHOOK: query: -- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val) 2)))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: (key + 1)
+ type: double
+ expr: val
+ type: string
+ expr: 2
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ sort order: ++++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col4
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: double
+ expr: KEY._col2
+ type: string
+ expr: KEY._col3
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string:double:string:int:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 2.0 11 2 1
+2 3.0 12 2 1
+3 4.0 13 2 1
+7 8.0 17 2 1
+8 9.0 18 2 1
+8 9.0 28 2 1
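For reviewers, a minimal HiveQL sketch of the scenario the golden file above exercises. It is illustrative only: it assumes the new hive.map.groupby.sorted flag added by this patch is enabled in the session (the .q file itself is not shown here), and the table/query text mirrors the test output above.

    -- Sketch: map-side group by on a bucketed/sorted table
    -- (assumes hive.map.groupby.sorted=true for this session).
    SET hive.map.groupby.sorted=true;

    -- Table bucketed and sorted on the grouping key, as in the test above.
    CREATE TABLE T1(key STRING, val STRING)
    CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;

    LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;

    -- Grouping on the sort/bucket key: the plan in the golden output shows the
    -- aggregation done entirely in the mappers (Group By Operator in mode: final,
    -- no Reduce Operator Tree), whereas grouping on a non-matching key still
    -- produces the usual hash-mode map aggregation plus a reduce stage.
    EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key;
    SELECT key, count(1) FROM T1 GROUP BY key;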
Index: ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out (working copy)
@@ -0,0 +1,3521 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM T1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, val, count(1) FROM T1 GROUP BY key, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 11 1
+2 12 1
+3 13 1
+7 17 1
+8 18 1
+8 28 1
+PREHOOK: query: -- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+PREHOOK: type: QUERY
+POSTHOOK: query: -- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL k))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY 1 (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: 1
+ type: int
+ expr: key
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT 1, key, count(1) FROM T1 GROUP BY 1, key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 1
+1 2 1
+1 3 1
+1 7 1
+1 8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:int:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 1
+2 1 12 1
+3 1 13 1
+7 1 17 1
+8 1 18 1
+8 1 28 1
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the sorted key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: (key + 1)
+ type: double
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:double:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 2.0 11 1
+2 3.0 12 1
+3 4.0 13 1
+7 8.0 17 1
+8 9.0 18 1
+8 9.0 28 1
+PREHOOK: query: -- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL cnt)))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col2
+ Group By Operator
+ aggregations:
+ expr: sum(_col2)
+ bucketGroup: false
+ keys:
+ expr: (_col0 + _col0)
+ type: double
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: rand()
+ type: double
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: double
+ mode: partials
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types double,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: double
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types double,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types double,bigint
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: double
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types double:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+2.0 1
+4.0 1
+6.0 1
+14.0 1
+16.0 2
+PREHOOK: query: -- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery1:subq1-subquery1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ null-subquery2:subq1-subquery2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 11 1
+1 11 1
+2 12 1
+2 12 1
+3 13 1
+3 13 1
+7 17 1
+7 17 1
+8 18 1
+8 18 1
+8 28 1
+8 28 1
+PREHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) key) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-3 is a root stage
+ Stage-4 depends on stages: Stage-3
+ Stage-2 depends on stages: Stage-4
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery2:subq1-subquery2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: (key + key)
+ type: double
+ expr: val
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: rand()
+ type: double
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: double
+ expr: KEY._col1
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: double
+ expr: KEY._col1
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ GatherStats: false
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ null-subquery1:subq1-subquery1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: double
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10003
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types double,string,bigint
+ escape.delim \
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1.0 11 1
+2.0 12 1
+3.0 13 1
+7.0 17 1
+8.0 18 1
+8.0 28 1
+2.0 11 1
+4.0 12 1
+6.0 13 1
+14.0 17 1
+16.0 18 1
+16.0 28 1
+PREHOOK: query: -- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ subq2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Needs Tagging: true
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4,_col5
+ columns.types string:string:bigint:string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 11 1 1 11 1
+2 12 1 2 12 1
+3 13 1 3 13 1
+7 17 1 7 17 1
+8 18 1 8 18 1
+8 18 1 8 28 1
+8 28 1 8 18 1
+8 28 1 8 28 1
+PREHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL val) (TOK_TABLE_OR_COL key)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-3 depends on stages: Stage-2
+ Stage-1 depends on stages: Stage-3
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: val
+ type: string
+ expr: key
+ type: string
+ outputColumnNames: val, key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: val
+ type: string
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: rand()
+ type: double
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: string
+ tag: 1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ subq1:t1
+ TableScan
+ alias: t1
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ Needs Tagging: true
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10003
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types string,string,bigint
+ escape.delim \
+#### A masked pattern was here ####
+ Partition
+ base file name: t1
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t1
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t1 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t1
+ name: default.t1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4,_col5
+ columns.types string:string:bigint:string:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t2
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t2
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: rand()
+ type: double
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: partials
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string,bigint
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, count(1) FROM T2 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1
+2 1
+3 1
+7 1
+8 2
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:int:string:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 1
+2 1 12 1
+3 1 13 1
+7 1 17 1
+8 1 18 1
+8 1 28 1
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) 2)))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ expr: 2
+ type: int
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string:int:string:int:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 2 1
+2 1 12 2 1
+3 1 13 2 1
+7 1 17 2 1
+8 1 18 2 1
+8 1 28 2 1
+PREHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 2)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) (+ (TOK_TABLE_OR_COL key) 2))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: 1
+ type: int
+ expr: val
+ type: string
+ expr: (key + 2)
+ type: double
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ expr: _col2
+ type: string
+ expr: _col3
+ type: double
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string:int:string:double:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 1 11 3.0 1
+2 1 12 4.0 1
+3 1 13 5.0 1
+7 1 17 9.0 1
+8 1 18 10.0 1
+8 1 28 10.0 1
+PREHOOK: query: -- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1) (TOK_TABLE_OR_COL val) 2)))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t2
+ TableScan
+ alias: t2
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: key, val
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ expr: (key + 1)
+ type: double
+ expr: val
+ type: string
+ expr: 2
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ sort order: ++++
+ Map-reduce partition columns:
+ expr: rand()
+ type: double
+ tag: -1
+ value expressions:
+ expr: _col4
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: t2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,val
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.t2
+ numFiles 1
+ numPartitions 0
+ numRows 6
+ rawDataSize 24
+ serialization.ddl struct t2 { string key, string val}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 30
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.t2
+ name: default.t2
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: double
+ expr: KEY._col2
+ type: string
+ expr: KEY._col3
+ type: int
+ mode: partials
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string,double,string,int,bigint
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ sort order: ++++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col4
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string,double,string,int,bigint
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string,double,string,int,bigint
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: double
+ expr: KEY._col2
+ type: string
+ expr: KEY._col3
+ type: int
+ mode: final
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: double
+ expr: _col2
+ type: string
+ expr: _col3
+ type: int
+ expr: _col4
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3,_col4
+ columns.types string:double:string:int:bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 2.0 11 2 1
+2 3.0 12 2 1
+3 4.0 13 2 1
+7 8.0 17 2 1
+8 9.0 18 2 1
+8 9.0 28 2 1
Index: ql/src/test/queries/clientpositive/groupby_sort_1.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_sort_1.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_sort_1.q (working copy)
@@ -0,0 +1,138 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 10;
+set hive.map.groupby.sorted=true;
+
+CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1;
+
+-- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key;
+SELECT key, count(1) FROM T1 GROUP BY key;
+
+-- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+
+-- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+
+-- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+
+-- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+-- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+-- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+-- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+-- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key;
+
+
+CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1;
+
+-- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key;
+SELECT key, count(1) FROM T2 GROUP BY key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+
+-- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;
+SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;
Property changes on: ql/src/test/queries/clientpositive/groupby_sort_1.q
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: ql/src/test/queries/clientpositive/groupby_sort_skew_1.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_sort_skew_1.q (working copy)
@@ -0,0 +1,139 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 10;
+set hive.map.groupby.sorted=true;
+set hive.groupby.skewindata=true;
+
+CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 select key, val from T1;
+
+-- The plan should be converted to a map-side group by if the group by key
+-- matches the skewed key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T1 GROUP BY key;
+SELECT key, count(1) FROM T1 GROUP BY key;
+
+-- The plan should be converted to a map-side group by even if the group by
+-- key is a superset of skewed key
+EXPLAIN EXTENDED SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+SELECT key, val, count(1) FROM T1 GROUP BY key, val;
+
+-- It should work for sub-queries
+EXPLAIN EXTENDED SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
+
+-- It should work for sub-queries with column aliases
+EXPLAIN EXTENDED SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a function followed
+-- by a match to the skewed key
+EXPLAIN EXTENDED SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+SELECT key, key + 1, val, count(1) FROM T1 GROUP BY key, key + 1, val;
+
+-- it should not matter what follows the group by
+-- test various cases
+
+-- group by followed by another group by
+EXPLAIN EXTENDED
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+SELECT key + key, sum(cnt) from
+(SELECT key, val, count(1) as cnt FROM T1 GROUP BY key, val) subq1
+group by key + key;
+
+-- group by followed by a union
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+) subq1;
+
+-- group by followed by a union where one of the sub-queries is map-side group by
+EXPLAIN EXTENDED
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+SELECT * FROM (
+SELECT key, val, count(1) FROM T1 GROUP BY key, val
+ UNION ALL
+SELECT key + key as key, val, count(1) FROM T1 GROUP BY key + key, val
+) subq1;
+
+-- group by followed by a join
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2
+ON subq1.key = subq2.key;
+
+-- group by followed by a join where one of the sub-queries can be performed in the mapper
+EXPLAIN EXTENDED
+SELECT * FROM
+(SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq1
+JOIN
+(SELECT val, key, count(1) FROM T1 GROUP BY val, key) subq2
+ON subq1.key = subq2.key;
+
+
+CREATE TABLE T2(key STRING, val STRING)
+CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T2 select key, val from T1;
+
+-- no mapside sort group by if the group by is a prefix of the sorted key
+EXPLAIN EXTENDED SELECT key, count(1) FROM T2 GROUP BY key;
+SELECT key, count(1) FROM T2 GROUP BY key;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys
+EXPLAIN EXTENDED SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2;
+
+-- The plan should be converted to a map-side group by if the group by key contains a constant in between the
+-- skewed keys followed by anything
+EXPLAIN EXTENDED SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+SELECT key, 1, val, key + 2, count(1) FROM T2 GROUP BY key, 1, val, key + 2;
+
+-- no map-side group by if the group by key contains a function in between the skewed keys
+EXPLAIN EXTENDED SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;
+SELECT key, key + 1, val, 2, count(1) FROM T2 GROUP BY key, key + 1, val, 2;
Property changes on: ql/src/test/queries/clientpositive/groupby_sort_skew_1.q
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupBySortOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupBySortOptimizer.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupBySortOptimizer.java (working copy)
@@ -0,0 +1,395 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * This transformation optimizes the group by. If the grouping key is a superset
+ * of the bucketing and sorting keys of the underlying table, in the same order, the
+ * group by can be performed on the map-side completely.
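+ *
+ * For example, for a table clustered and sorted by 'key' (see groupby_sort_1.q in
+ * this patch), "SELECT key, count(1) FROM T1 GROUP BY key" can produce final
+ * aggregates in the mapper using BucketizedHiveInputFormat, with no reduce-side
+ * group by.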
+ */
+public class GroupBySortOptimizer implements Transform {
+
+ private static final Log LOG = LogFactory.getLog(GroupBySortOptimizer.class
+ .getName());
+
+ public GroupBySortOptimizer() {
+ }
+
+ @Override
+ public ParseContext transform(ParseContext pctx) throws SemanticException {
+
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+ HiveConf conf = pctx.getConf();
+
+ if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
+ // process group-by pattern
+ opRules.put(new RuleRegExp("R1",
+ GroupByOperator.getOperatorName() + "%" +
+ ReduceSinkOperator.getOperatorName() + "%" +
+ GroupByOperator.getOperatorName() + "%"),
+ getMapSortedGroupbyProc(pctx));
+ } else {
+ // If hive.groupby.skewindata is set to true, the operator tree is as below
+ opRules.put(new RuleRegExp("R2",
+ GroupByOperator.getOperatorName() + "%" +
+ ReduceSinkOperator.getOperatorName() + "%" +
+ GroupByOperator.getOperatorName() + "%" +
+ ReduceSinkOperator.getOperatorName() + "%" +
+ GroupByOperator.getOperatorName() + "%"),
+ getMapSortedGroupbySkewProc(pctx));
+ }
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp =
+ new DefaultRuleDispatcher(getDefaultProc(), opRules, new GroupBySortOptimizerContext());
+ GraphWalker ogw = new DefaultGraphWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pctx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+
+ return pctx;
+ }
+
+ private NodeProcessor getDefaultProc() {
+ return new NodeProcessor() {
+ @Override
+ public Object process(Node nd, Stack<Node> stack,
+ NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
+ return null;
+ }
+ };
+ }
+
+ private NodeProcessor getMapSortedGroupbyProc(ParseContext pctx) {
+ return new SortGroupByProcessor(pctx);
+ }
+
+ private NodeProcessor getMapSortedGroupbySkewProc(ParseContext pctx) {
+ return new SortGroupBySkewProcessor(pctx);
+ }
+
+ /**
+ * SortGroupByProcessor.
+ *
+ */
+ public class SortGroupByProcessor implements NodeProcessor {
+
+ protected ParseContext pGraphContext;
+
+ public SortGroupByProcessor(ParseContext pGraphContext) {
+ this.pGraphContext = pGraphContext;
+ }
+
+ // Returns false if the group by operator has already been processed;
+ // otherwise marks it as processed and returns true
+ protected boolean checkGroupByOperatorProcessed(
+ GroupBySortOptimizerContext groupBySortOptimizerContext,
+ GroupByOperator groupByOp) {
+
+ // The group by operator has already been processed
+ if (groupBySortOptimizerContext.getListGroupByOperatorsProcessed().contains(groupByOp)) {
+ return false;
+ }
+
+ groupBySortOptimizerContext.getListGroupByOperatorsProcessed().add(groupByOp);
+ return true;
+ }
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ // GBY,RS,GBY... (top to bottom)
+ GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 3);
+
+ if (checkGroupByOperatorProcessed((GroupBySortOptimizerContext)procCtx, groupByOp) &&
+ (checkSortGroupBy(groupByOp))) {
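+ // Depth 2: remove the ReduceSinkOperator and the reduce-side GroupByOperator
+ // below this map-side group by.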
+ convertGroupByMapSideSortedGroupBy(groupByOp, 2);
+ }
+ return null;
+ }
+
+ // Should this group by be converted to a map-side group by, because the grouping keys
+ // of the base table for the group by match its sorting/bucketing keys?
+ protected boolean checkSortGroupBy(GroupByOperator groupByOp)
+ throws SemanticException {
+
+ // if this is not a HASH groupby, return
+ if (groupByOp.getConf().getMode() != GroupByDesc.Mode.HASH) {
+ return false;
+ }
+
+ // Check all the operators in the stack. Currently, only SELECTs and FILTERs
+ // are allowed. An interface 'supportMapSideGroupBy' has been added for this purpose.
+ Operator<? extends OperatorDesc> currOp = groupByOp;
+ currOp = currOp.getParentOperators().get(0);
+
+ while (true) {
+ if (currOp.getParentOperators() == null) {
+ break;
+ }
+
+ if ((currOp.getParentOperators().size() > 1) ||
+ (!currOp.supportMapSideGroupBy())) {
+ return false;
+ }
+
+ currOp = currOp.getParentOperators().get(0);
+ }
+
+ // currOp now points to the top-most tablescan operator
+ TableScanOperator tableScanOp = (TableScanOperator)currOp;
+
+ // Create a mapping from the group by columns to the table columns
+ Map<String, String> tableColsMapping = new HashMap<String, String>();
+ Table table = pGraphContext.getTopToTable().get(currOp);
+ for (FieldSchema col : table.getAllCols()) {
+ tableColsMapping.put(col.getName(), col.getName());
+ }
+
+ while (currOp != groupByOp) {
+ Operator<? extends OperatorDesc> processOp = currOp;
+ currOp = currOp.getChildOperators().get(0);
+
+ // Filters don't change the column names - so, no need to do anything for them
+ if (processOp instanceof SelectOperator) {
+ SelectOperator selectOp = (SelectOperator)processOp;
+ SelectDesc selectDesc = selectOp.getConf();
+
+ if (selectDesc.isSelStarNoCompute()) {
+ continue;
+ }
+
+ // Only columns and constants can be selected
+ for (int pos = 0; pos < selectDesc.getColList().size(); pos++) {
+ ExprNodeDesc selectColList = selectDesc.getColList().get(pos);
+ if ((selectColList instanceof ExprNodeConstantDesc) ||
+ (selectColList instanceof ExprNodeNullDesc)) {
+ continue;
+ }
+
+ if (selectColList instanceof ExprNodeColumnDesc) {
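+ // Follow the select alias back to the underlying table column, e.g. for
+ // "SELECT key as k ... GROUP BY k" the mapping records k -> key
+ // (see the column alias test in groupby_sort_1.q).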
+ String newValue =
+ tableColsMapping.get(((ExprNodeColumnDesc) selectColList).getColumn());
+ tableColsMapping.put(selectDesc.getOutputColumnNames().get(pos), newValue);
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+
+ boolean sortGroupBy = true;
+ // compute groupby columns from groupby keys
+ List<String> groupByCols = new ArrayList<String>();
+ // If the group by expression is anything other than a list of columns,
+ // the sorting property is not obeyed
+ for (ExprNodeDesc expr : groupByOp.getConf().getKeys()) {
+ if (expr instanceof ExprNodeColumnDesc) {
+ groupByCols.add(tableColsMapping.get(((ExprNodeColumnDesc) expr).getColumn()));
+ }
+ // Constants and nulls are OK
+ else if ((expr instanceof ExprNodeConstantDesc) ||
+ (expr instanceof ExprNodeNullDesc)) {
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ if (groupByCols.size() == 0) {
+ return false;
+ }
+
+ if (!table.isPartitioned()) {
+ List<String> sortCols = Utilities.getColumnNamesFromSortCols(table.getSortCols());
+ if (!matchSortColumns(groupByCols, sortCols)) {
+ return false;
+ }
+ } else {
+ PrunedPartitionList partsList = null;
+ try {
+ partsList = pGraphContext.getOpToPartList().get(tableScanOp);
+ if (partsList == null) {
+ partsList = PartitionPruner.prune(table,
+ pGraphContext.getOpToPartPruner().get(tableScanOp),
+ pGraphContext.getConf(),
+ table.getTableName(),
+ pGraphContext.getPrunedPartitions());
+ pGraphContext.getOpToPartList().put(tableScanOp, partsList);
+ }
+ } catch (HiveException e) {
+ LOG.error(StringUtils.stringifyException(e));
+ throw new SemanticException(e.getMessage(), e);
+ }
+
+ for (Partition part : partsList.getNotDeniedPartns()) {
+ List<String> sortCols = part.getSortColNames();
+ if (!matchSortColumns(groupByCols, sortCols)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Given the group by keys and the sort columns, this method
+ * determines whether a sorted group by can be used.
+ * A map-side sorted group by can be used
+ * if the bucketing and sorting columns are a prefix of the group by columns.
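+ * For example, sort columns (key) with group by columns (key, val) match,
+ * whereas sort columns (key, val) with group by column (key) do not
+ * (see the T2 tests in groupby_sort_1.q).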
+ *
+ * @param groupByCols
+ * @param sortCols
+ * @return
+ * @throws SemanticException
+ */
+ private boolean matchSortColumns(
+ List<String> groupByCols,
+ List<String> sortCols) throws SemanticException {
+
+ if (sortCols == null || sortCols.size() == 0) {
+ return false;
+ }
+
+ if (sortCols.size() <= groupByCols.size()) {
+ int num = sortCols.size();
+ for (int i = 0; i < num; i++) {
+ if (!sortCols.get(i).equals(groupByCols.get(i))) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ return false;
+ }
+
+ // Convert the group by to a map-side group by.
+ // The operators below it, up to the specified depth, are removed from the tree.
+ protected void convertGroupByMapSideSortedGroupBy(GroupByOperator groupByOp, int depth) {
+ Operator<? extends OperatorDesc> currOp = groupByOp;
+ for (int i = 0; i < depth; i++) {
+ if (currOp.getChildOperators().size() > 1) {
+ return;
+ }
+ currOp = currOp.getChildOperators().get(0);
+ }
+
+ groupByOp.setChildOperators(currOp.getChildOperators());
+
+ List<Operator<? extends OperatorDesc>> parentOps =
+ new ArrayList<Operator<? extends OperatorDesc>>();
+ parentOps.add(groupByOp);
+
+ for (Operator<? extends OperatorDesc> op : currOp.getChildOperators()) {
+ op.setParentOperators(parentOps);
+ }
+
+ // Use bucketized hive input format - that makes sure that one mapper reads the entire file
+ groupByOp.setUseBucketizedHiveInputFormat(true);
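+ // The map-side group by now produces final results directly, so switch its
+ // mode from HASH to FINAL.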
+ groupByOp.getConf().setMode(GroupByDesc.Mode.FINAL);
+ }
+ }
+
+ /**
+ * SortGroupBySkewProcessor.
+ *
+ */
+ public class SortGroupBySkewProcessor extends SortGroupByProcessor {
+ public SortGroupBySkewProcessor(ParseContext pGraphContext) {
+ super(pGraphContext);
+ }
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ // GBY,RS,GBY,RS,GBY... (top to bottom)
+ GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 5);
+
+ if (checkGroupByOperatorProcessed((GroupBySortOptimizerContext)procCtx, groupByOp) &&
+ (checkSortGroupBy(groupByOp))) {
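+ // Depth 4: remove the two ReduceSinkOperator/GroupByOperator pairs that
+ // hive.groupby.skewindata generates below this map-side group by.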
+ convertGroupByMapSideSortedGroupBy(groupByOp, 4);
+ }
+ return null;
+ }
+ }
+
+ public class GroupBySortOptimizerContext implements NodeProcessorCtx {
+ List<GroupByOperator> listGroupByOperatorsProcessed;
+
+ public GroupBySortOptimizerContext() {
+ listGroupByOperatorsProcessed = new ArrayList<GroupByOperator>();
+ }
+
+ public List<GroupByOperator> getListGroupByOperatorsProcessed() {
+ return listGroupByOperatorsProcessed;
+ }
+
+ public void setListGroupByOperatorsProcessed(
+ List<GroupByOperator> listGroupByOperatorsProcessed) {
+ this.listGroupByOperatorsProcessed = listGroupByOperatorsProcessed;
+ }
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (working copy)
@@ -285,8 +285,8 @@
bucketMJCxt.setMapJoinBigTableAlias(currMapJoinOp.getConf().getBigTableAlias());
bucketMJCxt.setBucketMatcherClass(org.apache.hadoop.hive.ql.exec.DefaultBucketMatcher.class);
bucketMJCxt.setBigTablePartSpecToFileMapping(
- currMapJoinOp.getConf().getBigTablePartSpecToFileMapping());
- plan.setSmbJoin(currMapJoinOp instanceof SMBMapJoinOperator);
+ currMapJoinOp.getConf().getBigTablePartSpecToFileMapping());
+ plan.setUseBucketizedHiveInputFormat(currMapJoinOp instanceof SMBMapJoinOperator);
}
}
}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (working copy)
@@ -183,9 +183,9 @@
table);
if (topOp == null || (!(topOp instanceof TableScanOperator))) {
// this is in a sub-query.
- // In future, we need to infer subq's columns propery. For example
+ // In future, we need to infer subq's columns property. For example
// "select key, count(1)
- // from (from clustergroupbyselect key, value where ds='210') group by key, 3;",
+ // from (from clustergroupby select key, value where ds='210') group by key, 3;",
// even though the group by op is in a subquery, it can be changed to
// bucket groupby.
return;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy)
@@ -61,6 +61,9 @@
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGROUPBY)) {
transformations.add(new GroupByOptimizer());
}
+ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT)) {
+ transformations.add(new GroupBySortOptimizer());
+ }
transformations.add(new SamplePruner());
transformations.add(new MapJoinProcessor());
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) {
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (working copy)
@@ -321,7 +321,7 @@
inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
}
- if (getWork().isSmbJoin()) {
+ if (getWork().isUseBucketizedHiveInputFormat()) {
inpFormat = BucketizedHiveInputFormat.class.getName();
}
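The two lines above are the only consumer of the renamed flag: the plan-level bit overrides whatever input format the Hadoop shims picked. A hedged, self-contained sketch of that decision (only the BucketizedHiveInputFormat class name is Hive's; the helper and its parameters are illustrative):

public class InputFormatChoiceSketch {
  static String chooseInputFormat(String shimDefault, boolean useBucketizedHiveInputFormat) {
    String inpFormat = shimDefault;              // default supplied by the Hadoop shims
    if (useBucketizedHiveInputFormat) {          // set for SMB joins and, with this patch, sorted group-bys
      inpFormat = "org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat";
    }
    return inpFormat;
  }

  public static void main(String[] args) {
    System.out.println(chooseInputFormat("org.apache.hadoop.hive.ql.io.HiveInputFormat", false));
    System.out.println(chooseInputFormat("org.apache.hadoop.hive.ql.io.HiveInputFormat", true));
  }
}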
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy)
@@ -160,4 +160,9 @@
public OperatorType getType() {
return OperatorType.FILTER;
}
+
+ @Override
+ public boolean supportMapSideGroupBy() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy)
@@ -100,4 +100,9 @@
public OperatorType getType() {
return OperatorType.SELECT;
}
+
+ @Override
+ public boolean supportMapSideGroupBy() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy)
@@ -103,6 +103,8 @@
seqId = 0;
}
+ private boolean useBucketizedHiveInputFormat;
+
public Operator() {
id = String.valueOf(seqId++);
}
@@ -1359,4 +1361,16 @@
return ret;
}
+
+ public boolean supportMapSideGroupBy() {
+ return false;
+ }
+
+ public boolean isUseBucketizedHiveInputFormat() {
+ return useBucketizedHiveInputFormat;
+ }
+
+ public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) {
+ this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat;
+ }
}
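Operator now carries both a default-false supportMapSideGroupBy() hook and a per-operator useBucketizedHiveInputFormat flag; FilterOperator and SelectOperator opt in above. Presumably the new optimizer only rewrites a group by when every operator between the TableScanOperator and the GroupByOperator reports support. A self-contained sketch of that chain check, using simplified stand-in classes rather than Hive's Operator hierarchy:

import java.util.Arrays;
import java.util.List;

// Stand-in operator hierarchy; only the override pattern mirrors the patch.
abstract class Op {
  // Default mirrors Operator.supportMapSideGroupBy(): subclasses opt in explicitly.
  public boolean supportMapSideGroupBy() { return false; }
}
class FilterOp extends Op {
  @Override public boolean supportMapSideGroupBy() { return true; }
}
class SelectOp extends Op {
  @Override public boolean supportMapSideGroupBy() { return true; }
}
class LimitOp extends Op { } // keeps the default: not safe for the rewrite

public class ChainCheckSketch {
  // The conversion is only legal if every operator on the TS -> GBY path opts in.
  static boolean chainSupportsMapSideGroupBy(List<Op> path) {
    for (Op op : path) {
      if (!op.supportMapSideGroupBy()) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    System.out.println(chainSupportsMapSideGroupBy(
        Arrays.<Op>asList(new FilterOp(), new SelectOp()))); // true
    System.out.println(chainSupportsMapSideGroupBy(
        Arrays.<Op>asList(new FilterOp(), new LimitOp())));  // false
  }
}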
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (working copy)
@@ -89,7 +89,7 @@
// used to indicate the input is sorted, and so a BinarySearchRecordReader shoudl be used
private boolean inputFormatSorted = false;
- private transient boolean smbJoin;
+ private transient boolean useBucketizedHiveInputFormat;
public MapredWork() {
aliasToPartnInfo = new LinkedHashMap<String, PartitionDesc>();
@@ -488,11 +488,11 @@
return returnList;
}
- public boolean isSmbJoin() {
- return smbJoin;
+ public boolean isUseBucketizedHiveInputFormat() {
+ return useBucketizedHiveInputFormat;
}
- public void setSmbJoin(boolean smbJoin) {
- this.smbJoin = smbJoin;
+ public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) {
+ this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat;
}
}
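The SemanticAnalyzer hunk below closes the loop: after the plan is split into tasks, it recursively visits every task (including the branches of a ConditionalTask) and each task's operator tree, and flips useBucketizedHiveInputFormat on the enclosing MapredWork as soon as any operator requests it. A minimal sketch of that two-level propagation, with simplified stand-in types rather than Hive's Task/Operator/MapredWork classes:

import java.util.ArrayList;
import java.util.List;

// Simplified stand-ins for the real MapredWork / Operator / Task types.
class WorkSketch { boolean useBucketizedHiveInputFormat; }

class OpSketch {
  boolean useBucketizedHiveInputFormat;
  List<OpSketch> children = new ArrayList<OpSketch>();
}

class TaskSketch {
  WorkSketch work = new WorkSketch();
  List<OpSketch> rootOps = new ArrayList<OpSketch>();
  List<TaskSketch> childTasks = new ArrayList<TaskSketch>();
}

public class PropagateInputFormatSketch {
  // Mirrors setInputFormat(MapredWork, Operator): stop at the first operator that opts in.
  static void setInputFormat(WorkSketch work, OpSketch op) {
    if (op.useBucketizedHiveInputFormat) {
      work.useBucketizedHiveInputFormat = true;
      return;
    }
    for (OpSketch child : op.children) {
      setInputFormat(work, child);
    }
  }

  // Mirrors setInputFormat(Task): visit this task's operators, then recurse into child tasks.
  static void setInputFormat(TaskSketch task) {
    for (OpSketch op : task.rootOps) {
      setInputFormat(task.work, op);
    }
    for (TaskSketch child : task.childTasks) {
      setInputFormat(child);
    }
  }

  public static void main(String[] args) {
    TaskSketch root = new TaskSketch();
    OpSketch ts = new OpSketch();
    OpSketch gby = new OpSketch();
    gby.useBucketizedHiveInputFormat = true; // e.g. set by the sorted group-by rewrite
    ts.children.add(gby);
    root.rootOps.add(ts);
    setInputFormat(root);
    System.out.println(root.work.useBucketizedHiveInputFormat); // true
  }
}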
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1384204)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy)
@@ -70,8 +70,8 @@
import org.apache.hadoop.hive.ql.exec.RecordWriter;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.StatsTask;
-import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
@@ -7203,6 +7203,12 @@
setKeyDescTaskTree(rootTask);
}
+ // If a task contains an operator which instructs that BucketizedHiveInputFormat
+ // be used, set that input format on the task's work
+ for (Task<? extends Serializable> rootTask : rootTasks) {
+ setInputFormat(rootTask);
+ }
+
PhysicalContext physicalContext = new PhysicalContext(conf,
getParseContext(), ctx, rootTasks, fetchTask);
PhysicalOptimizer physicalOptimizer = new PhysicalOptimizer(
@@ -7383,7 +7389,44 @@
}
}
+ private void setInputFormat(MapredWork work, Operator<? extends OperatorDesc> op) {
+ if (op.isUseBucketizedHiveInputFormat()) {
+ work.setUseBucketizedHiveInputFormat(true);
+ return;
+ }
+
+ if (op.getChildOperators() != null) {
+ for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) {
+ setInputFormat(work, childOp);
+ }
+ }
+ }
+
// loop over all the tasks recursviely
+ private void setInputFormat(Task<? extends Serializable> task) {
+ if (task instanceof ExecDriver) {
+ MapredWork work = (MapredWork) task.getWork();
+ HashMap<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
+ if (!opMap.isEmpty()) {
+ for (Operator<? extends OperatorDesc> op : opMap.values()) {
+ setInputFormat(work, op);
+ }
+ }
+ } else if (task instanceof ConditionalTask) {
+ List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
+ for (Task<? extends Serializable> tsk : listTasks) {
+ setInputFormat(tsk);
+ }
+ }
+
+ if (task.getChildTasks() != null) {
+ for (Task<? extends Serializable> childTask : task.getChildTasks()) {
+ setInputFormat(childTask);
+ }
+ }
+ }
+
+ // loop over all the tasks recursively
private void setKeyDescTaskTree(Task<? extends Serializable> task) {
if (task instanceof ExecDriver) {