Index: ql/src/test/results/clientpositive/skewjoinopt.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt.q.out (working copy) @@ -0,0 +1,4412 @@ +PREHOOK: query: CREATE TABLE tst1(key INT, value STRING) SKEWED BY (key) ON ((469),(348),(230)) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tst1(key INT, value STRING) SKEWED BY (key) ON ((469),(348),(230)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tst1 +PREHOOK: query: INSERT OVERWRITE TABLE tst1 SELECT a.key, a.value FROM src a +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tst1 +POSTHOOK: query: INSERT OVERWRITE TABLE tst1 SELECT a.key, a.value FROM src a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tst1 +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: CREATE TABLE tst2(key INT, value STRING) SKEWED BY (key) ON ((469),(348),(230)) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tst2(key INT, value STRING) SKEWED BY (key) ON ((469),(348),(230)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tst2 +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: INSERT OVERWRITE TABLE tst2 SELECT a.key, a.value FROM src a +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tst2 +POSTHOOK: query: INSERT OVERWRITE TABLE tst2 SELECT a.key, a.value FROM src a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tst2 +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: CREATE TABLE skewjoin_tmp_result (value STRING, key INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE skewjoin_tmp_result (value STRING, key INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: CREATE TABLE skewjoin_hash_result_1 (key BIGINT, value BIGINT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE skewjoin_hash_result_1 (key BIGINT, value BIGINT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, 
comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: CREATE TABLE skewjoin_hash_result_2 (key BIGINT, value BIGINT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE skewjoin_hash_result_2 (key BIGINT, value BIGINT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_tmp_result))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL a) key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-9 + Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-9 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery:a + TableScan + alias: a + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: (not ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + subquery:b + TableScan + alias: b + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: (not ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here 
#### + TableScan + GatherStats: false + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: -mr-10004 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct 
skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result +#### A masked pattern was here #### + + Stage: Stage-3 + Stats-Aggr Operator +#### A masked pattern was here #### + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + name: default.skewjoin_tmp_result + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked 
pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + name: default.skewjoin_tmp_result + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230)) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + b + TableScan + alias: b + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230)) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 
+#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +1028 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +PREHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +POSTHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_tmp_result))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL a) key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result +#### A masked pattern was here #### + + Stage: Stage-2 + Stats-Aggr Operator +#### A masked pattern was here #### + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE 
[(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +1028 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +PREHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +POSTHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_hash_result_1 +PREHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_hash_result_1 +POSTHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +0 0 +PREHOOK: query: CREATE TABLE skewjoin_tmp_result2 (value_count INT,key INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE skewjoin_tmp_result2 (value_count INT,key INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@skewjoin_tmp_result2 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE 
[(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_tmp_result2))) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL a) value))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL a) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 
+ Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: count(_col1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types int,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types int,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types int,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: bigint + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value_count,key + columns.types int:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result2 + serialization.ddl struct skewjoin_tmp_result2 { i32 value_count, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result2 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value_count,key + columns.types int:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result2 + serialization.ddl struct skewjoin_tmp_result2 { i32 value_count, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result2 +#### A masked pattern was here #### + + Stage: Stage-3 + Stats-Aggr Operator +#### A masked pattern was here #### + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Output: default@skewjoin_tmp_result2 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Output: default@skewjoin_tmp_result2 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +1028 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value_count)) FROM skewjoin_tmp_result2 +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result2 +PREHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value_count)) FROM skewjoin_tmp_result2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result2 +POSTHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a 
JOIN tst1 b ON a.key = b.key GROUP BY a.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_tmp_result2))) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count (. (TOK_TABLE_OR_COL a) value))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL a) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: count(_col1) + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types int,bigint + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types int,bigint + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types int,bigint + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: bigint + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: UDFToInteger(_col0) + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value_count,key + columns.types int:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result2 + numFiles 1 + numPartitions 0 + numRows 309 + rawDataSize 1492 + serialization.ddl struct skewjoin_tmp_result2 { i32 value_count, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 1801 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result2 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value_count,key + columns.types int:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result2 + numFiles 1 + numPartitions 0 + numRows 309 + rawDataSize 1492 + serialization.ddl struct skewjoin_tmp_result2 { i32 value_count, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 1801 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result2 +#### A masked pattern was here #### + + Stage: Stage-3 + Stats-Aggr Operator +#### A masked pattern was here #### + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Output: default@skewjoin_tmp_result2 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Output: default@skewjoin_tmp_result2 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION 
[(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT count(1) FROM skewjoin_tmp_result2 +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_tmp_result2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result2 +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE 
[(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +309 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value_count)) FROM skewjoin_tmp_result2 +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result2 +PREHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value_count)) FROM skewjoin_tmp_result2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result2 +POSTHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count 
EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_hash_result_1 +PREHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_hash_result_1 +POSTHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, 
comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +0 0 +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +POSTHOOK: type: QUERY +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst2) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL a) value)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key))))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)))))) u)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_tmp_result))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-9 + Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-9 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:u-subquery1:a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + null-subquery1:u-subquery1:b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 +#### A masked pattern was here #### + Partition + base file name: tst2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst2 + name: default.tst2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: -mr-10004 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result +#### A masked pattern was here #### + + Stage: Stage-3 + Stats-Aggr Operator +#### A masked pattern was here #### + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + name: default.skewjoin_tmp_result + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 10968 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11996 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + name: default.skewjoin_tmp_result + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:u-subquery2:a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + null-subquery2:u-subquery2:b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 +#### A masked pattern was here #### + Partition + base file name: tst2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + 
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst2 + name: default.tst2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Input: default@tst2 +PREHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Input: default@tst2 +POSTHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, 
type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count 
EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +2056 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +PREHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +POSTHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +POSTHOOK: type: QUERY +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION 
[(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst2) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key))))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)))))) u)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_tmp_result))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-9 + Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-9 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:u-subquery1:a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + null-subquery1:u-subquery1:b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 +#### A masked pattern was here #### + Partition + base file name: tst2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst2 + name: default.tst2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct 
skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: -mr-10004 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result +#### A masked pattern was here #### + + 
Stage: Stage-3 + Stats-Aggr Operator +#### A masked pattern was here #### + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + name: default.skewjoin_tmp_result + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: 
+#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns value,key + columns.types string:int +#### A masked pattern was here #### + name default.skewjoin_tmp_result + numFiles 1 + numPartitions 0 + numRows 2056 + rawDataSize 21936 + serialization.ddl struct skewjoin_tmp_result { string value, i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 23992 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_tmp_result + name: default.skewjoin_tmp_result + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:u-subquery2:a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + expr: value + type: string + null-subquery2:u-subquery2:b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: 
default.tst1 + name: default.tst1 +#### A masked pattern was here #### + Partition + base file name: tst2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst2 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst2 + name: default.tst2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 + handleSkewJoin: false + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string,int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Input: default@tst2 +PREHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Input: default@tst2 +POSTHOOK: Output: default@skewjoin_tmp_result +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION 
[(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +2056 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +PREHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), 
sum(hash(value)) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +POSTHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count 
EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_hash_result_1 +PREHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_hash_result_1 +POSTHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE 
[(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +0 0 +PREHOOK: query: CREATE TABLE skewjoin_add_result (key INT) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE skewjoin_add_result (key INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@skewjoin_add_result +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE 
[(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: 
skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_add_result))) (TOK_SELECT (TOK_SELEXPR (+ (. (TOK_TABLE_OR_COL b) key) (. 
(TOK_TABLE_OR_COL a) key)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-9 + Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-9 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery:a + TableScan + alias: a + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: (not ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + subquery:b + TableScan + alias: b + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: (not ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col0, _col4 + Select Operator + expressions: + expr: (_col4 + _col0) + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + 
GatherStats: false + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false +#### A masked pattern was here #### + TableScan + GatherStats: false + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ +#### A masked pattern was here #### + Partition + base file name: -mr-10004 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern 
was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result +#### A masked pattern was here #### + + Stage: Stage-3 + Stats-Aggr Operator +#### A masked pattern was here #### + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result + name: default.skewjoin_add_result + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -ext-10003 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + 
bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result + name: default.skewjoin_add_result + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230)) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + b + TableScan + alias: b + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: ((((((key = 469) or (key = 348)) or (key = 230)) or (key = 469)) or (key = 348)) or (key = 230)) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 {VALUE._col0} + handleSkewJoin: false + 
outputColumnNames: _col0, _col4 + Select Operator + expressions: + expr: (_col4 + _col0) + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Output: default@skewjoin_add_result +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Output: default@skewjoin_add_result +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] 
+POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value 
EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +2056 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(key)) FROM skewjoin_tmp_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_tmp_result +PREHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(key)) FROM skewjoin_tmp_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_tmp_result +POSTHOOK: Output: default@skewjoin_hash_result_1 +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE 
[(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION 
[(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tst1) a) (TOK_TABREF (TOK_TABNAME tst1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME skewjoin_add_result))) (TOK_SELECT (TOK_SELEXPR (+ (. (TOK_TABLE_OR_COL b) key) (. (TOK_TABLE_OR_COL a) key)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 0 + value expressions: + expr: key + type: int + b + TableScan + alias: b + GatherStats: false + Reduce Output Operator + key expressions: + expr: key + type: int + sort order: + + Map-reduce partition columns: + expr: key + type: int + tag: 1 + value expressions: + expr: key + type: int + Needs Tagging: true + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: tst1 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.tst1 + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 5312 + serialization.ddl struct tst1 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.tst1 + name: default.tst1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 {VALUE._col0} + handleSkewJoin: false + outputColumnNames: _col0, _col4 + Select Operator + expressions: + expr: (_col4 + _col0) + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 3000 + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 4028 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key + columns.types int +#### A masked pattern was here #### + name default.skewjoin_add_result + numFiles 1 + numPartitions 0 + numRows 1028 + rawDataSize 3000 + serialization.ddl struct skewjoin_add_result { i32 key} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 4028 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.skewjoin_add_result +#### A masked pattern was here #### + + Stage: Stage-2 + Stats-Aggr Operator +#### A masked pattern was here #### + + +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tst1 +PREHOOK: Output: default@skewjoin_add_result +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tst1 +POSTHOOK: Output: default@skewjoin_add_result +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value 
EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT 
count(1) FROM skewjoin_add_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_add_result +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM skewjoin_add_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_add_result +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +1028 +PREHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(key)) FROM skewjoin_add_result +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_add_result +PREHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: query: INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(key)) FROM skewjoin_add_result +POSTHOOK: type: QUERY +POSTHOOK: Input: default@skewjoin_add_result +POSTHOOK: Output: default@skewjoin_hash_result_2 +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION 
[(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_add_result)skewjoin_add_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_add_result)skewjoin_add_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@skewjoin_hash_result_1 +PREHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@skewjoin_hash_result_1 +POSTHOOK: Input: default@skewjoin_hash_result_2 +#### A masked pattern was here #### +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_add_result.key EXPRESSION [(tst1)b.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_1.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.key EXPRESSION [(skewjoin_add_result)skewjoin_add_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result2)skewjoin_tmp_result2.FieldSchema(name:value_count, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_tmp_result)skewjoin_tmp_result.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_hash_result_2.value EXPRESSION [(skewjoin_add_result)skewjoin_add_result.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.key EXPRESSION [(tst2)a.FieldSchema(name:key, type:int, comment:null), (tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, 
comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value SIMPLE [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result.value EXPRESSION [(tst2)a.FieldSchema(name:value, type:string, comment:null), (tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.key SIMPLE [(tst1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: skewjoin_tmp_result2.value_count EXPRESSION [(tst1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tst1.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst1.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.key EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tst2.value SIMPLE [(src)a.FieldSchema(name:value, type:string, comment:default), ] +0 0 Index: ql/src/test/queries/clientpositive/skewjoinopt.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt.q (working copy) @@ -0,0 +1,164 @@ +set hive.internal.ddl.list.bucketing.enable=true; + +CREATE TABLE tst1(key INT, value STRING) SKEWED BY (key) ON ((469),(348),(230)); + +INSERT OVERWRITE TABLE tst1 SELECT a.key, a.value FROM src a; + +CREATE TABLE tst2(key INT, value STRING) SKEWED BY (key) ON ((469),(348),(230)); +INSERT OVERWRITE TABLE tst2 SELECT a.key, a.value FROM src a; + +CREATE TABLE skewjoin_tmp_result (value STRING, key INT); + +CREATE TABLE skewjoin_hash_result_1 (key BIGINT, value BIGINT); +CREATE TABLE skewjoin_hash_result_2 (key BIGINT, value BIGINT); + +set hive.optimize.skewjoinoptimization = true; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +SELECT count(1) FROM skewjoin_tmp_result; +INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result; + + +set hive.optimize.skewjoinoptimization = false; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT a.value,a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +SELECT count(1) FROM skewjoin_tmp_result; +INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result; + + +SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key; + + + +CREATE TABLE skewjoin_tmp_result2 (value_count INT,key INT); + +set hive.optimize.skewjoinoptimization = false; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key; + +INSERT OVERWRITE TABLE skewjoin_tmp_result2 
+SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key; + +SELECT count(1) FROM skewjoin_tmp_result; +INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value_count)) FROM skewjoin_tmp_result2; + +set hive.optimize.skewjoinoptimization = false; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key; + +INSERT OVERWRITE TABLE skewjoin_tmp_result2 +SELECT count(a.value),a.key FROM tst1 a JOIN tst1 b ON a.key = b.key GROUP BY a.key; + +SELECT count(1) FROM skewjoin_tmp_result2; +INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value_count)) FROM skewjoin_tmp_result2; + + +SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key; + + +set hive.optimize.skewjoinoptimization = true; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u; + +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u; + +SELECT count(1) FROM skewjoin_tmp_result; +INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result; + + +set hive.optimize.skewjoinoptimization = false; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u; + +INSERT OVERWRITE TABLE skewjoin_tmp_result +SELECT * FROM ( + SELECT a.value,a.key FROM tst2 a JOIN tst1 b ON a.key = b.key + UNION ALL + SELECT a.value,a.key FROM tst1 a JOIN tst2 b ON a.key = b.key + ) u; + + +SELECT count(1) FROM skewjoin_tmp_result; +INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(value)) FROM skewjoin_tmp_result; + + +SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key; + + +CREATE TABLE skewjoin_add_result (key INT); + +set hive.optimize.skewjoinoptimization = true; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +SELECT count(1) FROM skewjoin_tmp_result; +INSERT OVERWRITE TABLE skewjoin_hash_result_1 +SELECT sum(hash(key)), sum(hash(key)) FROM skewjoin_tmp_result; + + +set hive.optimize.skewjoinoptimization = false; + +EXPLAIN EXTENDED +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +INSERT OVERWRITE TABLE skewjoin_add_result +SELECT b.key + a.key FROM tst1 a JOIN tst1 b ON a.key = b.key; + +SELECT count(1) FROM skewjoin_add_result; +INSERT OVERWRITE TABLE skewjoin_hash_result_2 +SELECT sum(hash(key)), sum(hash(key)) FROM skewjoin_add_result; + +SELECT a.key - b.key, a.value - b.value +FROM skewjoin_hash_result_1 a LEFT OUTER JOIN skewjoin_hash_result_2 b +ON a.key = b.key; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java =================================================================== --- 
ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java (working copy) @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.StatsWork; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor for the rule - table scan. */ @@ -62,17 +63,17 @@ TableScanOperator op = (TableScanOperator) nd; GenMRProcContext ctx = (GenMRProcContext) opProcCtx; ParseContext parseCtx = ctx.getParseCtx(); - Map, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx(); + Map, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx(); // create a dummy MapReduce task MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx); Task currTask = TaskFactory.get(currWork, parseCtx.getConf()); - Operator currTopOp = op; + Operator currTopOp = op; ctx.setCurrTask(currTask); ctx.setCurrTopOp(currTopOp); for (String alias : parseCtx.getTopOps().keySet()) { - Operator currOp = parseCtx.getTopOps().get(alias); + Operator currOp = parseCtx.getTopOps().get(alias); if (currOp == op) { String currAliasId = alias; ctx.setCurrAliasId(currAliasId); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink4.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink4.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink4.java (working copy) @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor for the rule - map join followed by reduce sink. @@ -43,7 +44,7 @@ /** * Reduce Scan encountered. - * + * * @param nd * the reduce sink operator encountered * @param opProcCtx @@ -58,13 +59,13 @@ // map-join consisted on a bunch of map-only jobs, and it has been split // after the mapjoin - Operator reducer = op.getChildOperators().get(0); - Map, GenMapRedCtx> mapCurrCtx = ctx + Operator reducer = op.getChildOperators().get(0); + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0)); Task currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); - HashMap, Task> opTaskMap = ctx + HashMap, Task> opTaskMap = ctx .getOpTaskMap(); Task opMapTask = opTaskMap.get(reducer); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java (working copy) @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hive.ql.optimizer.unionproc; -import java.io.Serializable; import java.util.List; import java.util.Stack; @@ -28,6 +27,7 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Operator factory for union processing. 
@@ -42,9 +42,9 @@ int pos = 0; int size = stack.size(); assert size >= 2 && stack.get(size - 1) == union; - Operator parent = (Operator) stack - .get(size - 2); - List> parUnion = union + Operator parent = + (Operator) stack.get(size - 2); + List> parUnion = union .getParentOperators(); pos = parUnion.indexOf(parent); assert pos < parUnion.size(); @@ -145,8 +145,8 @@ int start = stack.size() - 2; UnionOperator parentUnionOperator = null; while (start >= 0) { - Operator parent = - (Operator) stack.get(start); + Operator parent = + (Operator) stack.get(start); if (parent instanceof UnionOperator) { parentUnionOperator = (UnionOperator) parent; break; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -74,6 +73,7 @@ import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde.Constants; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; @@ -97,8 +97,8 @@ } @SuppressWarnings("nls") - private Operator putOpInsertMap(Operator op, - RowResolver rr) { + private Operator + putOpInsertMap(Operator op, RowResolver rr) { OpParseContext ctx = new OpParseContext(rr); pGraphContext.getOpParseCtx().put(op, ctx); return op; @@ -120,18 +120,18 @@ // create a new MapredLocalWork MapredLocalWork newLocalWork = new MapredLocalWork( - new LinkedHashMap>(), + new LinkedHashMap>(), new LinkedHashMap()); - for (Map.Entry> entry : newWork.getAliasToWork() - .entrySet()) { + for (Map.Entry> entry : + newWork.getAliasToWork().entrySet()) { String alias = entry.getKey(); - Operator op = entry.getValue(); + Operator op = entry.getValue(); // if the table scan is for big table; then skip it // tracing down the operator tree from the table scan operator - Operator parentOp = op; - Operator childOp = op.getChildOperators().get(0); + Operator parentOp = op; + Operator childOp = op.getChildOperators().get(0); while ((childOp != null) && (!childOp.equals(mapJoinOp))) { parentOp = childOp; assert parentOp.getChildOperators().size() == 1; @@ -218,10 +218,10 @@ } public static String genMapJoinOpAndLocalWork(MapredWork newWork, JoinOperator op, int mapJoinPos) - throws SemanticException { + throws SemanticException { try { - LinkedHashMap, OpParseContext> opParseCtxMap = newWork - .getOpParseCtxMap(); + LinkedHashMap, OpParseContext> opParseCtxMap = + newWork.getOpParseCtxMap(); QBJoinTree newJoinTree = newWork.getJoinTree(); // generate the map join operator; already checked the map join MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(opParseCtxMap, op, @@ -256,9 +256,9 @@ * @param noCheckOuterJoin */ public static MapJoinOperator convertMapJoin( - LinkedHashMap, OpParseContext> opParseCtxMap, - JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin) - throws SemanticException { + LinkedHashMap, OpParseContext> opParseCtxMap, + JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin) + throws SemanticException { // outer 
join cannot be performed on a table which is being cached JoinDesc desc = op.getConf(); JoinCondDesc[] condns = desc.getConds(); @@ -279,18 +279,22 @@ // The join outputs a concatenation of all the inputs. QBJoinTree leftSrc = joinTree.getJoinSrc(); - List> parentOps = op.getParentOperators(); - List> newParentOps = new ArrayList>(); - List> oldReduceSinkParentOps = new ArrayList>(); + List> parentOps = op.getParentOperators(); + List> newParentOps = + new ArrayList>(); + List> oldReduceSinkParentOps = + new ArrayList>(); Map colExprMap = new HashMap(); - HashMap> columnTransfer = new HashMap>(); + HashMap> columnTransfer = + new HashMap>(); // found a source which is not to be stored in memory if (leftSrc != null) { // assert mapJoinPos == 0; - Operator parentOp = parentOps.get(0); + Operator parentOp = parentOps.get(0); assert parentOp.getParentOperators().size() == 1; - Operator grandParentOp = parentOp.getParentOperators().get(0); + Operator grandParentOp = + parentOp.getParentOperators().get(0); oldReduceSinkParentOps.add(parentOp); grandParentOp.removeChild(parentOp); newParentOps.add(grandParentOp); @@ -300,9 +304,10 @@ // Remove parent reduce-sink operators for (String src : joinTree.getBaseSrc()) { if (src != null) { - Operator parentOp = parentOps.get(pos); + Operator parentOp = parentOps.get(pos); assert parentOp.getParentOperators().size() == 1; - Operator grandParentOp = parentOp.getParentOperators().get(0); + Operator grandParentOp = + parentOp.getParentOperators().get(0); grandParentOp.removeChild(parentOp); oldReduceSinkParentOps.add(parentOp); @@ -389,7 +394,7 @@ Operator[] newPar = new Operator[newParentOps.size()]; pos = 0; - for (Operator o : newParentOps) { + for (Operator o : newParentOps) { newPar[pos++] = o; } @@ -461,8 +466,8 @@ // change the children of the original join operator to point to the map // join operator - List> childOps = op.getChildOperators(); - for (Operator childOp : childOps) { + List> childOps = op.getChildOperators(); + for (Operator childOp : childOps) { childOp.replaceParent(op, mapJoinOp); } @@ -482,7 +487,7 @@ && HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN); - LinkedHashMap, OpParseContext> opParseCtxMap = pctx + LinkedHashMap, OpParseContext> opParseCtxMap = pctx .getOpParseCtx(); MapJoinOperator mapJoinOp = convertMapJoin(opParseCtxMap, op, joinTree, mapJoinPos, noCheckOuterJoin); @@ -577,7 +582,7 @@ } private void genSelectPlan(ParseContext pctx, MapJoinOperator input) throws SemanticException { - List> childOps = input.getChildOperators(); + List> childOps = input.getChildOperators(); input.setChildOperators(null); // create a dummy select - This select is needed by the walker to split the @@ -613,7 +618,7 @@ // Insert the select operator in between. 
sel.setChildOperators(childOps); - for (Operator ch : childOps) { + for (Operator ch : childOps) { ch.replaceParent(input, sel); } } @@ -764,12 +769,12 @@ } private Boolean findGrandChildSubqueryMapjoin(MapJoinWalkerCtx ctx, MapJoinOperator mapJoin) { - Operator parent = mapJoin; + Operator parent = mapJoin; while (true) { if (parent.getChildOperators() == null || parent.getChildOperators().size() != 1) { return null; } - Operator ch = parent.getChildOperators().get(0); + Operator ch = parent.getChildOperators().get(0); if (ch instanceof MapJoinOperator) { if (!nonSubqueryMapJoin(ctx.getpGraphContext(), (MapJoinOperator) ch, mapJoin)) { if (ch.getParentOperators().indexOf(parent) == ((MapJoinOperator) ch).getConf() Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java (working copy) @@ -41,6 +41,7 @@ import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor Context for creating map reduce task. Walk the tree in a DFS manner @@ -54,7 +55,7 @@ */ public static class GenMapRedCtx { Task currTask; - Operator currTopOp; + Operator currTopOp; String currAliasId; public GenMapRedCtx() { @@ -69,7 +70,7 @@ * the current alias for the to operator */ public GenMapRedCtx(Task currTask, - Operator currTopOp, String currAliasId) { + Operator currTopOp, String currAliasId) { this.currTask = currTask; this.currTopOp = currTopOp; this.currAliasId = currAliasId; @@ -85,7 +86,7 @@ /** * @return current top operator */ - public Operator getCurrTopOp() { + public Operator getCurrTopOp() { return currTopOp; } @@ -105,13 +106,13 @@ Task uTask; List taskTmpDir; List tt_desc; - List> listTopOperators; + List> listTopOperators; public GenMRUnionCtx() { uTask = null; taskTmpDir = new ArrayList(); tt_desc = new ArrayList(); - listTopOperators = new ArrayList>(); + listTopOperators = new ArrayList>(); } public Task getUTask() { @@ -138,16 +139,16 @@ return tt_desc; } - public List> getListTopOperators() { + public List> getListTopOperators() { return listTopOperators; } public void setListTopOperators( - List> listTopOperators) { + List> listTopOperators) { this.listTopOperators = listTopOperators; } - public void addListTopOperators(Operator topOperator) { + public void addListTopOperators(Operator topOperator) { listTopOperators.add(topOperator); } } @@ -159,7 +160,7 @@ public static class GenMRMapJoinCtx { String taskTmpDir; TableDesc tt_desc; - Operator rootMapJoinOp; + Operator rootMapJoinOp; AbstractMapJoinOperator oldMapJoin; public GenMRMapJoinCtx() { @@ -176,7 +177,7 @@ * @param oldMapJoin */ public GenMRMapJoinCtx(String taskTmpDir, TableDesc tt_desc, - Operator rootMapJoinOp, + Operator rootMapJoinOp, AbstractMapJoinOperator oldMapJoin) { this.taskTmpDir = taskTmpDir; this.tt_desc = tt_desc; @@ -203,7 +204,7 @@ /** * @return the childSelect */ - public Operator getRootMapJoinOp() { + public Operator getRootMapJoinOp() { return rootMapJoinOp; } @@ -211,7 +212,7 @@ * @param rootMapJoinOp * the rootMapJoinOp to set */ - public void setRootMapJoinOp(Operator rootMapJoinOp) { + public void setRootMapJoinOp(Operator rootMapJoinOp) { this.rootMapJoinOp = rootMapJoinOp; } @@ -232,23 +233,24 @@ } 
private HiveConf conf; - private HashMap, Task> opTaskMap; + private + HashMap, Task> opTaskMap; private HashMap unionTaskMap; private HashMap, GenMRMapJoinCtx> mapJoinTaskMap; - private List> seenOps; + private List> seenOps; private List seenFileSinkOps; private ParseContext parseCtx; private List> mvTask; private List> rootTasks; - private LinkedHashMap, GenMapRedCtx> mapCurrCtx; + private LinkedHashMap, GenMapRedCtx> mapCurrCtx; private Task currTask; - private Operator currTopOp; + private Operator currTopOp; private UnionOperator currUnionOp; private AbstractMapJoinOperator currMapJoinOp; private String currAliasId; - private List> rootOps; + private List> rootOps; private DependencyCollectionTask dependencyTaskForMultiInsert; /** @@ -287,11 +289,11 @@ */ public GenMRProcContext( HiveConf conf, - HashMap, Task> opTaskMap, - List> seenOps, ParseContext parseCtx, + HashMap, Task> opTaskMap, + List> seenOps, ParseContext parseCtx, List> mvTask, List> rootTasks, - LinkedHashMap, GenMapRedCtx> mapCurrCtx, + LinkedHashMap, GenMapRedCtx> mapCurrCtx, Set inputs, Set outputs) { this.conf = conf; this.opTaskMap = opTaskMap; @@ -307,7 +309,7 @@ currUnionOp = null; currMapJoinOp = null; currAliasId = null; - rootOps = new ArrayList>(); + rootOps = new ArrayList>(); rootOps.addAll(parseCtx.getTopOps().values()); unionTaskMap = new HashMap(); mapJoinTaskMap = new HashMap, GenMRMapJoinCtx>(); @@ -317,7 +319,8 @@ /** * @return reducer to task mapping */ - public HashMap, Task> getOpTaskMap() { + public HashMap, + Task> getOpTaskMap() { return opTaskMap; } @@ -326,14 +329,14 @@ * reducer to task mapping */ public void setOpTaskMap( - HashMap, Task> opTaskMap) { + HashMap, Task> opTaskMap) { this.opTaskMap = opTaskMap; } /** * @return operators already visited */ - public List> getSeenOps() { + public List> getSeenOps() { return seenOps; } @@ -348,7 +351,7 @@ * @param seenOps * operators already visited */ - public void setSeenOps(List> seenOps) { + public void setSeenOps(List> seenOps) { this.seenOps = seenOps; } @@ -363,7 +366,7 @@ /** * @return top operators for tasks */ - public List> getRootOps() { + public List> getRootOps() { return rootOps; } @@ -371,7 +374,7 @@ * @param rootOps * top operators for tasks */ - public void setRootOps(List> rootOps) { + public void setRootOps(List> rootOps) { this.rootOps = rootOps; } @@ -423,7 +426,7 @@ /** * @return operator to task mappings */ - public LinkedHashMap, GenMapRedCtx> getMapCurrCtx() { + public LinkedHashMap, GenMapRedCtx> getMapCurrCtx() { return mapCurrCtx; } @@ -432,7 +435,7 @@ * operator to task mappings */ public void setMapCurrCtx( - LinkedHashMap, GenMapRedCtx> mapCurrCtx) { + LinkedHashMap, GenMapRedCtx> mapCurrCtx) { this.mapCurrCtx = mapCurrCtx; } @@ -454,7 +457,7 @@ /** * @return current top operator */ - public Operator getCurrTopOp() { + public Operator getCurrTopOp() { return currTopOp; } @@ -462,7 +465,7 @@ * @param currTopOp * current top operator */ - public void setCurrTopOp(Operator currTopOp) { + public void setCurrTopOp(Operator currTopOp) { this.currTopOp = currTopOp; } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GlobalLimitOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GlobalLimitOptimizer.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GlobalLimitOptimizer.java (working copy) @@ -18,6 +18,8 @@ package org.apache.hadoop.hive.ql.optimizer; +import java.util.Map; + import 
org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -35,10 +37,8 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.SplitSample; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; -import java.io.Serializable; -import java.util.Map; - /** * This optimizer is used to reduce the input size for the query for queries which are * specifying a limit. @@ -58,7 +58,7 @@ public ParseContext transform(ParseContext pctx) throws SemanticException { Context ctx = pctx.getContext(); - Map> topOps = pctx.getTopOps(); + Map> topOps = pctx.getTopOps(); GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx(); Map opToPartPruner = pctx.getOpToPartPruner(); Map opToPartList = pctx.getOpToPartList(); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java (working copy) @@ -39,8 +39,8 @@ import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRUnionCtx; import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext; +import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcFactory; -import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; @@ -50,6 +50,7 @@ import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor for the rule - TableScan followed by Union. 
@@ -89,7 +90,7 @@ } else { ctx.getMapCurrCtx().put( - (Operator) union, + (Operator) union, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrTopOp(), ctx.getCurrAliasId())); } @@ -127,8 +128,8 @@ * @param uCtxTask */ private void processSubQueryUnionCreateIntermediate( - Operator parent, - Operator child, + Operator parent, + Operator child, Task uTask, GenMRProcContext ctx, GenMRUnionCtx uCtxTask) { ParseContext parseCtx = ctx.getParseCtx(); @@ -141,21 +142,23 @@ String taskTmpDir = baseCtx.getMRTmpFileURI(); // Create a file sink operator for this file name - Operator fs_op = OperatorFactory.get( + Operator fs_op = OperatorFactory.get( new FileSinkDesc(taskTmpDir, tt_desc, parseCtx.getConf().getBoolVar( HiveConf.ConfVars.COMPRESSINTERMEDIATE)), parent.getSchema()); assert parent.getChildOperators().size() == 1; parent.getChildOperators().set(0, fs_op); - List> parentOpList = new ArrayList>(); + List> parentOpList = + new ArrayList>(); parentOpList.add(parent); fs_op.setParentOperators(parentOpList); // Create a dummy table scan operator - Operator ts_op = OperatorFactory.get( + Operator ts_op = OperatorFactory.get( new TableScanDesc(), parent.getSchema()); - List> childOpList = new ArrayList>(); + List> childOpList = + new ArrayList>(); childOpList.add(child); ts_op.setChildOperators(childOpList); child.replaceParent(parent, ts_op); @@ -199,8 +202,8 @@ Task uTask = uCtxTask.getUTask(); MapredWork plan = (MapredWork) uTask.getWork(); ctx.setCurrTask(uTask); - List> seenOps = ctx.getSeenOps(); - Operator topOp = ctx.getCurrTopOp(); + List> seenOps = ctx.getSeenOps(); + Operator topOp = ctx.getCurrTopOp(); if (!seenOps.contains(topOp) && topOp != null) { seenOps.add(topOp); GenMapRedUtils.setTaskPlan(ctx.getCurrAliasId(), ctx @@ -247,7 +250,7 @@ // Map-only subqueries can be optimized in future to not write to a file in // future - Map, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx(); + Map, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx(); UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union); @@ -305,7 +308,7 @@ ctx.setCurrTask(uTask); - mapCurrCtx.put((Operator) nd, + mapCurrCtx.put((Operator) nd, new GenMapRedCtx(ctx.getCurrTask(), null, null)); return null; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -67,6 +66,7 @@ import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Factory for generating the different node processors used by ColumnPruner. @@ -154,8 +154,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object... 
nodeOutputs) throws SemanticException { ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; - cppCtx.getPrunedColLists().put((Operator) nd, - cppCtx.genColLists((Operator) nd)); + cppCtx.getPrunedColLists().put((Operator) nd, + cppCtx.genColLists((Operator) nd)); return null; } @@ -180,8 +180,8 @@ TableScanOperator scanOp = (TableScanOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; List cols = cppCtx - .genColLists((Operator) nd); - cppCtx.getPrunedColLists().put((Operator) nd, + .genColLists((Operator) nd); + cppCtx.getPrunedColLists().put((Operator) nd, cols); ArrayList needed_columns = new ArrayList(); RowResolver inputRR = cppCtx.getOpToParseCtxMap().get(scanOp).getRowResolver(); @@ -241,13 +241,13 @@ Object... nodeOutputs) throws SemanticException { ReduceSinkOperator op = (ReduceSinkOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; - HashMap, OpParseContext> opToParseCtxMap = cppCtx + HashMap, OpParseContext> opToParseCtxMap = cppCtx .getOpToParseCtxMap(); RowResolver redSinkRR = opToParseCtxMap.get(op).getRowResolver(); ReduceSinkDesc conf = op.getConf(); - List> childOperators = op + List> childOperators = op .getChildOperators(); - List> parentOperators = op + List> parentOperators = op .getParentOperators(); List colLists = new ArrayList(); @@ -259,7 +259,7 @@ if ((childOperators.size() == 1) && (childOperators.get(0) instanceof JoinOperator)) { assert parentOperators.size() == 1; - Operator par = parentOperators.get(0); + Operator par = parentOperators.get(0); JoinOperator childJoin = (JoinOperator) childOperators.get(0); RowResolver parRR = opToParseCtxMap.get(par).getRowResolver(); List childJoinCols = cppCtx.getJoinPrunedColLists().get( @@ -405,7 +405,7 @@ LateralViewJoinOperator lvJoin = null; if (op.getChildOperators() != null) { - for (Operator child : op.getChildOperators()) { + for (Operator child : op.getChildOperators()) { // If one of my children is a FileSink or Script, return all columns. 
// Without this break, a bug in ReduceSink to Extract edge column // pruning will manifest @@ -490,14 +490,14 @@ */ private void handleChildren(SelectOperator op, List retainedSelOutputCols, ColumnPrunerProcCtx cppCtx) throws SemanticException { - for (Operator child : op.getChildOperators()) { + for (Operator child : op.getChildOperators()) { if (child instanceof ReduceSinkOperator) { boolean[] flags = getPruneReduceSinkOpRetainFlags( retainedSelOutputCols, (ReduceSinkOperator) child); pruneReduceSinkOperator(flags, (ReduceSinkOperator) child, cppCtx); } else if (child instanceof FilterOperator) { // filter operator has the same output columns as its parent - for (Operator filterChild : child + for (Operator filterChild : child .getChildOperators()) { if (filterChild instanceof ReduceSinkOperator) { boolean[] flags = getPruneReduceSinkOpRetainFlags( @@ -647,7 +647,7 @@ } private static void pruneOperator(NodeProcessorCtx ctx, - Operator op, + Operator op, List cols) throws SemanticException { // the pruning needs to preserve the order of columns in the input schema @@ -671,7 +671,7 @@ * @return * @throws SemanticException */ - private static List preserveColumnOrder(Operator op, + private static List preserveColumnOrder(Operator op, List cols) throws SemanticException { RowSchema inputSchema = op.getSchema(); @@ -696,10 +696,10 @@ Map> retainMap, boolean mapJoin) throws SemanticException { ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; Map> prunedColLists = new HashMap>(); - List> childOperators = op + List> childOperators = op .getChildOperators(); - for (Operator child : childOperators) { + for (Operator child : childOperators) { if (child instanceof FileSinkOperator) { return; } @@ -787,7 +787,7 @@ } - for (Operator child : childOperators) { + for (Operator child : childOperators) { if (child instanceof ReduceSinkOperator) { boolean[] flags = getPruneReduceSinkOpRetainFlags(childColLists, (ReduceSinkOperator) child); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink1.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink1.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink1.java (working copy) @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor for the rule - table scan followed by reduce sink. @@ -43,7 +44,7 @@ /** * Reduce Scan encountered. 
- * + * * @param nd * the reduce sink operator encountered * @param opProcCtx @@ -54,15 +55,15 @@ ReduceSinkOperator op = (ReduceSinkOperator) nd; GenMRProcContext ctx = (GenMRProcContext) opProcCtx; - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(stack.get(stack.size() - 2)); Task currTask = mapredCtx.getCurrTask(); MapredWork currPlan = (MapredWork) currTask.getWork(); - Operator currTopOp = mapredCtx.getCurrTopOp(); + Operator currTopOp = mapredCtx.getCurrTopOp(); String currAliasId = mapredCtx.getCurrAliasId(); - Operator reducer = op.getChildOperators().get(0); - HashMap, Task> opTaskMap = ctx + Operator reducer = op.getChildOperators().get(0); + HashMap, Task> opTaskMap = ctx .getOpTaskMap(); Task opMapTask = opTaskMap.get(reducer); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (working copy) @@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; import org.apache.hadoop.hive.ql.plan.MapredWork; @@ -69,7 +70,7 @@ import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; -import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * General utility common functions for the Processor to convert operator into @@ -92,14 +93,15 @@ */ public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException { - Operator reducer = op.getChildOperators().get(0); - Map, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); + Operator reducer = op.getChildOperators().get(0); + Map, GenMapRedCtx> mapCurrCtx = + opProcCtx.getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0)); Task currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); - HashMap, Task> opTaskMap = + HashMap, Task> opTaskMap = opProcCtx.getOpTaskMap(); - Operator currTopOp = opProcCtx.getCurrTopOp(); + Operator currTopOp = opProcCtx.getCurrTopOp(); opTaskMap.put(reducer, currTask); plan.setReducer(reducer); @@ -117,7 +119,7 @@ } assert currTopOp != null; - List> seenOps = opProcCtx.getSeenOps(); + List> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp)) { @@ -134,8 +136,9 @@ } public static void initMapJoinPlan( - Operator op, GenMRProcContext ctx, - boolean readInputMapJoin, boolean readInputUnion, boolean setReducer, int pos) throws SemanticException { + Operator op, GenMRProcContext ctx, + boolean readInputMapJoin, boolean readInputUnion, boolean setReducer, int pos) + throws SemanticException { initMapJoinPlan(op, ctx, readInputMapJoin, readInputUnion, setReducer, pos, false); } @@ -149,20 +152,21 @@ * @param pos * position of the parent */ - public static void initMapJoinPlan(Operator op, + public static void initMapJoinPlan(Operator op, GenMRProcContext opProcCtx, boolean 
readInputMapJoin, boolean readInputUnion, boolean setReducer, int pos, boolean createLocalPlan) throws SemanticException { - Map, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); + Map, GenMapRedCtx> mapCurrCtx = + opProcCtx.getMapCurrCtx(); assert (((pos == -1) && (readInputMapJoin)) || (pos != -1)); int parentPos = (pos == -1) ? 0 : pos; GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get( parentPos)); Task currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); - HashMap, Task> opTaskMap = + HashMap, Task> opTaskMap = opProcCtx.getOpTaskMap(); - Operator currTopOp = opProcCtx.getCurrTopOp(); + Operator currTopOp = opProcCtx.getCurrTopOp(); // The mapjoin has already been encountered. Some context must be stored // about that @@ -173,7 +177,7 @@ false : true; if (setReducer) { - Operator reducer = op.getChildOperators().get(0); + Operator reducer = op.getChildOperators().get(0); plan.setReducer(reducer); opTaskMap.put(reducer, currTask); if (reducer.getClass() == JoinOperator.class) { @@ -189,7 +193,7 @@ GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(currMapJoinOp); String taskTmpDir; TableDesc tt_desc; - Operator rootOp; + Operator rootOp; if (mjCtx.getOldMapJoin() == null || setReducer) { taskTmpDir = mjCtx.getTaskTmpDir(); @@ -222,7 +226,7 @@ } assert currTopOp != null; - List> seenOps = opProcCtx.getSeenOps(); + List> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); seenOps.add(currTopOp); @@ -249,7 +253,7 @@ } if (localPlan == null && createLocalPlan) { localPlan = new MapredLocalWork( - new LinkedHashMap>(), + new LinkedHashMap>(), new LinkedHashMap()); } } else { @@ -298,10 +302,10 @@ public static void initUnionPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx, Task unionTask) throws SemanticException { - Operator reducer = op.getChildOperators().get(0); + Operator reducer = op.getChildOperators().get(0); MapredWork plan = (MapredWork) unionTask.getWork(); - HashMap, Task> opTaskMap = + HashMap, Task> opTaskMap = opProcCtx.getOpTaskMap(); opTaskMap.put(reducer, unionTask); @@ -320,10 +324,10 @@ private static void setUnionPlan(GenMRProcContext opProcCtx, boolean local, MapredWork plan, GenMRUnionCtx uCtx, boolean mergeTask) throws SemanticException { - Operator currTopOp = opProcCtx.getCurrTopOp(); + Operator currTopOp = opProcCtx.getCurrTopOp(); if (currTopOp != null) { - List> seenOps = opProcCtx.getSeenOps(); + List> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp) || mergeTask) { seenOps.add(currTopOp); @@ -340,7 +344,7 @@ int size = taskTmpDirLst.size(); assert local == false; - List> topOperators = + List> topOperators = uCtx.getListTopOperators(); for (int pos = 0; pos < size; pos++) { @@ -422,7 +426,7 @@ opProcCtx.setCurrTask(existingTask); } - public static void joinPlan(Operator op, + public static void joinPlan(Operator op, Task oldTask, Task task, GenMRProcContext opProcCtx, int pos, boolean split, boolean readMapJoinData, boolean readUnionData) throws SemanticException { @@ -443,14 +447,14 @@ * @param pos * position of the parent in the stack */ - public static void joinPlan(Operator op, + public static void joinPlan(Operator op, Task oldTask, Task task, GenMRProcContext opProcCtx, int pos, boolean split, boolean readMapJoinData, boolean readUnionData, boolean createLocalWork) throws SemanticException { Task currTask = task; MapredWork plan = (MapredWork) currTask.getWork(); - Operator currTopOp = 
opProcCtx.getCurrTopOp(); + Operator currTopOp = opProcCtx.getCurrTopOp(); List> parTasks = null; // terminate the old task and make current task dependent on it @@ -471,7 +475,7 @@ } if (currTopOp != null) { - List> seenOps = opProcCtx.getSeenOps(); + List> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp)) { @@ -500,7 +504,7 @@ AbstractMapJoinOperator oldMapJoin = mjCtx.getOldMapJoin(); String taskTmpDir = null; TableDesc tt_desc = null; - Operator rootOp = null; + Operator rootOp = null; boolean local = ((pos == -1) || (pos == (mjOp.getConf()) .getPosBigTable())) ? false : true; @@ -552,7 +556,7 @@ MapredWork cplan = getMapRedWork(parseCtx); Task redTask = TaskFactory.get(cplan, parseCtx .getConf()); - Operator reducer = op.getChildOperators().get(0); + Operator reducer = op.getChildOperators().get(0); // Add the reducer cplan.setReducer(reducer); @@ -560,7 +564,7 @@ cplan.setNumReduceTasks(new Integer(desc.getNumReducers())); - HashMap, Task> opTaskMap = + HashMap, Task> opTaskMap = opProcCtx.getOpTaskMap(); opTaskMap.put(reducer, redTask); Task currTask = opProcCtx.getCurrTask(); @@ -584,7 +588,7 @@ * processing context */ public static void setTaskPlan(String alias_id, - Operator topOp, MapredWork plan, boolean local, + Operator topOp, MapredWork plan, boolean local, GenMRProcContext opProcCtx) throws SemanticException { setTaskPlan(alias_id, topOp, plan, local, opProcCtx, null); } @@ -606,7 +610,7 @@ * pruned partition list. If it is null it will be computed on-the-fly. */ public static void setTaskPlan(String alias_id, - Operator topOp, MapredWork plan, boolean local, + Operator topOp, MapredWork plan, boolean local, GenMRProcContext opProcCtx, PrunedPartitionList pList) throws SemanticException { ParseContext parseCtx = opProcCtx.getParseCtx(); Set inputs = opProcCtx.getInputs(); @@ -810,7 +814,7 @@ MapredLocalWork localPlan = plan.getMapLocalWork(); if (localPlan == null) { localPlan = new MapredLocalWork( - new LinkedHashMap>(), + new LinkedHashMap>(), new LinkedHashMap()); } @@ -845,7 +849,7 @@ * table descriptor */ public static void setTaskPlan(String path, String alias, - Operator topOp, MapredWork plan, boolean local, + Operator topOp, MapredWork plan, boolean local, TableDesc tt_desc) throws SemanticException { if(path == null || alias == null) { @@ -864,7 +868,7 @@ MapredLocalWork localPlan = plan.getMapLocalWork(); if (localPlan == null) { localPlan = new MapredLocalWork( - new LinkedHashMap>(), + new LinkedHashMap>(), new LinkedHashMap()); } @@ -885,7 +889,7 @@ * current top operator in the path */ public static void setKeyAndValueDesc(MapredWork plan, - Operator topOp) { + Operator topOp) { if (topOp == null) { return; } @@ -900,9 +904,9 @@ } tagToSchema.set(tag, rs.getConf().getValueSerializeInfo()); } else { - List> children = topOp.getChildOperators(); + List> children = topOp.getChildOperators(); if (children != null) { - for (Operator op : children) { + for (Operator op : children) { setKeyAndValueDesc(plan, op); } } @@ -935,7 +939,7 @@ work.setMapperCannotSpanPartns(mapperCannotSpanPartns); work.setPathToAliases(new LinkedHashMap>()); work.setPathToPartitionInfo(new LinkedHashMap()); - work.setAliasToWork(new LinkedHashMap>()); + work.setAliasToWork(new LinkedHashMap>()); work.setTagToValueDesc(new ArrayList()); work.setReducer(null); work.setHadoopSupportsSplittable( @@ -954,8 +958,8 @@ * parse context */ @SuppressWarnings("nls") - private static Operator putOpInsertMap( - Operator op, RowResolver 
rr, ParseContext parseCtx) { + public static Operator putOpInsertMap( + Operator op, RowResolver rr, ParseContext parseCtx) { OpParseContext ctx = new OpParseContext(rr); parseCtx.getOpParseCtx().put(op, ctx); return op; @@ -971,12 +975,12 @@ * @param setReducer does the reducer needs to be set * @param pos position of the parent **/ - public static void splitTasks(Operator op, + public static void splitTasks(Operator op, Task parentTask, Task childTask, GenMRProcContext opProcCtx, boolean setReducer, boolean local, int posn) throws SemanticException { childTask.getWork(); - Operator currTopOp = opProcCtx.getCurrTopOp(); + Operator currTopOp = opProcCtx.getCurrTopOp(); ParseContext parseCtx = opProcCtx.getParseCtx(); parentTask.addDependentTask(childTask); @@ -992,7 +996,7 @@ Context baseCtx = parseCtx.getContext(); String taskTmpDir = baseCtx.getMRTmpFileURI(); - Operator parent = op.getParentOperators().get(posn); + Operator parent = op.getParentOperators().get(posn); TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils .getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol")); @@ -1007,11 +1011,11 @@ desc.setCompressType(parseCtx.getConf().getVar( HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE)); } - Operator fs_op = putOpInsertMap(OperatorFactory + Operator fs_op = putOpInsertMap(OperatorFactory .get(desc, parent.getSchema()), null, parseCtx); // replace the reduce child with this operator - List> childOpList = parent + List> childOpList = parent .getChildOperators(); for (int pos = 0; pos < childOpList.size(); pos++) { if (childOpList.get(pos) == op) { @@ -1020,30 +1024,31 @@ } } - List> parentOpList = - new ArrayList>(); + List> parentOpList = + new ArrayList>(); parentOpList.add(parent); fs_op.setParentOperators(parentOpList); // create a dummy tableScan operator on top of op // TableScanOperator is implicitly created here for each MapOperator RowResolver rowResolver = opProcCtx.getParseCtx().getOpParseCtx().get(parent).getRowResolver(); - Operator ts_op = putOpInsertMap(OperatorFactory + Operator ts_op = putOpInsertMap(OperatorFactory .get(TableScanDesc.class, parent.getSchema()), rowResolver, parseCtx); - childOpList = new ArrayList>(); + childOpList = new ArrayList>(); childOpList.add(op); ts_op.setChildOperators(childOpList); op.getParentOperators().set(posn, ts_op); - Map, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); + Map, GenMapRedCtx> mapCurrCtx = + opProcCtx.getMapCurrCtx(); mapCurrCtx.put(ts_op, new GenMapRedCtx(childTask, null, null)); String streamDesc = taskTmpDir; MapredWork cplan = (MapredWork) childTask.getWork(); if (setReducer) { - Operator reducer = op.getChildOperators().get(0); + Operator reducer = op.getChildOperators().get(0); if (reducer.getClass() == JoinOperator.class) { String origStreamDesc; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMROperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMROperator.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMROperator.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import java.io.Serializable; import java.util.Map; import java.util.Stack; @@ -28,6 +27,7 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor for the 
rule - no specific rule fired. @@ -39,7 +39,7 @@ /** * Reduce Scan encountered. - * + * * @param nd * the reduce sink operator encountered * @param procCtx @@ -49,10 +49,10 @@ Object... nodeOutputs) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) procCtx; - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(stack.get(stack.size() - 2)); - mapCurrCtx.put((Operator) nd, new GenMapRedCtx( + mapCurrCtx.put((Operator) nd, new GenMapRedCtx( mapredCtx.getCurrTask(), mapredCtx.getCurrTopOp(), mapredCtx .getCurrAliasId())); return null; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java (working copy) @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Operator factory for MapJoin processing. @@ -57,9 +58,9 @@ int pos = 0; int size = stack.size(); assert size >= 2 && stack.get(size - 1) == op; - Operator parent = (Operator) stack - .get(size - 2); - List> parOp = op.getParentOperators(); + Operator parent = + (Operator) stack.get(size - 2); + List> parOp = op.getParentOperators(); pos = parOp.indexOf(parent); assert pos < parOp.size(); return pos; @@ -72,24 +73,24 @@ @Override public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { + Object... nodeOutputs) throws SemanticException { AbstractMapJoinOperator mapJoin = (AbstractMapJoinOperator) nd; GenMRProcContext ctx = (GenMRProcContext) procCtx; // find the branch on which this processor was invoked int pos = getPositionParent(mapJoin, stack); - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(mapJoin.getParentOperators().get( pos)); Task currTask = mapredCtx.getCurrTask(); MapredWork currPlan = (MapredWork) currTask.getWork(); - Operator currTopOp = mapredCtx.getCurrTopOp(); + Operator currTopOp = mapredCtx.getCurrTopOp(); String currAliasId = mapredCtx.getCurrAliasId(); - Operator reducer = mapJoin; - HashMap, Task> opTaskMap = ctx - .getOpTaskMap(); + Operator reducer = mapJoin; + HashMap, Task> opTaskMap = + ctx.getOpTaskMap(); Task opMapTask = opTaskMap.get(reducer); ctx.setCurrTopOp(currTopOp); @@ -138,11 +139,11 @@ : true; GenMapRedUtils.splitTasks(mapJoin, currTask, redTask, opProcCtx, false, - local, pos); + local, pos); currTask = opProcCtx.getCurrTask(); - HashMap, Task> opTaskMap = opProcCtx - .getOpTaskMap(); + HashMap, Task> opTaskMap = + opProcCtx.getOpTaskMap(); Task opMapTask = opTaskMap.get(mapJoin); // If the plan for this reducer does not exist, initialize the plan @@ -195,9 +196,9 @@ if (listMapJoinOps.contains(mapJoin)) { ctx.setCurrAliasId(null); ctx.setCurrTopOp(null); - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); - mapCurrCtx.put((Operator) nd, new GenMapRedCtx( + mapCurrCtx.put((Operator) nd, new GenMapRedCtx( ctx.getCurrTask(), null, null)); return null; } @@ -230,14 +231,15 @@ sel.setParentOperators(null); // Create a file sink operator for this file name - Operator fs_op = OperatorFactory.get( + 
Operator fs_op = OperatorFactory.get( new FileSinkDesc(taskTmpDir, tt_desc, parseCtx.getConf().getBoolVar( HiveConf.ConfVars.COMPRESSINTERMEDIATE)), mapJoin.getSchema()); assert mapJoin.getChildOperators().size() == 1; mapJoin.getChildOperators().set(0, fs_op); - List> parentOpList = new ArrayList>(); + List> parentOpList = + new ArrayList>(); parentOpList.add(mapJoin); fs_op.setParentOperators(parentOpList); @@ -247,9 +249,9 @@ ctx.setCurrAliasId(null); ctx.setCurrTopOp(null); - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); - mapCurrCtx.put((Operator) nd, new GenMapRedCtx( + mapCurrCtx.put((Operator) nd, new GenMapRedCtx( ctx.getCurrTask(), null, null)); return null; @@ -263,8 +265,9 @@ @Override public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { - AbstractMapJoinOperator mapJoin = (AbstractMapJoinOperator) nd; + Object... nodeOutputs) throws SemanticException { + AbstractMapJoinOperator mapJoin = + (AbstractMapJoinOperator) nd; GenMRProcContext ctx = (GenMRProcContext) procCtx; ctx.getParseCtx(); @@ -282,16 +285,16 @@ // find the branch on which this processor was invoked int pos = getPositionParent(mapJoin, stack); - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(mapJoin.getParentOperators().get( pos)); Task currTask = mapredCtx.getCurrTask(); MapredWork currPlan = (MapredWork) currTask.getWork(); mapredCtx.getCurrAliasId(); - Operator reducer = mapJoin; - HashMap, Task> opTaskMap = ctx - .getOpTaskMap(); + Operator reducer = mapJoin; + HashMap, Task> opTaskMap = + ctx.getOpTaskMap(); Task opMapTask = opTaskMap.get(reducer); ctx.setCurrTask(currTask); @@ -321,7 +324,7 @@ @Override public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { + Object... 
nodeOutputs) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) procCtx; ParseContext parseCtx = ctx.getParseCtx(); @@ -341,15 +344,15 @@ // find the branch on which this processor was invoked int pos = getPositionParent(mapJoin, stack); - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(mapJoin.getParentOperators().get( pos)); Task currTask = mapredCtx.getCurrTask(); MapredWork currPlan = (MapredWork) currTask.getWork(); - Operator reducer = mapJoin; - HashMap, Task> opTaskMap = ctx - .getOpTaskMap(); + Operator reducer = mapJoin; + HashMap, Task> opTaskMap = + ctx.getOpTaskMap(); Task opMapTask = opTaskMap.get(reducer); // union result cannot be a map table Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MapJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MapJoinResolver.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MapJoinResolver.java (working copy) @@ -47,12 +47,14 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ConditionalResolver; import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin; +import + org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; import org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin; +import org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx; import org.apache.hadoop.hive.ql.plan.ConditionalWork; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; import org.apache.hadoop.hive.ql.plan.MapredWork; -import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; -import org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * An implementation of PhysicalPlanResolver. 
It iterator each MapRedTask to see whether the task @@ -122,7 +124,8 @@ // replace the map join operator to local_map_join operator in the operator tree // and return all the dummy parent LocalMapJoinProcCtx localMapJoinProcCtx= adjustLocalTask(localTask); - List> dummyOps = localMapJoinProcCtx.getDummyParentOp(); + List> dummyOps = + localMapJoinProcCtx.getDummyParentOp(); // create new local work and setup the dummy ops MapredLocalWork newLocalWork = new MapredLocalWork(); @@ -264,13 +267,13 @@ public static class LocalMapJoinProcCtx implements NodeProcessorCtx { private Task currentTask; private ParseContext parseCtx; - private List> dummyParentOp = null; + private List> dummyParentOp = null; private boolean isFollowedByGroupBy; public LocalMapJoinProcCtx(Task task, ParseContext parseCtx) { currentTask = task; this.parseCtx = parseCtx; - dummyParentOp = new ArrayList>(); + dummyParentOp = new ArrayList>(); isFollowedByGroupBy = false; } @@ -297,15 +300,15 @@ this.parseCtx = parseCtx; } - public void setDummyParentOp(List> op) { + public void setDummyParentOp(List> op) { this.dummyParentOp = op; } - public List> getDummyParentOp() { + public List> getDummyParentOp() { return this.dummyParentOp; } - public void addDummyParentOp(Operator op) { + public void addDummyParentOp(Operator op) { this.dummyParentOp.add(op); } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/GenMRSkewJoinProcessor.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/GenMRSkewJoinProcessor.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/GenMRSkewJoinProcessor.java (working copy) @@ -56,6 +56,7 @@ import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; @@ -250,14 +251,14 @@ throw new SemanticException(e); } - Operator[] parentOps = new TableScanOperator[tags.length]; + Operator[] parentOps = new TableScanOperator[tags.length]; for (int k = 0; k < tags.length; k++) { - Operator ts = OperatorFactory.get( + Operator ts = OperatorFactory.get( TableScanDesc.class, (RowSchema) null); ((TableScanOperator)ts).setTableDesc(tableDescList.get((byte)k)); parentOps[k] = ts; } - Operator tblScan_op = parentOps[i]; + Operator tblScan_op = parentOps[i]; ArrayList aliases = new ArrayList(); String alias = src.toString(); @@ -275,7 +276,7 @@ newPlan.getPathToPartitionInfo().put(bigKeyDirPath, part); newPlan.getAliasToPartnInfo().put(alias, part); - Operator reducer = clonePlan.getReducer(); + Operator reducer = clonePlan.getReducer(); assert reducer instanceof JoinOperator; JoinOperator cloneJoinOp = (JoinOperator) reducer; @@ -289,7 +290,7 @@ mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes()); MapredLocalWork localPlan = new MapredLocalWork( - new LinkedHashMap>(), + new LinkedHashMap>(), new LinkedHashMap()); Map smallTblDirs = smallKeysDirMap.get(src); @@ -298,7 +299,7 @@ continue; } Byte small_alias = tags[j]; - Operator tblScan_op2 = parentOps[j]; + Operator tblScan_op2 = parentOps[j]; localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2); Path tblDir = new Path(smallTblDirs.get(small_alias)); localPlan.getAliasToFetchWork().put(small_alias.toString(), @@ -312,9 +313,9 @@ 
.getAndMakeChild(mapJoinDescriptor, (RowSchema) null, parentOps); // change the children of the original join operator to point to the map // join operator - List> childOps = cloneJoinOp + List> childOps = cloneJoinOp .getChildOperators(); - for (Operator childOp : childOps) { + for (Operator childOp : childOps) { childOp.replaceParent(cloneJoinOp, mapJoinOp); } mapJoinOp.setChildOperators(childOps); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/LocalMapJoinProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/LocalMapJoinProcFactory.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/LocalMapJoinProcFactory.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.physical; -import java.io.Serializable; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -49,6 +48,7 @@ import org.apache.hadoop.hive.ql.plan.HashTableSinkDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Node processor factory for map join resolver. What it did is to replace the @@ -136,16 +136,18 @@ int bigTableAlias = (int) order[bigTable]; // the parent ops for hashTableSinkOp - List> smallTablesParentOp = new ArrayList>(); - List> dummyOperators = new ArrayList>(); + List> smallTablesParentOp = + new ArrayList>(); + List> dummyOperators = + new ArrayList>(); // get all parents - List> parentsOp = mapJoinOp.getParentOperators(); + List> parentsOp = mapJoinOp.getParentOperators(); for (int i = 0; i < parentsOp.size(); i++) { if (i == bigTableAlias) { smallTablesParentOp.add(null); continue; } - Operator parent = parentsOp.get(i); + Operator parent = parentsOp.get(i); // let hashtable Op be the child of this parent parent.replaceChild(mapJoinOp, hashTableSinkOp); // keep the parent id correct @@ -171,22 +173,23 @@ dummyOp.getConf().setTbl(tbl); // let the dummy op be the parent of mapjoin op mapJoinOp.replaceParent(parent, dummyOp); - List> dummyChildren = new ArrayList>(); + List> dummyChildren = + new ArrayList>(); dummyChildren.add(mapJoinOp); dummyOp.setChildOperators(dummyChildren); // add this dummy op to the dummp operator list dummyOperators.add(dummyOp); } hashTableSinkOp.setParentOperators(smallTablesParentOp); - for (Operator op : dummyOperators) { + for (Operator op : dummyOperators) { context.addDummyParentOp(op); } return null; } - public void hasGroupBy(Operator mapJoinOp, + public void hasGroupBy(Operator mapJoinOp, LocalMapJoinProcCtx localMapJoinProcCtx) throws Exception { - List> childOps = mapJoinOp.getChildOperators(); + List> childOps = mapJoinOp.getChildOperators(); Map opRules = new LinkedHashMap(); opRules.put(new RuleRegExp("R1", "GBY%"), LocalMapJoinProcFactory.getGroupByProc()); // The dispatcher fires the processor corresponding to the closest Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MetadataOnlyOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MetadataOnlyOptimizer.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MetadataOnlyOptimizer.java (working copy) @@ -21,11 +21,11 @@ import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; +import java.util.Iterator; import java.util.LinkedHashMap; import 
java.util.List; import java.util.Map; import java.util.Stack; -import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -35,6 +35,7 @@ import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; @@ -49,7 +50,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; -import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.NullStructSerDe; /** @@ -179,7 +180,7 @@ */ class MetadataOnlyTaskDispatcher implements Dispatcher { - private PhysicalContext physicalContext; + private final PhysicalContext physicalContext; public MetadataOnlyTaskDispatcher(PhysicalContext context) { super(); @@ -189,7 +190,8 @@ private String getAliasForTableScanOperator(MapredWork work, TableScanOperator tso) { - for (Map.Entry> entry : work.getAliasToWork().entrySet()) { + for (Map.Entry> entry : + work.getAliasToWork().entrySet()) { if (entry.getValue() == tso) { return entry.getKey(); } @@ -250,7 +252,7 @@ throws SemanticException { Task task = (Task) nd; - Collection> topOperators + Collection> topOperators = task.getTopOperators(); if (topOperators.size() == 0) { return null; @@ -273,7 +275,7 @@ // Create a list of topOp nodes ArrayList topNodes = new ArrayList(); // Get the top Nodes for this map-reduce task - for (Operator + for (Operator workOperator : topOperators) { if (parseContext.getTopOps().values().contains(workOperator)) { topNodes.add(workOperator); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; @@ -58,6 +57,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; //try to replace a bucket map join with a sorted merge map join public class SortedMergeBucketMapJoinOptimizer implements Transform { @@ -234,7 +234,7 @@ List sortColumnsFirstTable) throws SemanticException { - Map> topOps = this.pGraphContext + Map> topOps = this.pGraphContext .getTopOps(); Map topToTable = this.pGraphContext .getTopToTable(); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkDeDuplication.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkDeDuplication.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkDeDuplication.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import 
java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -57,6 +56,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * If two reducer sink operators share the same partition/sort columns, we @@ -155,9 +155,10 @@ return null; } - List> childOp = childReduceSink.getChildOperators(); + List> childOp = + childReduceSink.getChildOperators(); if (childOp != null && childOp.size() == 1) { - Operator child = childOp.get(0); + Operator child = childOp.get(0); if (child instanceof GroupByOperator || child instanceof JoinOperator) { ctx.addRejectedReduceSinkOperator(childReduceSink); return null; @@ -165,7 +166,8 @@ } ParseContext pGraphContext = ctx.getPctx(); - HashMap childColumnMapping = getPartitionAndKeyColumnMapping(childReduceSink); + HashMap childColumnMapping = + getPartitionAndKeyColumnMapping(childReduceSink); ReduceSinkOperator parentRS = null; parentRS = findSingleParentReduceSink(childReduceSink, pGraphContext); if (parentRS == null) { @@ -173,7 +175,7 @@ return null; } HashMap parentColumnMapping = getPartitionAndKeyColumnMapping(parentRS); - Operator stopBacktrackFlagOp = null; + Operator stopBacktrackFlagOp = null; if (parentRS.getParentOperators() == null || parentRS.getParentOperators().size() == 0) { stopBacktrackFlagOp = parentRS; @@ -202,10 +204,12 @@ private void replaceReduceSinkWithSelectOperator( ReduceSinkOperator childReduceSink, ParseContext pGraphContext) throws SemanticException { - List> parentOp = childReduceSink.getParentOperators(); - List> childOp = childReduceSink.getChildOperators(); + List> parentOp = + childReduceSink.getParentOperators(); + List> childOp = + childReduceSink.getChildOperators(); - Operator oldParent = childReduceSink; + Operator oldParent = childReduceSink; if (childOp != null && childOp.size() == 1 && ((childOp.get(0)) instanceof ExtractOperator)) { @@ -213,7 +217,7 @@ childOp = childOp.get(0).getChildOperators(); } - Operator input = parentOp.get(0); + Operator input = parentOp.get(0); input.getChildOperators().clear(); RowResolver inputRR = pGraphContext.getOpParseCtx().get(input).getRowResolver(); @@ -247,14 +251,14 @@ // Insert the select operator in between. 
sel.setChildOperators(childOp); - for (Operator ch : childOp) { + for (Operator ch : childOp) { ch.replaceParent(oldParent, sel); } } - private Operator putOpInsertMap( - Operator op, RowResolver rr, ParseContext pGraphContext) { + private Operator putOpInsertMap( + Operator op, RowResolver rr, ParseContext pGraphContext) { OpParseContext ctx = new OpParseContext(rr); pGraphContext.getOpParseCtx().put(op, ctx); return op; @@ -373,8 +377,9 @@ private boolean backTrackColumnNames( HashMap columnMapping, ReduceSinkOperator reduceSink, - Operator stopBacktrackFlagOp, ParseContext pGraphContext) { - Operator startOperator = reduceSink; + Operator stopBacktrackFlagOp, + ParseContext pGraphContext) { + Operator startOperator = reduceSink; while (startOperator != null && startOperator != stopBacktrackFlagOp) { startOperator = startOperator.getParentOperators().get(0); Map colExprMap = startOperator.getColumnExprMap(); @@ -423,7 +428,7 @@ } private ReduceSinkOperator findSingleParentReduceSink(ReduceSinkOperator childReduceSink, ParseContext pGraphContext) { - Operator start = childReduceSink; + Operator start = childReduceSink; while(start != null) { if (start.getParentOperators() == null || start.getParentOperators().size() != 1) { Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink2.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink2.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink2.java (working copy) @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor for the rule - reduce sink followed by reduce sink. @@ -41,7 +42,7 @@ /** * Reduce Scan encountered. 
- * + * * @param nd * the reduce sink operator encountered * @param opProcCtx @@ -52,14 +53,14 @@ ReduceSinkOperator op = (ReduceSinkOperator) nd; GenMRProcContext ctx = (GenMRProcContext) opProcCtx; - Map, GenMapRedCtx> mapCurrCtx = ctx + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0)); Task currTask = mapredCtx.getCurrTask(); - Operator currTopOp = mapredCtx.getCurrTopOp(); + Operator currTopOp = mapredCtx.getCurrTopOp(); String currAliasId = mapredCtx.getCurrAliasId(); - Operator reducer = op.getChildOperators().get(0); - Map, Task> opTaskMap = ctx + Operator reducer = op.getChildOperators().get(0); + Map, Task> opTaskMap = ctx .getOpTaskMap(); Task opMapTask = opTaskMap.get(reducer); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (working copy) @@ -75,6 +75,7 @@ import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.mapred.InputFormat; @@ -213,7 +214,7 @@ } // create a dummy tableScan operator - Operator tsMerge = OperatorFactory.get( + Operator tsMerge = OperatorFactory.get( TableScanDesc.class, inputRS); ArrayList outputColumns = new ArrayList(); @@ -335,7 +336,8 @@ // Create a TableScan operator RowSchema inputRS = fsInput.getSchema(); - Operator tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS); + Operator tsMerge = + OperatorFactory.get(TableScanDesc.class, inputRS); // Create a FileSink operator TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone(); @@ -510,7 +512,7 @@ * @param parentFS the last FileSinkOperator in the parent MapReduce work * @return the MapredWork */ - private MapredWork createMergeTask(HiveConf conf, Operator topOp, + private MapredWork createMergeTask(HiveConf conf, Operator topOp, FileSinkDesc fsDesc) { ArrayList aliases = new ArrayList(); @@ -556,7 +558,7 @@ work.setMapperCannotSpanPartns(true); work.setPathToAliases(pathToAliases); work.setAliasToWork( - new LinkedHashMap>()); + new LinkedHashMap>()); if (hasDynamicPartitions) { work.getPathToPartitionInfo().put(inputDir, new PartitionDesc(tblDesc, null)); @@ -696,11 +698,11 @@ mvTask = findMoveTask(ctx.getMvTask(), fsOp); } - Operator currTopOp = ctx.getCurrTopOp(); + Operator currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); - HashMap, Task> opTaskMap = + HashMap, Task> opTaskMap = ctx.getOpTaskMap(); - List> seenOps = ctx.getSeenOps(); + List> seenOps = ctx.getSeenOps(); List> rootTasks = ctx.getRootTasks(); // Set the move task to be dependent on the current task Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; import java.io.IOException; -import java.io.Serializable; import java.net.URI; import java.util.ArrayList; import 
java.util.Arrays; @@ -66,6 +65,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** *this transformation does bucket map join optimization. @@ -188,7 +188,8 @@ LinkedHashMap>> aliasToPartitionBucketFileNamesMapping = new LinkedHashMap>>(); - Map> topOps = this.pGraphContext.getTopOps(); + Map> topOps = + this.pGraphContext.getTopOps(); Map topToTable = this.pGraphContext.getTopToTable(); // (partition to bucket file names) and (partition to bucket number) for Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SimpleFetchOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SimpleFetchOptimizer.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SimpleFetchOptimizer.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -53,6 +52,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Tries to convert simple fetch query to single fetch task, which fetches rows directly @@ -63,7 +63,7 @@ private final Log LOG = LogFactory.getLog(SimpleFetchOptimizer.class.getName()); public ParseContext transform(ParseContext pctx) throws SemanticException { - Map> topOps = pctx.getTopOps(); + Map> topOps = pctx.getTopOps(); if (pctx.getQB().isSimpleSelectQuery() && topOps.size() == 1) { // no join, no groupby, no distinct, no lateral view, no subq, // no CTAS or insert, not analyze command, and single sourced. @@ -234,8 +234,8 @@ pctx.getSemanticInputs().addAll(inputs); ListSinkOperator sink = new ListSinkOperator(); sink.setConf(new ListSinkDesc(work.getSerializationNullFormat())); - sink.setParentOperators(new ArrayList>()); - Operator parent = fileSink.getParentOperators().get(0); + sink.setParentOperators(new ArrayList>()); + Operator parent = fileSink.getParentOperators().get(0); sink.getParentOperators().add(parent); parent.replaceChild(fileSink, sink); fileSink.setParentOperators(null); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink3.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink3.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRRedSink3.java (working copy) @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Processor for the rule - union followed by reduce sink. 
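The GenMRRedSink processors touched in the surrounding hunks share one decision: take the reduce sink's single child as the prospective reducer, look it up in the reducer-to-task map, and either open a new map-reduce task for it or attach the current branch to the task that already exists. A compilable standalone sketch of that create-or-join choice follows (Node and MrTask are hypothetical stand-ins, not Hive's Operator and Task):

import java.util.HashMap;
import java.util.Map;

class RedSinkDispatchSketch {
  static class Node { Node child; }
  static class MrTask { }

  private final Map<Node, MrTask> reducerToTask = new HashMap<Node, MrTask>();

  MrTask process(Node reduceSink) {
    // the reduce sink's only child is the operator that will run reduce-side
    Node reducer = reduceSink.child;
    MrTask task = reducerToTask.get(reducer);
    if (task == null) {
      // first encounter: open a new map-reduce task for this reducer
      task = new MrTask();
      reducerToTask.put(reducer, task);
    }
    // otherwise: merge this branch into the task already created for the reducer
    return task;
  }
}

The real processors additionally split tasks, set up local work for map joins, and handle the union case; the sketch only captures the shared lookup-then-create-or-join step.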
@@ -56,8 +57,8 @@ // union consisted on a bunch of map-reduce jobs, and it has been split at // the union - Operator reducer = op.getChildOperators().get(0); - Map, GenMapRedCtx> mapCurrCtx = ctx + Operator reducer = op.getChildOperators().get(0); + Map, GenMapRedCtx> mapCurrCtx = ctx .getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(ctx.getCurrUnionOp()); @@ -70,7 +71,7 @@ MapredWork plan = (MapredWork) unionTask.getWork(); - HashMap, Task> opTaskMap = ctx + HashMap, Task> opTaskMap = ctx .getOpTaskMap(); Task reducerTask = opTaskMap.get(reducer); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyProcFactory.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyProcFactory.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.index; -import java.io.Serializable; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -43,6 +42,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Factory of methods used by {@link RewriteGBUsingIndex} @@ -204,8 +204,8 @@ SelectOperator operator = (SelectOperator)nd; canApplyCtx = (RewriteCanApplyCtx)ctx; - List> childrenList = operator.getChildOperators(); - Operator child = childrenList.get(0); + List> childrenList = operator.getChildOperators(); + Operator child = childrenList.get(0); if(child instanceof FileSinkOperator){ Map internalToAlias = new LinkedHashMap(); RowSchema rs = operator.getSchema(); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteGBUsingIndex.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteGBUsingIndex.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteGBUsingIndex.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.index; -import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -48,6 +47,7 @@ import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** @@ -224,7 +224,7 @@ boolean canApply = false; //Context for checking if this optimization can be applied to the input query RewriteCanApplyCtx canApplyCtx = RewriteCanApplyCtx.getInstance(parseContext); - Map> topOps = parseContext.getTopOps(); + Map> topOps = parseContext.getTopOps(); canApplyCtx.setBaseTableName(baseTableName); canApplyCtx.populateRewriteVars(topOp); @@ -403,8 +403,8 @@ */ @SuppressWarnings("unchecked") private void rewriteOriginalQuery() throws SemanticException { - Map> topOpMap = - (HashMap>) parseContext.getTopOps().clone(); + Map> topOpMap = + (HashMap>) parseContext.getTopOps().clone(); Iterator tsOpItr = tsOpToProcess.keySet().iterator(); while(tsOpItr.hasNext()){ @@ -416,7 +416,7 @@ indexTableName, baseTableName, canApplyCtx.getAggFunction()); rewriteQueryCtx.invokeRewriteQueryProc(topOp); parseContext = rewriteQueryCtx.getParseContext(); - parseContext.setOpParseCtx((LinkedHashMap, + 
parseContext.setOpParseCtx((LinkedHashMap, OpParseContext>) rewriteQueryCtx.getOpc()); } LOG.info("Finished Rewriting query"); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.index; -import java.io.Serializable; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -41,6 +40,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * RewriteQueryUsingAggregateIndexCtx class stores the @@ -67,8 +67,8 @@ } - private Map, OpParseContext> opc = - new LinkedHashMap, OpParseContext>(); + private Map, OpParseContext> opc = + new LinkedHashMap, OpParseContext>(); private final Hive hiveDb; private final ParseContext parseContext; //We need the GenericUDAFEvaluator for GenericUDAF function "sum" @@ -78,7 +78,7 @@ private final String aggregateFunction; private ExprNodeColumnDesc aggrExprNode = null; - public Map, OpParseContext> getOpc() { + public Map, OpParseContext> getOpc() { return opc; } @@ -119,7 +119,7 @@ * @throws SemanticException */ public void invokeRewriteQueryProc( - Operator topOp) throws SemanticException{ + Operator topOp) throws SemanticException{ Map opRules = new LinkedHashMap(); // replace scan operator containing original table with index table Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndex.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndex.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndex.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.index; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -53,12 +52,12 @@ import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; @@ -80,8 +79,8 @@ Object... 
nodeOutputs) throws SemanticException { SelectOperator operator = (SelectOperator)nd; rewriteQueryCtx = (RewriteQueryUsingAggregateIndexCtx)ctx; - List> childOps = operator.getChildOperators(); - Operator childOp = childOps.iterator().next(); + List> childOps = operator.getChildOperators(); + Operator childOp = childOps.iterator().next(); //we need to set the colList, outputColumnNames, colExprMap, // rowSchema for only that SelectOperator which precedes the GroupByOperator @@ -136,9 +135,9 @@ // and add new ones Map topToTable = rewriteQueryCtx.getParseContext().getTopToTable(); - Map> topOps = + Map> topOps = rewriteQueryCtx.getParseContext().getTopOps(); - Map, OpParseContext> opParseContext = + Map, OpParseContext> opParseContext = rewriteQueryCtx.getParseContext().getOpParseCtx(); //need this to set rowResolver for new scanOperator @@ -202,11 +201,11 @@ topOps.put(tabNameWithAlias, scanOperator); opParseContext.put(scanOperator, operatorContext); rewriteQueryCtx.getParseContext().setTopToTable( - (HashMap) topToTable); + (HashMap) topToTable); rewriteQueryCtx.getParseContext().setTopOps( - (HashMap>) topOps); + (HashMap>) topOps); rewriteQueryCtx.getParseContext().setOpParseCtx( - (LinkedHashMap, OpParseContext>) opParseContext); + (LinkedHashMap, OpParseContext>) opParseContext); return null; } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPruner.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPruner.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPruner.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; @@ -39,6 +38,7 @@ import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Implementation of one of the rule-based optimization steps. ColumnPruner gets @@ -50,7 +50,7 @@ */ public class ColumnPruner implements Transform { protected ParseContext pGraphContext; - private HashMap, OpParseContext> opToParseCtxMap; + private HashMap, OpParseContext> opToParseCtxMap; /** * empty constructor. Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcCtx.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcCtx.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcCtx.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -34,21 +33,22 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * This class implements the processor context for Column Pruner. 
*/ public class ColumnPrunerProcCtx implements NodeProcessorCtx { - private final Map, List> prunedColLists; + private final Map, List> prunedColLists; - private final HashMap, OpParseContext> opToParseCtxMap; + private final HashMap, OpParseContext> opToParseCtxMap; private final Map>> joinPrunedColLists; public ColumnPrunerProcCtx( - HashMap, OpParseContext> opToParseContextMap) { - prunedColLists = new HashMap, List>(); + HashMap, OpParseContext> opToParseContextMap) { + prunedColLists = new HashMap, List>(); opToParseCtxMap = opToParseContextMap; joinPrunedColLists = new HashMap>>(); } @@ -60,15 +60,15 @@ /** * @return the prunedColLists */ - public List getPrunedColList(Operator op) { + public List getPrunedColList(Operator op) { return prunedColLists.get(op); } - public HashMap, OpParseContext> getOpToParseCtxMap() { + public HashMap, OpParseContext> getOpToParseCtxMap() { return opToParseCtxMap; } - public Map, List> getPrunedColLists() { + public Map, List> getPrunedColLists() { return prunedColLists; } @@ -77,17 +77,17 @@ * RowResolver and are different from the external column names) that are * needed in the subtree. These columns eventually have to be selected from * the table scan. - * + * * @param curOp * The root of the operator subtree. * @return List of the internal column names. * @throws SemanticException */ - public List genColLists(Operator curOp) + public List genColLists(Operator curOp) throws SemanticException { List colList = new ArrayList(); if (curOp.getChildOperators() != null) { - for (Operator child : curOp.getChildOperators()) { + for (Operator child : curOp.getChildOperators()) { if (child instanceof CommonJoinOperator) { int tag = child.getParentOperators().indexOf(curOp); List prunList = joinPrunedColLists.get(child).get((byte) tag); @@ -105,7 +105,7 @@ * Creates the list of internal column names from select expressions in a * select operator. This function is used for the select operator instead of * the genColLists function (which is used by the rest of the operators). - * + * * @param op * The select operator. * @return List of the internal column names. @@ -122,7 +122,7 @@ /** * Creates the list of internal column names for select * expressions. - * + * * @param op * The select operator. 
* @param colList Index: ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java (working copy) @@ -543,7 +543,7 @@ tTable.getSd().getSkewedInfo().setSkewedColNames(skewedColNames); } - public List getSkewedColName() { + public List getSkewedColNames() { return tTable.getSd().getSkewedInfo().getSkewedColNames(); } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java (working copy) @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeStats; @@ -58,7 +59,7 @@ * different from regular operators in that it starts off by processing a * Writable data structure from a Table (instead of a Hive Object). **/ -public class MapOperator extends Operator implements Serializable { +public class MapOperator extends Operator implements Serializable, Cloneable { private static final long serialVersionUID = 1L; @@ -83,17 +84,17 @@ private Map opCtxMap; private final Set listInputPaths = new HashSet(); - private Map, java.util.ArrayList> operatorToPaths; + private Map, ArrayList> operatorToPaths; - private final Map, MapOpCtx> childrenOpToOpCtxMap = - new HashMap, MapOpCtx>(); + private final Map, MapOpCtx> childrenOpToOpCtxMap = + new HashMap, MapOpCtx>(); - private ArrayList> extraChildrenToClose = null; + private ArrayList> extraChildrenToClose = null; private static class MapInputPath { String path; String alias; - Operator op; + Operator op; /** * @param path @@ -101,7 +102,7 @@ * @param op */ public MapInputPath(String path, String alias, - Operator op) { + Operator op) { this.path = path; this.alias = alias; this.op = op; @@ -129,11 +130,11 @@ return ret; } - public Operator getOp() { + public Operator getOp() { return op; } - public void setOp(Operator op) { + public void setOp(Operator op) { this.op = op; } @@ -304,7 +305,7 @@ * need to be changed if the input changes **/ private void setInspectorInput(MapInputPath inp) { - Operator op = inp.getOp(); + Operator op = inp.getOp(); deserializer = opCtxMap.get(inp).getDeserializer(); isPartitioned = opCtxMap.get(inp).isPartitioned(); @@ -367,9 +368,10 @@ Path fpath = new Path((new Path(HiveConf.getVar(hconf, HiveConf.ConfVars.HADOOPMAPFILENAME))).toUri().getPath()); - ArrayList> children = new ArrayList>(); + ArrayList> children = + new ArrayList>(); opCtxMap = new HashMap(); - operatorToPaths = new HashMap, java.util.ArrayList>(); + operatorToPaths = new HashMap, ArrayList>(); statsMap.put(Counter.DESERIALIZE_ERRORS, deserialize_error_count); @@ -380,17 +382,17 @@ List aliases = conf.getPathToAliases().get(onefile); for (String onealias : aliases) { - Operator op = conf.getAliasToWork().get( + Operator op = conf.getAliasToWork().get( onealias); LOG.info("Adding alias " + onealias + " to work list for file " + onefile); MapInputPath inp = new MapInputPath(onefile, onealias, op); 
opCtxMap.put(inp, opCtx); if (operatorToPaths.get(op) == null) { - operatorToPaths.put(op, new java.util.ArrayList()); + operatorToPaths.put(op, new ArrayList()); } operatorToPaths.get(op).add(onefile); - op.setParentOperators(new ArrayList>()); + op.setParentOperators(new ArrayList>()); op.getParentOperators().add(this); // check for the operators who will process rows coming to this Map // Operator @@ -423,11 +425,11 @@ public void initializeOp(Configuration hconf) throws HiveException { // set that parent initialization is done and call initialize on children state = State.INIT; - List> children = getChildOperators(); + List> children = getChildOperators(); - for (Entry, MapOpCtx> entry : childrenOpToOpCtxMap + for (Entry, MapOpCtx> entry : childrenOpToOpCtxMap .entrySet()) { - Operator child = entry.getKey(); + Operator child = entry.getKey(); MapOpCtx mapOpCtx = entry.getValue(); // Add alias, table name, and partitions to hadoop conf so that their // children will @@ -448,12 +450,12 @@ HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, entry .getValue().partName); MapInputPath input = entry.getKey(); - Operator op = input.op; + Operator op = input.op; // op is not in the children list, so need to remember it and close it // afterwards if (children.indexOf(op) == -1) { if (extraChildrenToClose == null) { - extraChildrenToClose = new ArrayList>(); + extraChildrenToClose = new ArrayList>(); } extraChildrenToClose.add(op); op.initialize(hconf, new ObjectInspector[] {entry.getValue().getRowObjectInspector()}); @@ -467,7 +469,7 @@ @Override public void closeOp(boolean abort) throws HiveException { if (extraChildrenToClose != null) { - for (Operator op : extraChildrenToClose) { + for (Operator op : extraChildrenToClose) { op.close(abort); } } @@ -486,7 +488,7 @@ // Operator if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) { String onealias = conf.getPathToAliases().get(onefile).get(0); - Operator op = + Operator op = conf.getAliasToWork().get(onealias); LOG.info("Processing alias " + onealias + " for file " + onefile); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (working copy) @@ -40,6 +40,7 @@ import org.apache.hadoop.hive.ql.plan.api.StageType; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.util.StringUtils; /** @@ -350,18 +351,18 @@ return false; } - public Collection> getTopOperators() { - return new LinkedList>(); + public Collection> getTopOperators() { + return new LinkedList>(); } - + public boolean hasReduce() { return false; } - public Operator getReducer() { + public Operator getReducer() { return null; } - + public HashMap getCounters() { return taskCounters; } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecMapper.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecMapper.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecMapper.java (working copy) @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.exec; import java.io.IOException; -import java.io.Serializable; import java.lang.management.ManagementFactory; import java.lang.management.MemoryMXBean; import 
java.net.URLClassLoader; @@ -31,6 +30,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; @@ -104,8 +104,8 @@ //The following code is for mapjoin //initialize all the dummy ops l4j.info("Initializing dummy operator"); - List> dummyOps = localWork.getDummyParentOp(); - for(Operator dummyOp : dummyOps){ + List> dummyOps = localWork.getDummyParentOp(); + for(Operator dummyOp : dummyOps){ dummyOp.setExecContext(execContext); dummyOp.initialize(jc,null); } @@ -194,9 +194,9 @@ //for close the local work if(localWork != null){ - List> dummyOps = localWork.getDummyParentOp(); + List> dummyOps = localWork.getDummyParentOp(); - for(Operator dummyOp : dummyOps){ + for(Operator dummyOp : dummyOps){ dummyOp.close(abort); } } @@ -204,7 +204,7 @@ if (fetchOperators != null) { MapredLocalWork localWork = mo.getConf().getMapLocalWork(); for (Map.Entry entry : fetchOperators.entrySet()) { - Operator forwardOp = localWork + Operator forwardOp = localWork .getAliasToWork().get(entry.getKey()); forwardOp.close(abort); } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.exec; -import java.io.Serializable; import java.util.ArrayList; import java.util.List; @@ -42,6 +41,7 @@ import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.UDTFDesc; import org.apache.hadoop.hive.ql.plan.UnionDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * OperatorFactory. @@ -54,7 +54,7 @@ * * @param */ - public static final class OpTuple { + public static final class OpTuple { public Class descClass; public Class> opClass; @@ -93,7 +93,7 @@ HashTableSinkOperator.class)); } - public static Operator get(Class opClass) { + public static Operator get(Class opClass) { for (OpTuple o : opvec) { if (o.descClass == opClass) { @@ -111,7 +111,7 @@ + opClass.getName()); } - public static Operator get(Class opClass, + public static Operator get(Class opClass, RowSchema rwsch) { Operator ret = get(opClass); @@ -122,36 +122,46 @@ /** * Returns an operator given the conf and a list of children operators. */ - public static Operator get(T conf, - Operator... oplist) { + public static Operator get(T conf, + Operator... oplist) { Operator ret = get((Class) conf.getClass()); ret.setConf(conf); + makeChild(ret, oplist); + return (ret); + } + + /** + * Returns an operator given the conf and a list of children operators. + */ + public static void makeChild( + Operator ret, + Operator... 
oplist) { if (oplist.length == 0) { - return (ret); + return; } - ArrayList> clist = new ArrayList>(); - for (Operator op : oplist) { + ArrayList> clist = + new ArrayList>(); + for (Operator op : oplist) { clist.add(op); } ret.setChildOperators(clist); // Add this parent to the children - for (Operator op : oplist) { - List> parents = op.getParentOperators(); + for (Operator op : oplist) { + List> parents = op.getParentOperators(); if (parents == null) { - parents = new ArrayList>(); + parents = new ArrayList>(); } parents.add(ret); op.setParentOperators(parents); } - return (ret); } /** * Returns an operator given the conf and a list of children operators. */ - public static Operator get(T conf, + public static Operator get(T conf, RowSchema rwsch, Operator... oplist) { Operator ret = get(conf, oplist); ret.setSchema(rwsch); @@ -161,7 +171,7 @@ /** * Returns an operator given the conf and a list of parent operators. */ - public static Operator getAndMakeChild(T conf, + public static Operator getAndMakeChild(T conf, Operator... oplist) { Operator ret = get((Class) conf.getClass()); ret.setConf(conf); @@ -180,7 +190,8 @@ } // add parents for the newly created operator - List> parent = new ArrayList>(); + List> parent = + new ArrayList>(); for (Operator op : oplist) { parent.add(op); } @@ -193,8 +204,8 @@ /** * Returns an operator given the conf and a list of parent operators. */ - public static Operator getAndMakeChild(T conf, - List> oplist) { + public static Operator getAndMakeChild(T conf, + List> oplist) { Operator ret = get((Class) conf.getClass()); ret.setConf(conf); if (oplist.size() == 0) { @@ -212,7 +223,8 @@ } // add parents for the newly created operator - List> parent = new ArrayList>(); + List> parent = + new ArrayList>(); for (Operator op : oplist) { parent.add(op); } @@ -225,7 +237,7 @@ /** * Returns an operator given the conf and a list of parent operators. */ - public static Operator getAndMakeChild(T conf, + public static Operator getAndMakeChild(T conf, RowSchema rwsch, Operator... oplist) { Operator ret = getAndMakeChild(conf, oplist); ret.setSchema(rwsch); @@ -235,8 +247,8 @@ /** * Returns an operator given the conf and a list of parent operators. 
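NOTE: the OperatorFactory hunk above factors the child-wiring logic out of get(conf, children...) into a new public static makeChild(parent, children...), so other code (presumably the skew join rewrite) can re-wire an existing operator without going through the descriptor-based factory method. A standalone sketch of the wiring it performs, with a simplified Node type standing in for Operator:

    import java.util.ArrayList;
    import java.util.List;

    class Node {
      List<Node> children;
      List<Node> parents;
    }

    class WiringSketch {
      // Mirrors OperatorFactory.makeChild: attach the children to ret and
      // register ret as a parent of each child.
      static void makeChild(Node ret, Node... children) {
        if (children.length == 0) {
          return;
        }
        List<Node> clist = new ArrayList<Node>();
        for (Node child : children) {
          clist.add(child);
        }
        ret.children = clist;
        for (Node child : children) {
          if (child.parents == null) {
            child.parents = new ArrayList<Node>();
          }
          child.parents.add(ret);
        }
      }
    }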
*/ - public static Operator getAndMakeChild(T conf, - RowSchema rwsch, List> oplist) { + public static Operator getAndMakeChild(T conf, + RowSchema rwsch, List> oplist) { Operator ret = getAndMakeChild(conf, oplist); ret.setSchema(rwsch); return (ret); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecDriver.java (working copy) @@ -50,9 +50,9 @@ import org.apache.hadoop.hive.common.LogUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; -import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.DriverContext; +import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.QueryPlan; import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; import org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; @@ -73,6 +73,7 @@ import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; import org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.ql.stats.StatsPublisher; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; @@ -178,7 +179,7 @@ * @return true if fatal errors happened during job execution, false otherwise. */ public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) { - for (Operator op : work.getAliasToWork().values()) { + for (Operator op : work.getAliasToWork().values()) { if (op.checkFatalErrors(ctrs, errMsg)) { return true; } @@ -195,7 +196,8 @@ // fix up outputs Map> pa = work.getPathToAliases(); if (pa != null) { - ArrayList> opList = new ArrayList>(); + ArrayList> opList = + new ArrayList>(); if (work.getReducer() != null) { opList.add(work.getReducer()); @@ -206,7 +208,7 @@ opList.add(work.getAliasToWork().get(a)); while (!opList.isEmpty()) { - Operator op = opList.remove(0); + Operator op = opList.remove(0); if (op instanceof FileSinkOperator) { FileSinkDesc fdesc = ((FileSinkOperator) op).getConf(); @@ -489,7 +491,7 @@ if (rj != null) { JobCloseFeedBack feedBack = new JobCloseFeedBack(); if (work.getAliasToWork() != null) { - for (Operator op : work.getAliasToWork().values()) { + for (Operator op : work.getAliasToWork().values()) { op.jobClose(job, success, feedBack); } } @@ -743,7 +745,7 @@ } @Override - public Collection> getTopOperators() { + public Collection> getTopOperators() { return getWork().getAliasToWork().values(); } @@ -947,11 +949,12 @@ if (pa != null) { for (List ls : pa.values()) { for (String a : ls) { - ArrayList> opList = new ArrayList>(); + ArrayList> opList = + new ArrayList>(); opList.add(work.getAliasToWork().get(a)); while (!opList.isEmpty()) { - Operator op = opList.remove(0); + Operator op = opList.remove(0); if (op instanceof FileSinkOperator) { FileSinkDesc fdesc = ((FileSinkOperator) op).getConf(); @@ -973,7 +976,7 @@ @Override public void updateCounters(Counters ctrs, RunningJob rj) throws IOException { - for (Operator op : work.getAliasToWork().values()) { + for (Operator op : work.getAliasToWork().values()) { op.updateCounters(ctrs); } if (work.getReducer() != null) { Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java =================================================================== --- 
ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy) @@ -156,4 +156,9 @@ public OperatorType getType() { return OperatorType.FILTER; } + + @Override + public boolean supportSkewJoinOptimization() { + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (working copy) @@ -118,8 +118,8 @@ import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; +import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils.ExpressionTypes; -import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.ql.stats.StatsPublisher; @@ -135,8 +135,8 @@ import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.SequenceFile.CompressionType; -import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.DefaultCodec; import org.apache.hadoop.mapred.FileOutputFormat; Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TerminalOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/TerminalOperator.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TerminalOperator.java (working copy) @@ -20,10 +20,12 @@ import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; + /** * Terminal Operator Base Class. 
**/ -public abstract class TerminalOperator extends +public abstract class TerminalOperator extends Operator implements Serializable { private static final long serialVersionUID = 1L; Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy) @@ -96,4 +96,9 @@ public OperatorType getType() { return OperatorType.SELECT; } + + @Override + public boolean supportSkewJoinOptimization() { + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/MapredLocalTask.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapredLocalTask.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapredLocalTask.java (working copy) @@ -55,6 +55,7 @@ import org.apache.hadoop.hive.ql.plan.api.StageType; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -447,7 +448,7 @@ } @Override - public Collection> getTopOperators() { + public Collection> getTopOperators() { return getWork().getAliasToWork().values(); } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (working copy) @@ -274,4 +274,9 @@ } } } + + @Override + public boolean supportSkewJoinOptimization() { + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy) @@ -35,6 +35,7 @@ import org.apache.hadoop.hive.ql.plan.Explain; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; @@ -46,15 +47,15 @@ /** * Base operator implementation. **/ -public abstract class Operator implements Serializable, - Node { +public abstract class Operator implements Serializable,Cloneable, + Node { // Bean methods private static final long serialVersionUID = 1L; - protected List> childOperators; - protected List> parentOperators; + protected List> childOperators; + protected List> parentOperators; protected String operatorId; /** * List of counter names associated with the operator. 
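NOTE: this patch introduces a supportSkewJoinOptimization() capability hook. The FilterOperator, SelectOperator and TableScanOperator hunks above override it to return true; the Operator.java hunk below adds the false default. The skew join optimizer itself is not in these hunks, but a flag like this is typically consulted while walking the operator tree and bailing out on anything unsupported. A standalone sketch of that pattern, under that assumption:

    import java.util.List;

    abstract class Op {
      List<Op> parents;

      // Default: an operator does not participate in the skew join rewrite.
      boolean supportSkewJoinOptimization() {
        return false;
      }
    }

    class TableScanOp extends Op {
      @Override
      boolean supportSkewJoinOptimization() {
        return true;
      }
    }

    class SkewJoinCheck {
      // The rewrite is only legal if every operator on the path up from the
      // join supports it (simplified; not the real optimizer rule).
      static boolean subtreeSupported(Op op) {
        if (!op.supportSkewJoinOptimization()) {
          return false;
        }
        if (op.parents == null) {
          return true;
        }
        for (Op parent : op.parents) {
          if (!subtreeSupported(parent)) {
            return false;
          }
        }
        return true;
      }
    }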
It contains the @@ -122,11 +123,11 @@ } public void setChildOperators( - List> childOperators) { + List> childOperators) { this.childOperators = childOperators; } - public List> getChildOperators() { + public List> getChildOperators() { return childOperators; } @@ -140,7 +141,7 @@ } ArrayList ret_vec = new ArrayList(); - for (Operator op : getChildOperators()) { + for (Operator op : getChildOperators()) { ret_vec.add(op); } @@ -148,11 +149,11 @@ } public void setParentOperators( - List> parentOperators) { + List> parentOperators) { this.parentOperators = parentOperators; } - public List> getParentOperators() { + public List> getParentOperators() { return parentOperators; } @@ -231,7 +232,7 @@ return; } - for (Operator op : childOperators) { + for (Operator op : childOperators) { op.setReporter(rep); } } @@ -244,7 +245,7 @@ return; } - for (Operator op : childOperators) { + for (Operator op : childOperators) { op.setOutputCollector(out); } } @@ -259,7 +260,7 @@ return; } - for (Operator op : childOperators) { + for (Operator op : childOperators) { op.setAlias(alias); } } @@ -282,7 +283,7 @@ if (parentOperators == null) { return true; } - for (Operator parent : parentOperators) { + for (Operator parent : parentOperators) { if (parent == null) { //return true; continue; @@ -331,7 +332,7 @@ } childOperatorsTag = new int[childOperatorsArray.length]; for (int i = 0; i < childOperatorsArray.length; i++) { - List> parentOperators = childOperatorsArray[i] + List> parentOperators = childOperatorsArray[i] .getParentOperators(); if (parentOperators == null) { throw new HiveException("Hive internal error: parent is null in " @@ -361,7 +362,7 @@ public void initializeLocalWork(Configuration hconf) throws HiveException { if (childOperators != null) { for (int i =0; i childOp = this.childOperators.get(i); + Operator childOp = this.childOperators.get(i); childOp.initializeLocalWork(hconf); } } @@ -664,8 +665,8 @@ childOperators.addAll(childIndex, child.getChildOperators()); } - for (Operator gc : child.getChildOperators()) { - List> parents = gc.getParentOperators(); + for (Operator gc : child.getChildOperators()) { + List> parents = gc.getParentOperators(); int index = parents.indexOf(child); if (index == -1) { throw new SemanticException( @@ -675,7 +676,7 @@ } } - public void removeParent(Operator parent) { + public void removeParent(Operator parent) { int parentIndex = parentOperators.indexOf(parent); assert parentIndex != -1; if (parentOperators.size() == 1) { @@ -702,8 +703,8 @@ * @param newParent * the new parent */ - public void replaceParent(Operator parent, - Operator newParent) { + public void replaceParent(Operator parent, + Operator newParent) { int parentIndex = parentOperators.indexOf(parent); assert parentIndex != -1; parentOperators.set(parentIndex, newParent); @@ -755,7 +756,7 @@ int childrenDone = 0; for (int i = 0; i < childOperatorsArray.length; i++) { - Operator o = childOperatorsArray[i]; + Operator o = childOperatorsArray[i]; if (o.getDone()) { childrenDone++; } else { @@ -778,7 +779,7 @@ public void reset(){ this.state=State.INIT; if (childOperators != null) { - for (Operator o : childOperators) { + for (Operator o : childOperators) { o.reset(); } } @@ -790,13 +791,13 @@ * */ public static interface OperatorFunc { - void func(Operator op); + void func(Operator op); } public void preorderMap(OperatorFunc opFunc) { opFunc.func(this); if (childOperators != null) { - for (Operator o : childOperators) { + for (Operator o : childOperators) { o.preorderMap(opFunc); } } @@ -863,7 +864,7 @@ 
if (childOperators != null) { s.append(ls); s.append(" "); - for (Operator o : childOperators) { + for (Operator o : childOperators) { s.append(o.dump(level + 2, seenOpts)); } s.append(ls); @@ -873,7 +874,7 @@ if (parentOperators != null) { s.append(ls); s.append(" "); - for (Operator o : parentOperators) { + for (Operator o : parentOperators) { s.append("Id = " + o.id + " "); s.append(o.dump(level, seenOpts)); } @@ -1154,7 +1155,7 @@ // but, some operators may be updated more than once and that's ok if (getChildren() != null) { for (Node op : getChildren()) { - ((Operator) op).updateCounters(ctrs); + ((Operator) op).updateCounters(ctrs); } } } @@ -1189,7 +1190,7 @@ if (getChildren() != null) { for (Node op : getChildren()) { - if (((Operator) op).checkFatalErrors(ctrs, + if (((Operator) op).checkFatalErrors(ctrs, errMsg)) { return true; } @@ -1309,7 +1310,7 @@ this.execContext = execContext; if(this.childOperators != null) { for (int i = 0; i op = this.childOperators.get(i); + Operator op = this.childOperators.get(i); op.setExecContext(execContext); } } @@ -1321,7 +1322,7 @@ this.cleanUpInputFileChangedOp(); if(this.childOperators != null) { for (int i = 0; i op = this.childOperators.get(i); + Operator op = this.childOperators.get(i); op.cleanUpInputFileChanged(); } } @@ -1332,4 +1333,29 @@ public void cleanUpInputFileChangedOp() throws HiveException { } + public boolean supportSkewJoinOptimization() { + return false; + } + + @Override + public Operator clone() + throws CloneNotSupportedException { + + List> parents = getParentOperators(); + List> parentClones = + new ArrayList>(); + + if (parents != null) { + for (Operator parent : parents) { + parentClones.add((Operator)(parent.clone())); + } + } + + T descClone = (T)conf.clone(); + Operator ret = + (Operator) OperatorFactory.getAndMakeChild( + descClone, getSchema(), parentClones); + + return ret; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/util/AbstractSerializableCloneable.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/util/AbstractSerializableCloneable.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/util/AbstractSerializableCloneable.java (working copy) @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
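NOTE: the new Operator.clone() above copies an operator together with its ancestor chain: it clones each parent recursively, clones the descriptor, and rebuilds the node through OperatorFactory.getAndMakeChild so the parent/child links point at the copies. A standalone sketch of that shape (simplified types, no RowSchema handling):

    import java.util.ArrayList;
    import java.util.List;

    class CloneSketch {

      interface Desc {
        Desc copyDesc();   // stands in for conf.clone()
      }

      static class Op {
        Desc conf;
        List<Op> parents;

        Op(Desc conf, List<Op> parents) {
          this.conf = conf;
          this.parents = parents;
        }

        // Shape of the new Operator.clone(): clone every parent first, clone
        // the descriptor, then rebuild this node on top of the cloned parents
        // (the real code delegates the rebuild to OperatorFactory.getAndMakeChild).
        Op copyOp() {
          List<Op> parentClones = new ArrayList<Op>();
          if (parents != null) {
            for (Op parent : parents) {
              parentClones.add(parent.copyOp());
            }
          }
          return new Op(conf.copyDesc(), parentClones);
        }
      }
    }

Worth noting for review: because each parent clones its own ancestry independently and nothing is memoized, an ancestor shared by two parents ends up duplicated in the copy rather than shared.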
+ */
+
+package org.apache.hadoop.hive.ql.util;
+
+import java.io.Serializable;
+
+public class AbstractSerializableCloneable implements SerializableCloneable {
+  public Object clone() throws CloneNotSupportedException {
+    throw new CloneNotSupportedException("clone not supported");
+  }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/util/ObjectPair.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/util/ObjectPair.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/util/ObjectPair.java (working copy)
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.util;
+
+public class ObjectPair<F, S> {
+  private F first;
+  private S second;
+
+  public ObjectPair() {}
+
+  public ObjectPair(F first, S second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  public F getFirst() {
+    return first;
+  }
+
+  public void setFirst(F first) {
+    this.first = first;
+  }
+
+  public S getSecond() {
+    return second;
+  }
+
+  public void setSecond(S second) {
+    this.second = second;
+  }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/util/SerializableCloneable.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/util/SerializableCloneable.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/util/SerializableCloneable.java (working copy)
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
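NOTE: ObjectPair above is a plain generic two-field holder with no behaviour of its own. Minimal usage:

    import org.apache.hadoop.hive.ql.util.ObjectPair;

    public class ObjectPairExample {
      public static void main(String[] args) {
        ObjectPair<String, Integer> aliasAndTag =
            new ObjectPair<String, Integer>("a", Integer.valueOf(0));
        System.out.println(aliasAndTag.getFirst());   // a
        aliasAndTag.setSecond(Integer.valueOf(1));
        System.out.println(aliasAndTag.getSecond());  // 1
      }
    }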
+ */ + +package org.apache.hadoop.hive.ql.util; + +import java.io.Serializable; + +public interface SerializableCloneable extends Serializable, Cloneable { + public Object clone() throws CloneNotSupportedException; +} Index: ql/src/java/org/apache/hadoop/hive/ql/ppd/OpWalkerInfo.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpWalkerInfo.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpWalkerInfo.java (working copy) @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hive.ql.ppd; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -30,6 +29,7 @@ import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Context class for operator walker of predicate pushdown. @@ -39,23 +39,24 @@ * Operator to Pushdown Predicates Map. This keeps track of the final pushdown * predicates for each operator as you walk the Op Graph from child to parent */ - private final HashMap, ExprWalkerInfo> opToPushdownPredMap; - private final Map, OpParseContext> opToParseCtxMap; + private final HashMap, ExprWalkerInfo> + opToPushdownPredMap; + private final Map, OpParseContext> opToParseCtxMap; private final ParseContext pGraphContext; private final List candidateFilterOps; public OpWalkerInfo(ParseContext pGraphContext) { this.pGraphContext = pGraphContext; opToParseCtxMap = pGraphContext.getOpParseCtx(); - opToPushdownPredMap = new HashMap, ExprWalkerInfo>(); + opToPushdownPredMap = new HashMap, ExprWalkerInfo>(); candidateFilterOps = new ArrayList(); } - public ExprWalkerInfo getPrunedPreds(Operator op) { + public ExprWalkerInfo getPrunedPreds(Operator op) { return opToPushdownPredMap.get(op); } - public ExprWalkerInfo putPrunedPreds(Operator op, + public ExprWalkerInfo putPrunedPreds(Operator op, ExprWalkerInfo value) { return opToPushdownPredMap.put(op, value); } @@ -64,7 +65,7 @@ return opToParseCtxMap.get(op).getRowResolver(); } - public OpParseContext put(Operator key, + public OpParseContext put(Operator key, OpParseContext value) { return opToParseCtxMap.put(key, value); } Index: ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (working copy) @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hive.ql.ppd; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -59,6 +58,7 @@ import org.apache.hadoop.hive.ql.plan.JoinCondDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.mapred.JobConf; @@ -132,10 +132,10 @@ // SELECT(*) because that's the way that the DAG was constructed. We // only want to get the predicates from the SELECT(*). 
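NOTE: SerializableCloneable above just widens Cloneable with a public clone(), and AbstractSerializableCloneable gives plan descriptors a default clone() that refuses to clone. A descriptor only becomes usable by the new Operator.clone() once it overrides clone() itself, as ForwardDesc, FilterDesc, ReduceSinkDesc, SelectDesc and JoinDesc do later in this patch. A sketch of the minimal work a descriptor has to do, using a hypothetical ExampleDesc that is not part of the patch:

    package org.apache.hadoop.hive.ql.plan;

    import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable;

    // Hypothetical descriptor: shows the pattern the real descs follow.
    public class ExampleDesc extends AbstractSerializableCloneable {
      private static final long serialVersionUID = 1L;

      private int limit;

      public int getLimit() {
        return limit;
      }

      public void setLimit(int limit) {
        this.limit = limit;
      }

      @Override
      public Object clone() {
        ExampleDesc copy = new ExampleDesc();
        copy.setLimit(limit);
        return copy;
      }
    }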
ExprWalkerInfo childPreds = owi - .getPrunedPreds((Operator) nd.getChildren() + .getPrunedPreds((Operator) nd.getChildren() .get(0)); - owi.putPrunedPreds((Operator) nd, childPreds); + owi.putPrunedPreds((Operator) nd, childPreds); return null; } @@ -173,7 +173,8 @@ LOG.info("Processing for " + nd.getName() + "(" + ((Operator) nd).getIdentifier() + ")"); OpWalkerInfo owi = (OpWalkerInfo) procCtx; - Operator op = (Operator) nd; + Operator op = + (Operator) nd; ExprNodeDesc predicate = (((FilterOperator) nd).getConf()).getPredicate(); ExprWalkerInfo ewi = new ExprWalkerInfo(); // Don't push a sampling predicate since createFilter() always creates filter @@ -186,7 +187,7 @@ /* predicate is not deterministic */ if (op.getChildren() != null && op.getChildren().size() == 1) { createFilter(op, owi - .getPrunedPreds((Operator) (op + .getPrunedPreds((Operator) (op .getChildren().get(0))), owi); } return null; @@ -199,7 +200,7 @@ } } logExpr(nd, ewi); - owi.putPrunedPreds((Operator) nd, ewi); + owi.putPrunedPreds((Operator) nd, ewi); } // merge it with children predicates boolean hasUnpushedPredicates = mergeWithChildrenPred(nd, owi, ewi, null, false); @@ -233,7 +234,7 @@ boolean hasUnpushedPredicates = mergeWithChildrenPred(nd, owi, null, null, false); ExprWalkerInfo prunePreds = - owi.getPrunedPreds((Operator) nd); + owi.getPrunedPreds((Operator) nd); if (prunePreds != null) { Set toRemove = new HashSet(); // we don't push down any expressions that refer to aliases that can;t @@ -294,7 +295,7 @@ private void applyFilterTransitivity(JoinOperator nd, OpWalkerInfo owi) throws SemanticException { ExprWalkerInfo prunePreds = - owi.getPrunedPreds((Operator) nd); + owi.getPrunedPreds((Operator) nd); if (prunePreds != null) { // We want to use the row resolvers of the parents of the join op // because the rowresolver refers to the output columns of an operator @@ -302,7 +303,7 @@ // operator. 
Map aliasToRR = new HashMap(); - for (Operator o : (nd).getParentOperators()) { + for (Operator o : (nd).getParentOperators()) { for (String alias : owi.getRowResolver(o).getTableNames()){ aliasToRR.put(alias, owi.getRowResolver(o)); } @@ -386,7 +387,7 @@ for (Entry> aliasToFilters : newFilters.entrySet()){ - owi.getPrunedPreds((Operator) nd) + owi.getPrunedPreds((Operator) nd) .addPushDowns(aliasToFilters.getKey(), aliasToFilters.getValue()); } } @@ -513,8 +514,9 @@ if (HiveConf.getBoolVar(owi.getParseContext().getConf(), HiveConf.ConfVars.HIVEPPDREMOVEDUPLICATEFILTERS)) { if (hasUnpushedPredicates) { - Operator op = (Operator) nd; - Operator childOperator = op.getChildOperators().get(0); + Operator op = + (Operator) nd; + Operator childOperator = op.getChildOperators().get(0); if(childOperator.getParentOperators().size()==1) { owi.getCandidateFilterOps().clear(); } @@ -587,9 +589,10 @@ // no-op for leafs return hasUnpushedPredicates; } - Operator op = (Operator) nd; + Operator op = + (Operator) nd; ExprWalkerInfo childPreds = owi - .getPrunedPreds((Operator) nd.getChildren() + .getPrunedPreds((Operator) nd.getChildren() .get(0)); if (childPreds == null) { return hasUnpushedPredicates; @@ -614,7 +617,7 @@ hasUnpushedPredicates = true; } } - owi.putPrunedPreds((Operator) nd, ewi); + owi.putPrunedPreds((Operator) nd, ewi); return hasUnpushedPredicates; } @@ -624,9 +627,9 @@ if (nd.getChildren() == null) { return null; } - Operator op = (Operator) nd; + Operator op = (Operator)nd; ExprWalkerInfo ewi = new ExprWalkerInfo(); - for (Operator child : op.getChildOperators()) { + for (Operator child : op.getChildOperators()) { ExprWalkerInfo childPreds = owi.getPrunedPreds(child); if (childPreds == null) { continue; @@ -698,15 +701,15 @@ } // add new filter op - List> originalChilren = op + List> originalChilren = op .getChildOperators(); op.setChildOperators(null); Operator output = OperatorFactory.getAndMakeChild( new FilterDesc(condn, false), new RowSchema(inputRR.getColumnInfos()), op); output.setChildOperators(originalChilren); - for (Operator ch : originalChilren) { - List> parentOperators = ch + for (Operator ch : originalChilren) { + List> parentOperators = ch .getParentOperators(); int pos = parentOperators.indexOf(op); assert pos != -1; @@ -720,13 +723,13 @@ HiveConf.ConfVars.HIVEPPDREMOVEDUPLICATEFILTERS)) { // remove the candidate filter ops for (FilterOperator fop : owi.getCandidateFilterOps()) { - List> children = fop.getChildOperators(); - List> parents = fop.getParentOperators(); - for (Operator parent : parents) { + List> children = fop.getChildOperators(); + List> parents = fop.getParentOperators(); + for (Operator parent : parents) { parent.getChildOperators().addAll(children); parent.removeChild(fop); } - for (Operator child : children) { + for (Operator child : children) { child.getParentOperators().addAll(parents); child.removeParent(fop); } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ScriptDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ScriptDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ScriptDesc.java (working copy) @@ -18,17 +18,16 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; - import org.apache.hadoop.hive.ql.exec.RecordReader; import org.apache.hadoop.hive.ql.exec.RecordWriter; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * ScriptDesc. 
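NOTE: the createFilter change above (and the HIVEPPDREMOVEDUPLICATEFILTERS cleanup after it) is DAG surgery: a new FilterOperator is spliced in between an operator and its original children, and candidate duplicate filters are later spliced back out by cross-linking their parents and children. A standalone sketch of the splice-in step, assuming the operator is already registered as a parent of each of its children:

    import java.util.ArrayList;
    import java.util.List;

    class SpliceSketch {
      static class Node {
        List<Node> children = new ArrayList<Node>();
        List<Node> parents = new ArrayList<Node>();
      }

      // Insert filter between op and all of its current children, mirroring the
      // rewiring done around OperatorFactory.getAndMakeChild in createFilter.
      static void spliceBelow(Node op, Node filter) {
        List<Node> originalChildren = op.children;
        op.children = new ArrayList<Node>();
        op.children.add(filter);
        filter.parents.add(op);
        filter.children = originalChildren;
        for (Node child : originalChildren) {
          int pos = child.parents.indexOf(op);
          child.parents.set(pos, filter);  // each old child now points at the filter
        }
      }
    }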
* */ @Explain(displayName = "Transform Operator") -public class ScriptDesc implements Serializable { +public class ScriptDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private String scriptCmd; // Describe how to deserialize data back from user script Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ForwardDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ForwardDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ForwardDesc.java (working copy) @@ -18,16 +18,21 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * ForwardDesc. * */ @Explain(displayName = "Forward") -public class ForwardDesc implements Serializable { +public class ForwardDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; public ForwardDesc() { } + + @Override + public ForwardDesc clone() { + return new ForwardDesc(); + } } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/FilterDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/FilterDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/FilterDesc.java (working copy) @@ -18,20 +18,21 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; import java.util.List; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; + /** * FilterDesc. * */ @Explain(displayName = "Filter Operator") -public class FilterDesc implements Serializable { +public class FilterDesc extends AbstractSerializableCloneable { /** * sampleDesc is used to keep track of the sampling descriptor. */ - public static class sampleDesc { + public static class sampleDesc implements Cloneable { // The numerator of the TABLESAMPLE clause private int numerator; @@ -62,6 +63,12 @@ public boolean getInputPruning() { return inputPruning; } + + @Override + public Object clone() { + sampleDesc desc = new sampleDesc(numerator, denominator, null, inputPruning); + return desc; + } } private static final long serialVersionUID = 1L; @@ -126,4 +133,13 @@ this.isSortedFilter = isSortedFilter; } + @Override + public Object clone() { + FilterDesc filterDesc = new FilterDesc(getPredicate().clone(), getIsSamplingPred()); + if (getIsSamplingPred()) { + filterDesc.setSampleDescr(getSampleDescr()); + } + filterDesc.setSortedFilter(isSortedFilter()); + return filterDesc; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ReduceSinkDesc.java (working copy) @@ -18,15 +18,17 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; + /** * ReduceSinkDesc. * */ @Explain(displayName = "Reduce Output Operator") -public class ReduceSinkDesc implements Serializable { +public class ReduceSinkDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; /** * Key columns are passed to reducer in the "key". 
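NOTE: FilterDesc.clone() above deep-copies the predicate via ExprNodeDesc.clone(), but when the filter is a sampling predicate it passes the existing sampleDesc through by reference (setSampleDescr(getSampleDescr())), even though sampleDesc now has a clone() of its own; whether that sharing is intentional is not clear from the hunk alone. Caller-side shape, as a fragment (the ExprNodeDesc named predicate is an assumed variable; the two-argument constructor and clone() are the ones visible above):

    FilterDesc fd = new FilterDesc(predicate, false);
    fd.setSortedFilter(true);
    // predicate is deep-copied; the sample descriptor, if present, is shared.
    FilterDesc copy = (FilterDesc) fd.clone();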
@@ -91,6 +93,29 @@ this.distinctColumnIndices = distinctColumnIndices; } + @Override + public Object clone() { + ReduceSinkDesc desc = new ReduceSinkDesc(); + desc.setKeyCols((ArrayList) getKeyCols().clone()); + desc.setValueCols((ArrayList) getValueCols().clone()); + desc.setOutputKeyColumnNames((ArrayList) getOutputKeyColumnNames().clone()); + List> distinctColumnIndicesClone = new ArrayList>(); + for (List distinctColumnIndex : getDistinctColumnIndices()) { + List tmp = new ArrayList(); + tmp.addAll(distinctColumnIndex); + distinctColumnIndicesClone.add(tmp); + } + desc.setDistinctColumnIndices(distinctColumnIndicesClone); + desc.setOutputValueColumnNames((ArrayList) getOutputValueColumnNames().clone()); + desc.setNumDistributionKeys(getNumDistributionKeys()); + desc.setTag(getTag()); + desc.setNumReducers(getNumReducers()); + desc.setPartitionCols((ArrayList) getPartitionCols().clone()); + desc.setKeySerializeInfo((TableDesc) getKeySerializeInfo().clone()); + desc.setValueSerializeInfo((TableDesc) getValueSerializeInfo().clone()); + return desc; + } + public java.util.ArrayList getOutputKeyColumnNames() { return outputKeyColumnNames; } @@ -186,7 +211,7 @@ /** * Returns the sort order of the key columns. - * + * * @return null, which means ascending order for all key columns, or a String * of the same length as key columns, that consists of only "+" * (ascending order) and "-" (descending order). @@ -196,7 +221,7 @@ return keySerializeInfo.getProperties().getProperty( org.apache.hadoop.hive.serde.Constants.SERIALIZATION_SORT_ORDER); } - + public void setOrder(String orderStr) { keySerializeInfo.getProperties().setProperty( org.apache.hadoop.hive.serde.Constants.SERIALIZATION_SORT_ORDER, Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ListSinkDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ListSinkDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ListSinkDesc.java (working copy) @@ -18,13 +18,13 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * description for ListSinkOperator, just for explain result. */ @Explain(displayName = "ListSink") -public class ListSinkDesc implements Serializable { +public class ListSinkDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/LateralViewForwardDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/LateralViewForwardDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/LateralViewForwardDesc.java (working copy) @@ -18,14 +18,14 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * LateralViewForwardDesc. 
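NOTE: ReduceSinkDesc.clone() above mixes copy depths: the key/value/partition column lists and output name lists are copied with ArrayList.clone() (new list, same element objects), distinctColumnIndices is rebuilt list by list, and the two TableDescs are cloned outright. Presumably the one-level list copies are acceptable because the shared ExprNodeDesc elements are not mutated in place. A standalone reminder of what ArrayList.clone() does and does not copy:

    import java.util.ArrayList;

    public class ShallowCopyDemo {
      public static void main(String[] args) {
        ArrayList<StringBuilder> cols = new ArrayList<StringBuilder>();
        cols.add(new StringBuilder("key0"));

        @SuppressWarnings("unchecked")
        ArrayList<StringBuilder> copy = (ArrayList<StringBuilder>) cols.clone();

        copy.add(new StringBuilder("key1"));  // list structure is independent...
        copy.get(0).append("-mutated");       // ...but elements are shared

        System.out.println(cols);  // [key0-mutated]
        System.out.println(copy);  // [key0-mutated, key1]
      }
    }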
* */ @Explain(displayName = "Lateral View Forward") -public class LateralViewForwardDesc implements Serializable { +public class LateralViewForwardDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; public LateralViewForwardDesc() { Index: ql/src/java/org/apache/hadoop/hive/ql/plan/LimitDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/LimitDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/LimitDesc.java (working copy) @@ -18,14 +18,14 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * LimitDesc. * */ @Explain(displayName = "Limit") -public class LimitDesc implements Serializable { +public class LimitDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private int limit; private int leastRows = -1; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapredLocalWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredLocalWork.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredLocalWork.java (working copy) @@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * MapredLocalWork. @@ -33,22 +34,22 @@ public class MapredLocalWork implements Serializable { private static final long serialVersionUID = 1L; - private LinkedHashMap> aliasToWork; + private LinkedHashMap> aliasToWork; private LinkedHashMap aliasToFetchWork; private boolean inputFileChangeSensitive; private BucketMapJoinContext bucketMapjoinContext; private String tmpFileURI; private String stageID; - private List> dummyParentOp ; + private List> dummyParentOp ; public MapredLocalWork() { } public MapredLocalWork( - final LinkedHashMap> aliasToWork, - final LinkedHashMap aliasToFetchWork) { + final LinkedHashMap> aliasToWork, + final LinkedHashMap aliasToFetchWork) { this.aliasToWork = aliasToWork; this.aliasToFetchWork = aliasToFetchWork; @@ -61,18 +62,18 @@ } - public void setDummyParentOp(List> op){ + public void setDummyParentOp(List> op){ this.dummyParentOp=op; } - public List> getDummyParentOp(){ + public List> getDummyParentOp(){ return this.dummyParentOp; } @Explain(displayName = "Alias -> Map Local Operator Tree") - public LinkedHashMap> getAliasToWork() { + public LinkedHashMap> getAliasToWork() { return aliasToWork; } @@ -85,7 +86,7 @@ } public void setAliasToWork( - final LinkedHashMap> aliasToWork) { + final LinkedHashMap> aliasToWork) { this.aliasToWork = aliasToWork; } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java (working copy) @@ -18,17 +18,17 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; import java.util.ArrayList; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * FileSinkDesc. 
* */ @Explain(displayName = "File Output Operator") -public class FileSinkDesc implements Serializable { +public class FileSinkDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private String dirName; // normally statsKeyPref will be the same as dirName, but the latter Index: ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -46,6 +45,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveUtils; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde.Constants; import org.apache.hadoop.hive.serde2.DelimitedJSONSerDe; import org.apache.hadoop.hive.serde2.Deserializer; @@ -88,7 +88,7 @@ try { return new MapredWork("", new LinkedHashMap>(), new LinkedHashMap(), - new LinkedHashMap>(), + new LinkedHashMap>(), new TableDesc(), new ArrayList(), null, Integer.valueOf(1), null, Hive.get().getConf().getBoolVar( HiveConf.ConfVars.HIVE_COMBINE_INPUT_FORMAT_SUPPORTS_SPLITTABLE)); Index: ql/src/java/org/apache/hadoop/hive/ql/plan/SelectDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/SelectDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/SelectDesc.java (working copy) @@ -18,17 +18,19 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import java.util.ArrayList; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; + /** * SelectDesc. 
* */ @Explain(displayName = "Select Operator") -public class SelectDesc implements Serializable { +public class SelectDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; - private java.util.ArrayList colList; - private java.util.ArrayList outputColumnNames; + private ArrayList colList; + private ArrayList outputColumnNames; private boolean selectStar; private boolean selStarNoCompute; @@ -40,45 +42,55 @@ } public SelectDesc( - final java.util.ArrayList colList, - final java.util.ArrayList outputColumnNames) { + final ArrayList colList, + final ArrayList outputColumnNames) { this(colList, outputColumnNames, false); } public SelectDesc( - final java.util.ArrayList colList, - java.util.ArrayList outputColumnNames, - final boolean selectStar) { + final ArrayList colList, + ArrayList outputColumnNames, + final boolean selectStar) { this.colList = colList; this.selectStar = selectStar; this.outputColumnNames = outputColumnNames; } public SelectDesc( - final java.util.ArrayList colList, - final boolean selectStar, final boolean selStarNoCompute) { + final ArrayList colList, + final boolean selectStar, final boolean selStarNoCompute) { this.colList = colList; this.selectStar = selectStar; this.selStarNoCompute = selStarNoCompute; } + @Override + public Object clone() { + SelectDesc ret = new SelectDesc(); + ret.setColList((ArrayList)getColList().clone()); + ret.setOutputColumnNames((ArrayList)getOutputColumnNames().clone()); + ret.setSelectStar(selectStar); + ret.setSelStarNoCompute(selStarNoCompute); + return ret; + } + @Explain(displayName = "expressions") - public java.util.ArrayList getColList() { + public ArrayList getColList() { return colList; } public void setColList( - final java.util.ArrayList colList) { + final ArrayList colList) { this.colList = colList; } @Explain(displayName = "outputColumnNames") - public java.util.ArrayList getOutputColumnNames() { + public ArrayList getOutputColumnNames() { return outputColumnNames; } public void setOutputColumnNames( - java.util.ArrayList outputColumnNames) { + ArrayList outputColumnNames) { this.outputColumnNames = outputColumnNames; } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (working copy) @@ -22,13 +22,14 @@ import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * GroupByDesc. 
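NOTE: SelectDesc.clone() above follows the same recipe as ReduceSinkDesc: ArrayList.clone() for the column and output-name lists, flags copied by value. Worth flagging for review: GroupByDesc below also moves onto AbstractSerializableCloneable but does not appear to override clone() in the hunks shown here, so it still inherits the throwing default; presumably that is fine because only table scans, filters and selects report supportSkewJoinOptimization() as true. Caller-side shape, as a fragment (sel is an assumed, populated SelectDesc):

    // clone() is declared to return Object, so callers cast back.
    SelectDesc copy = (SelectDesc) sel.clone();
    copy.setSelectStar(false);   // flags on the copy can diverge safely
    // copy.getColList() is a new ArrayList, but it still holds the same
    // ExprNodeDesc objects as sel.getColList() (one-level copy).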
* */ @Explain(displayName = "Group By Operator") -public class GroupByDesc implements java.io.Serializable { +public class GroupByDesc extends AbstractSerializableCloneable { /** * Group-by Mode: COMPLETE: complete 1-phase aggregation: iterate, terminate * PARTIAL1: partial aggregation - first phase: iterate, terminatePartial @@ -54,9 +55,9 @@ private boolean groupKeyNotReductionKey; private boolean bucketGroup; - private java.util.ArrayList keys; - private java.util.ArrayList aggregators; - private java.util.ArrayList outputColumnNames; + private ArrayList keys; + private ArrayList aggregators; + private ArrayList outputColumnNames; private float groupByMemoryUsage; private float memoryThreshold; @@ -65,9 +66,9 @@ public GroupByDesc( final Mode mode, - final java.util.ArrayList outputColumnNames, - final java.util.ArrayList keys, - final java.util.ArrayList aggregators, + final ArrayList outputColumnNames, + final ArrayList keys, + final ArrayList aggregators, final boolean groupKeyNotReductionKey,float groupByMemoryUsage, float memoryThreshold) { this(mode, outputColumnNames, keys, aggregators, groupKeyNotReductionKey, false, groupByMemoryUsage, memoryThreshold); @@ -75,9 +76,9 @@ public GroupByDesc( final Mode mode, - final java.util.ArrayList outputColumnNames, - final java.util.ArrayList keys, - final java.util.ArrayList aggregators, + final ArrayList outputColumnNames, + final ArrayList keys, + final ArrayList aggregators, final boolean groupKeyNotReductionKey, final boolean bucketGroup,float groupByMemoryUsage, float memoryThreshold) { this.mode = mode; this.outputColumnNames = outputColumnNames; @@ -120,21 +121,21 @@ } @Explain(displayName = "keys") - public java.util.ArrayList getKeys() { + public ArrayList getKeys() { return keys; } - public void setKeys(final java.util.ArrayList keys) { + public void setKeys(final ArrayList keys) { this.keys = keys; } @Explain(displayName = "outputColumnNames") - public java.util.ArrayList getOutputColumnNames() { + public ArrayList getOutputColumnNames() { return outputColumnNames; } public void setOutputColumnNames( - java.util.ArrayList outputColumnNames) { + ArrayList outputColumnNames) { this.outputColumnNames = outputColumnNames; } @@ -155,12 +156,12 @@ } @Explain(displayName = "aggregations") - public java.util.ArrayList getAggregators() { + public ArrayList getAggregators() { return aggregators; } public void setAggregators( - final java.util.ArrayList aggregators) { + final ArrayList aggregators) { this.aggregators = aggregators; } @@ -180,7 +181,7 @@ public void setBucketGroup(boolean dataSorted) { bucketGroup = dataSorted; } - + /** * Checks if this grouping is like distinct, which means that all non-distinct grouping * columns behave like they were distinct - for example min and max operators. Index: ql/src/java/org/apache/hadoop/hive/ql/plan/UDTFDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/UDTFDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/UDTFDesc.java (working copy) @@ -18,18 +18,17 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; - import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * All member variables should have a setters and getters of the form get and set or else they won't be recreated properly at run * time. 
- * + * */ @Explain(displayName = "UDTF Operator") -public class UDTFDesc implements Serializable { +public class UDTFDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private GenericUDTF genericUDTF; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/UnionDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/UnionDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/UnionDesc.java (working copy) @@ -18,14 +18,14 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * unionDesc is a empty class currently. However, union has more than one input * (as compared with forward), and therefore, we need a separate class. **/ @Explain(displayName = "Union") -public class UnionDesc implements Serializable { +public class UnionDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private transient int numInputs; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java (working copy) @@ -18,19 +18,21 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; + /** * Join operator Descriptor implementation. * */ @Explain(displayName = "Join Operator") -public class JoinDesc implements Serializable { +public class JoinDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; public static final int INNER_JOIN = 0; public static final int LEFT_OUTER_JOIN = 1; @@ -87,6 +89,53 @@ } } + @Override + public Object clone() { + JoinDesc ret = new JoinDesc(); + Map> cloneExprs = new HashMap>(); + cloneExprs.putAll(getExprs()); + ret.setExprs(cloneExprs); + Map> cloneFilters = new HashMap>(); + cloneFilters.putAll(getFilters()); + ret.setFilters(cloneFilters); + ret.setConds(getConds().clone()); + ret.setNoOuterJoin(getNoOuterJoin()); + ret.setNullSafes(getNullSafes()); + ret.setHandleSkewJoin(handleSkewJoin); + ret.setSkewKeyDefinition(getSkewKeyDefinition()); + ret.setTagOrder(getTagOrder().clone()); + if (getKeyTableDesc() != null) { + ret.setKeyTableDesc((TableDesc) getKeyTableDesc().clone()); + } + + if (getBigKeysDirMap() != null) { + Map cloneBigKeysDirMap = new HashMap(); + cloneBigKeysDirMap.putAll(getBigKeysDirMap()); + ret.setBigKeysDirMap(cloneBigKeysDirMap); + } + if (getSmallKeysDirMap() != null) { + Map> cloneSmallKeysDirMap = new HashMap> (); + cloneSmallKeysDirMap.putAll(getSmallKeysDirMap()); + ret.setSmallKeysDirMap(cloneSmallKeysDirMap); + } + if (getSkewKeysValuesTables() != null) { + Map cloneSkewKeysValuesTables = new HashMap(); + cloneSkewKeysValuesTables.putAll(getSkewKeysValuesTables()); + ret.setSkewKeysValuesTables(cloneSkewKeysValuesTables); + } + if (getOutputColumnNames() != null) { + List cloneOutputColumnNames = new ArrayList(); + cloneOutputColumnNames.addAll(getOutputColumnNames()); + ret.setOutputColumnNames(cloneOutputColumnNames); + } + if (getReversedExprs() != null) { + Map cloneReversedExprs = new HashMap(); + 
cloneReversedExprs.putAll(getReversedExprs()); + ret.setReversedExprs(cloneReversedExprs); + } + return ret; + } + public JoinDesc(final Map> exprs, List outputColumnNames, final boolean noOuterJoin, final JoinCondDesc[] conds) { Index: ql/src/java/org/apache/hadoop/hive/ql/plan/HashTableDummyDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/HashTableDummyDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/HashTableDummyDesc.java (working copy) @@ -18,13 +18,13 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * HashTable Dummy Descriptor implementation. * */ @Explain(displayName = "HashTable Dummy Operator") -public class HashTableDummyDesc implements Serializable { +public class HashTableDummyDesc extends AbstractSerializableCloneable { private TableDesc tbl; public TableDesc getTbl() { Index: ql/src/java/org/apache/hadoop/hive/ql/plan/CollectDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/CollectDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/CollectDesc.java (working copy) @@ -18,14 +18,14 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * CollectDesc. * */ @Explain(displayName = "Collect") -public class CollectDesc implements Serializable { +public class CollectDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; Integer bufferSize; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/LateralViewJoinDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/LateralViewJoinDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/LateralViewJoinDesc.java (working copy) @@ -18,15 +18,16 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; import java.util.ArrayList; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; + /** * LateralViewJoinDesc. * */ @Explain(displayName = "Lateral View Join Operator") -public class LateralViewJoinDesc implements Serializable { +public class LateralViewJoinDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private ArrayList outputInternalColNames; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExtractDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ExtractDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExtractDesc.java (working copy) @@ -18,14 +18,14 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * ExtractDesc. 
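JoinDesc.clone() above copies every container (exprs, filters, the big/small-key directory maps, output column names, reversed exprs) into fresh HashMaps and ArrayLists but keeps sharing the values inside them; only keyTableDesc gets a recursive clone. That one level of copying is apparently enough for the skew-join rewrite, which rebuilds the expressions it needs afterwards. For contrast, a deeper copy of the per-tag key expressions could look like this hypothetical helper, which assumes an ExprNodeDesc.clone() is available:

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;

    // Hypothetical helper, not part of the patch: element-by-element copy of the
    // tag -> join-key-expression map, for contrast with the putAll() based copy
    // that JoinDesc.clone() performs.
    public final class JoinExprCopier {
      private JoinExprCopier() {
      }

      public static Map<Byte, List<ExprNodeDesc>> deepCopy(
          Map<Byte, List<ExprNodeDesc>> exprs) {
        Map<Byte, List<ExprNodeDesc>> result = new HashMap<Byte, List<ExprNodeDesc>>();
        for (Map.Entry<Byte, List<ExprNodeDesc>> e : exprs.entrySet()) {
          List<ExprNodeDesc> copies = new ArrayList<ExprNodeDesc>(e.getValue().size());
          for (ExprNodeDesc expr : e.getValue()) {
            copies.add((ExprNodeDesc) expr.clone()); // assumes ExprNodeDesc.clone()
          }
          result.put(e.getKey(), copies);
        }
        return result;
      }
    }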
* */ @Explain(displayName = "Extract") -public class ExtractDesc implements Serializable { +public class ExtractDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private ExprNodeDesc col; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (working copy) @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.plan; import java.io.ByteArrayOutputStream; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; @@ -33,13 +32,15 @@ import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SplitSample; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * MapredWork. * */ @Explain(displayName = "Map Reduce") -public class MapredWork implements Serializable { +public class MapredWork extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private String command; // map side work @@ -49,7 +50,7 @@ private LinkedHashMap pathToPartitionInfo; - private LinkedHashMap> aliasToWork; + private LinkedHashMap> aliasToWork; private LinkedHashMap aliasToPartnInfo; @@ -81,7 +82,7 @@ private String tmpHDFSFileURI; - private LinkedHashMap, OpParseContext> opParseCtxMap; + private LinkedHashMap, OpParseContext> opParseCtxMap; private QBJoinTree joinTree; @@ -100,7 +101,7 @@ final String command, final LinkedHashMap> pathToAliases, final LinkedHashMap pathToPartitionInfo, - final LinkedHashMap> aliasToWork, + final LinkedHashMap> aliasToWork, final TableDesc keyDesc, List tagToValueDesc, final Operator reducer, final Integer numReduceTasks, final MapredLocalWork mapLocalWork, @@ -167,12 +168,12 @@ } @Explain(displayName = "Alias -> Map Operator Tree") - public LinkedHashMap> getAliasToWork() { + public LinkedHashMap> getAliasToWork() { return aliasToWork; } public void setAliasToWork( - final LinkedHashMap> aliasToWork) { + final LinkedHashMap> aliasToWork) { this.aliasToWork = aliasToWork; } @@ -433,12 +434,13 @@ this.joinTree = joinTree; } - public LinkedHashMap, OpParseContext> getOpParseCtxMap() { + public + LinkedHashMap, OpParseContext> getOpParseCtxMap() { return opParseCtxMap; } public void setOpParseCtxMap( - LinkedHashMap, OpParseContext> opParseCtxMap) { + LinkedHashMap, OpParseContext> opParseCtxMap) { this.opParseCtxMap = opParseCtxMap; } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java (working copy) @@ -18,10 +18,11 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.Serializable; +import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.util.AbstractSerializableCloneable; /** * Table Scan Descriptor Currently, data is only read from a base source as part @@ -29,7 +30,7 @@ * things will be added here as table scan is invoked as part of local work. 
**/ @Explain(displayName = "TableScan") -public class TableScanDesc implements Serializable { +public class TableScanDesc extends AbstractSerializableCloneable { private static final long serialVersionUID = 1L; private String alias; @@ -71,6 +72,12 @@ this.virtualCols = vcs; } + @Override + public Object clone() { + List vcs = new ArrayList(getVirtualCols()); + return new TableScanDesc(getAlias(), vcs); + } + @Explain(displayName = "alias") public String getAlias() { return alias; Index: ql/src/java/org/apache/hadoop/hive/ql/QueryPlan.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/QueryPlan.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/QueryPlan.java (working copy) @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.ql.plan.api.AdjacencyType; import org.apache.hadoop.hive.ql.plan.api.NodeType; import org.apache.hadoop.hive.ql.plan.api.TaskType; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.protocol.TJSONProtocol; @@ -152,18 +153,18 @@ */ private void populateOperatorGraph( org.apache.hadoop.hive.ql.plan.api.Task task, - Collection> topOps) { + Collection> topOps) { task.setOperatorGraph(new org.apache.hadoop.hive.ql.plan.api.Graph()); task.getOperatorGraph().setNodeType(NodeType.OPERATOR); - Queue> opsToVisit = - new LinkedList>(); - Set> opsVisited = - new HashSet>(); + Queue> opsToVisit = + new LinkedList>(); + Set> opsVisited = + new HashSet>(); opsToVisit.addAll(topOps); while (opsToVisit.peek() != null) { - Operator op = opsToVisit.remove(); + Operator op = opsToVisit.remove(); opsVisited.add(op); // populate the operator org.apache.hadoop.hive.ql.plan.api.Operator operator = @@ -177,7 +178,7 @@ new org.apache.hadoop.hive.ql.plan.api.Adjacency(); entry.setAdjacencyType(AdjacencyType.CONJUNCTIVE); entry.setNode(op.getOperatorId()); - for (Operator childOp : op.getChildOperators()) { + for (Operator childOp : op.getChildOperators()) { entry.addToChildren(childOp.getOperatorId()); if (!opsVisited.contains(childOp)) { opsToVisit.add(childOp); @@ -230,8 +231,8 @@ reduceTask.setTaskId(stage.getStageId() + "_REDUCE"); reduceTask.setTaskType(TaskType.REDUCE); stage.addToTaskList(reduceTask); - Collection> reducerTopOps = - new ArrayList>(); + Collection> reducerTopOps = + new ArrayList>(); reducerTopOps.add(mrTask.getWork().getReducer()); populateOperatorGraph(reduceTask, reducerTopOps); } @@ -309,14 +310,16 @@ } else { task.setStarted(started.contains(task.getTaskId())); task.setDone(done.contains(task.getTaskId())); - for (org.apache.hadoop.hive.ql.plan.api.Operator op : task - .getOperatorList()) { - // if the task has started, all operators within the task have - // started - op.setStarted(started.contains(task.getTaskId())); - op.setOperatorCounters(counters.get(op.getOperatorId())); - // if the task is done, all operators are done as well - op.setDone(done.contains(task.getTaskId())); + if (task.getOperatorList() != null) { + for (org.apache.hadoop.hive.ql.plan.api.Operator op : + task.getOperatorList()) { + // if the task has started, all operators within the task have + // started + op.setStarted(started.contains(task.getTaskId())); + op.setOperatorCounters(counters.get(op.getOperatorId())); + // if the task is done, all operators are done as well + op.setDone(done.contains(task.getTaskId())); + } } } } @@ -370,8 +373,8 @@ done.add(task.getId() + "_MAP"); } if 
(mrTask.hasReduce()) { - Collection> reducerTopOps = - new ArrayList>(); + Collection> reducerTopOps = + new ArrayList>(); reducerTopOps.add(mrTask.getWork().getReducer()); extractOperatorCounters(reducerTopOps, task.getId() + "_REDUCE"); if (mrTask.reduceStarted()) { @@ -393,21 +396,21 @@ } private void extractOperatorCounters( - Collection> topOps, String taskId) { - Queue> opsToVisit = - new LinkedList>(); - Set> opsVisited = - new HashSet>(); + Collection> topOps, String taskId) { + Queue> opsToVisit = + new LinkedList>(); + Set> opsVisited = + new HashSet>(); opsToVisit.addAll(topOps); while (opsToVisit.size() != 0) { - Operator op = opsToVisit.remove(); + Operator op = opsToVisit.remove(); opsVisited.add(op); counters.put(op.getOperatorId(), op.getCounters()); if (op.getDone()) { done.add(op.getOperatorId()); } if (op.getChildOperators() != null) { - for (Operator childOp : op.getChildOperators()) { + for (Operator childOp : op.getChildOperators()) { if (!opsVisited.contains(childOp)) { opsToVisit.add(childOp); } Index: ql/src/java/org/apache/hadoop/hive/ql/lib/DefaultGraphWalker.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/lib/DefaultGraphWalker.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/lib/DefaultGraphWalker.java (working copy) @@ -21,7 +21,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.Stack; @@ -43,7 +42,7 @@ /** * Constructor. - * + * * @param disp * dispatcher to call for each op encountered */ @@ -68,7 +67,7 @@ /** * Dispatch the current operator. - * + * * @param nd * node being walked * @param ndStack @@ -91,7 +90,7 @@ /** * starting point for walking. - * + * * @throws SemanticException */ public void startWalking(Collection startNodes, @@ -108,7 +107,7 @@ /** * walk the current operator and its descendants. 
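populateOperatorGraph() and extractOperatorCounters() in the QueryPlan hunks above share one traversal: a queue seeded with the top operators plus a visited set, so an operator reachable through several parents is processed exactly once. A stand-alone sketch of that pattern, with Node standing in for Hive's Operator:

    import java.util.Collection;
    import java.util.HashSet;
    import java.util.LinkedList;
    import java.util.Queue;
    import java.util.Set;

    interface Node {
      String getId();
      Collection<? extends Node> getChildren();
    }

    final class DagWalker {
      private DagWalker() {
      }

      static void walk(Collection<? extends Node> roots) {
        Queue<Node> toVisit = new LinkedList<Node>(roots);
        Set<Node> visited = new HashSet<Node>();
        while (toVisit.peek() != null) {
          Node n = toVisit.remove();
          if (!visited.add(n)) {
            continue; // already reached through another parent
          }
          System.out.println("visiting " + n.getId());
          if (n.getChildren() != null) {
            for (Node child : n.getChildren()) {
              if (!visited.contains(child)) {
                toVisit.add(child);
              }
            }
          }
        }
      }
    }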
- * + * * @param nd * current operator in the graph * @throws SemanticException @@ -122,7 +121,7 @@ || getDispatchedList().containsAll(nd.getChildren())) { // all children are done or no need to walk the children if (!getDispatchedList().contains(nd)) { - dispatch(nd, opStack); + dispatch(nd, opStack); } opStack.pop(); return; Index: ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java (working copy) @@ -219,7 +219,7 @@ null != qbm.getAliasToTable() && qbm.getAliasToTable().size() > 0) { Table tbl = getMetaData().getTableForAlias(alias); - skewedColNames = tbl.getSkewedColName(); + skewedColNames = tbl.getSkewedColNames(); } return skewedColNames; } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -157,6 +157,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde.Constants; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe; @@ -178,9 +179,9 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { private HashMap opToPartPruner; private HashMap opToPartList; - private HashMap> topOps; - private HashMap> topSelOps; - private LinkedHashMap, OpParseContext> opParseCtx; + private HashMap> topOps; + private HashMap> topSelOps; + private LinkedHashMap, OpParseContext> opParseCtx; private List loadTableWork; private List loadFileWork; private Map joinContext; @@ -225,11 +226,11 @@ opToPartList = new HashMap(); opToSamplePruner = new HashMap(); nameToSplitSample = new HashMap(); - topOps = new HashMap>(); - topSelOps = new HashMap>(); + topOps = new HashMap>(); + topSelOps = new HashMap>(); loadTableWork = new ArrayList(); loadFileWork = new ArrayList(); - opParseCtx = new LinkedHashMap, OpParseContext>(); + opParseCtx = new LinkedHashMap, OpParseContext>(); joinContext = new HashMap(); topToTable = new HashMap(); destTableId = 1; @@ -1467,7 +1468,7 @@ } @SuppressWarnings("nls") - public Operator putOpInsertMap(Operator op, + public Operator putOpInsertMap(Operator op, RowResolver rr) { OpParseContext ctx = new OpParseContext(rr); opParseCtx.put(op, ctx); @@ -6403,12 +6404,12 @@ if ((leftOp instanceof UnionOperator) || (rightOp instanceof UnionOperator)) { if (leftOp instanceof UnionOperator) { // make left a child of right - List> child = - new ArrayList>(); + List> child = + new ArrayList>(); child.add(leftOp); rightOp.setChildOperators(child); - List> parent = leftOp + List> parent = leftOp .getParentOperators(); parent.add(rightOp); @@ -6417,12 +6418,12 @@ return putOpInsertMap(leftOp, unionoutRR); } else { // make right a child of left - List> child = - new ArrayList>(); + List> child = + new ArrayList>(); child.add(rightOp); leftOp.setChildOperators(child); - List> parent = rightOp + List> parent = rightOp .getParentOperators(); parent.add(leftOp); UnionDesc uDesc = ((UnionOperator) rightOp).getConf(); @@ -6433,22 +6434,22 @@ } // Create a 
new union operator - Operator unionforward = OperatorFactory + Operator unionforward = OperatorFactory .getAndMakeChild(new UnionDesc(), new RowSchema(unionoutRR .getColumnInfos())); // set union operator as child of each of leftOp and rightOp - List> child = - new ArrayList>(); + List> child = + new ArrayList>(); child.add(unionforward); rightOp.setChildOperators(child); - child = new ArrayList>(); + child = new ArrayList>(); child.add(unionforward); leftOp.setChildOperators(child); - List> parent = - new ArrayList>(); + List> parent = + new ArrayList>(); parent.add(leftOp); parent.add(rightOp); unionforward.setParentOperators(parent); @@ -6554,8 +6555,8 @@ RowResolver rwsch; // is the table already present - Operator top = topOps.get(alias_id); - Operator dummySel = topSelOps.get(alias_id); + Operator top = topOps.get(alias_id); + Operator dummySel = topSelOps.get(alias_id); if (dummySel != null) { top = dummySel; } @@ -6607,7 +6608,7 @@ setupStats(tsDesc, qb.getParseInfo(), tab, alias, rwsch); top = putOpInsertMap(OperatorFactory.get(tsDesc, - new RowSchema(rwsch.getColumnInfos())), rwsch); + new RowSchema(rwsch.getColumnInfos())), rwsch); // Add this to the list of top operators - we always start from a table // scan @@ -6621,7 +6622,7 @@ } // check if this table is sampled and needs more than input pruning - Operator tableOp = top; + Operator tableOp = top; TableSample ts = qb.getParseInfo().getTabSample(alias); if (ts != null) { int num = ts.getNumerator(); @@ -7129,10 +7130,10 @@ ParseContext tempParseContext = getParseContext(); GenMRProcContext procCtx = new GenMRProcContext( conf, - new HashMap, Task>(), - new ArrayList>(), tempParseContext, + new HashMap, Task>(), + new ArrayList>(), tempParseContext, mvTask, rootTasks, - new LinkedHashMap, GenMapRedCtx>(), + new LinkedHashMap, GenMapRedCtx>(), inputs, outputs); // create a walker which walks the tree in a DFS manner while maintaining @@ -7275,15 +7276,15 @@ // loop over all the tasks recursviely private void generateCountersTask(Task task) { if (task instanceof ExecDriver) { - HashMap> opMap = ((MapredWork) task + HashMap> opMap = ((MapredWork) task .getWork()).getAliasToWork(); if (!opMap.isEmpty()) { - for (Operator op : opMap.values()) { + for (Operator op : opMap.values()) { generateCountersOperator(op); } } - Operator reducer = ((MapredWork) task.getWork()) + Operator reducer = ((MapredWork) task.getWork()) .getReducer(); if (reducer != null) { LOG.info("Generating counters for operator " + reducer); @@ -7309,14 +7310,14 @@ } } - private void generateCountersOperator(Operator op) { + private void generateCountersOperator(Operator op) { op.assignCounterNameToEnum(); if (op.getChildOperators() == null) { return; } - for (Operator child : op.getChildOperators()) { + for (Operator child : op.getChildOperators()) { generateCountersOperator(child); } } @@ -7325,10 +7326,10 @@ private void breakTaskTree(Task task) { if (task instanceof ExecDriver) { - HashMap> opMap = ((MapredWork) task + HashMap> opMap = ((MapredWork) task .getWork()).getAliasToWork(); if (!opMap.isEmpty()) { - for (Operator op : opMap.values()) { + for (Operator op : opMap.values()) { breakOperatorTree(op); } } @@ -7350,7 +7351,7 @@ } // loop over all the operators recursviely - private void breakOperatorTree(Operator topOp) { + private void breakOperatorTree(Operator topOp) { if (topOp instanceof ReduceSinkOperator) { topOp.setChildOperators(null); } @@ -7359,7 +7360,7 @@ return; } - for (Operator op : topOp.getChildOperators()) { + for (Operator op : 
topOp.getChildOperators()) { breakOperatorTree(op); } } @@ -7370,10 +7371,10 @@ if (task instanceof ExecDriver) { MapredWork work = (MapredWork) task.getWork(); work.deriveExplainAttributes(); - HashMap> opMap = work + HashMap> opMap = work .getAliasToWork(); if (!opMap.isEmpty()) { - for (Operator op : opMap.values()) { + for (Operator op : opMap.values()) { GenMapRedUtils.setKeyAndValueDesc(work, op); } } @@ -8334,7 +8335,7 @@ /** * This code is commented out pending further testing/development - * for (Task t: rootTasks) + * for (Task t: rootTasks) * t.localizeMRTmpFiles(ctx); */ } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (working copy) @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.ql.plan.LoadFileDesc; import org.apache.hadoop.hive.ql.plan.LoadTableDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; /** * Parse Context: The current parse context. This is passed to the optimizer @@ -65,9 +66,9 @@ private HashMap opToPartList; private HashMap opToSamplePruner; private Map opToSkewedPruner; - private HashMap> topOps; - private HashMap> topSelOps; - private LinkedHashMap, OpParseContext> opParseCtx; + private HashMap> topOps; + private HashMap> topSelOps; + private LinkedHashMap, OpParseContext> opParseCtx; private Map joinContext; private Map mapJoinContext; private HashMap topToTable; @@ -154,9 +155,9 @@ ASTNode ast, HashMap opToPartPruner, HashMap opToPartList, - HashMap> topOps, - HashMap> topSelOps, - LinkedHashMap, OpParseContext> opParseCtx, + HashMap> topOps, + HashMap> topSelOps, + LinkedHashMap, OpParseContext> opParseCtx, Map joinContext, HashMap topToTable, List loadTableWork, List loadFileWork, @@ -299,7 +300,7 @@ /** * @return the topOps */ - public HashMap> getTopOps() { + public HashMap> getTopOps() { return topOps; } @@ -307,14 +308,14 @@ * @param topOps * the topOps to set */ - public void setTopOps(HashMap> topOps) { + public void setTopOps(HashMap> topOps) { this.topOps = topOps; } /** * @return the topSelOps */ - public HashMap> getTopSelOps() { + public HashMap> getTopSelOps() { return topSelOps; } @@ -323,14 +324,14 @@ * the topSelOps to set */ public void setTopSelOps( - HashMap> topSelOps) { + HashMap> topSelOps) { this.topSelOps = topSelOps; } /** * @return the opParseCtx */ - public LinkedHashMap, OpParseContext> getOpParseCtx() { + public LinkedHashMap, OpParseContext> getOpParseCtx() { return opParseCtx; } @@ -339,7 +340,7 @@ * the opParseCtx to set */ public void setOpParseCtx( - LinkedHashMap, OpParseContext> opParseCtx) { + LinkedHashMap, OpParseContext> opParseCtx) { this.opParseCtx = opParseCtx; } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java (working copy) @@ -25,7 +25,12 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.ql.ErrorMsg; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.hive.ql.metadata.Table; + + /** * Library of utility functions used in the parse code. 
* @@ -96,4 +101,24 @@ return colNames; } + /** + * Retrieve the Table instance from ParseContext. + * @param ctx + * parse context + * @param tableAlias + * table name + * @return + */ + public static Table getTableFromParseContext(ParseContext ctx, String tableAlias) { + Table tbl = null; + + if (null != ctx && + null != ctx.getQB() && + null != ctx.getQB().getMetaData() && + null != ctx.getQB().getMetaData().getAliasToTable()) { + tbl = ctx.getQB().getMetaData().getTableForAlias(tableAlias); + } + + return tbl; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java (working copy) @@ -21,7 +21,6 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -43,6 +42,7 @@ import org.apache.hadoop.hive.ql.parse.SplitSample; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim; import org.apache.hadoop.hive.shims.HadoopShims.InputSplitShim; import org.apache.hadoop.hive.shims.ShimLoader; @@ -224,10 +224,10 @@ // Splits are not shared across different partitions with different input formats. // For example, 2 partitions (1 sequencefile and 1 rcfile) will have 2 different splits private static class CombinePathInputFormat { - private final List> opList; + private final List> opList; private final String inputFormatClassName; - public CombinePathInputFormat(List> opList, + public CombinePathInputFormat(List> opList, String inputFormatClassName) { this.opList = opList; this.inputFormatClassName = inputFormatClassName; @@ -259,7 +259,7 @@ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { init(job); Map> pathToAliases = mrwork.getPathToAliases(); - Map> aliasToWork = + Map> aliasToWork = mrwork.getAliasToWork(); CombineFileInputFormatShim combine = ShimLoader.getHadoopShims() .getCombineFileInputFormat(); @@ -341,7 +341,7 @@ // Does a pool exist for this path already CombineFilter f = null; - List> opList = null; + List> opList = null; boolean done = false; if (!mrwork.isMapperCannotSpanPartns()) { Index: ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (working copy) @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.io; import java.io.IOException; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -31,15 +30,16 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import 
org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; +import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.InputFormat; @@ -385,11 +385,11 @@ * @param aliasToWork The operator tree to be invoked for a given alias * @param dir The path to look for **/ - public static List> doGetWorksFromPath( + public static List> doGetWorksFromPath( Map> pathToAliases, - Map> aliasToWork, Path dir) { - List> opList = - new ArrayList>(); + Map> aliasToWork, Path dir) { + List> opList = + new ArrayList>(); List aliases = doGetAliasesFromPath(pathToAliases, dir); for (String alias : aliases) { Index: ql/src/java/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.java (working copy) @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.index.compact; -import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; @@ -63,8 +62,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; public class CompactIndexHandler extends TableBasedIndexHandler { @@ -252,9 +250,11 @@ * @param operators * @return whether or not it has found its target */ - private boolean findIndexColumnFilter(Collection> operators) { - for (Operator op : operators) { - if (op instanceof FilterOperator && ((FilterOperator)op).getConf().getPredicate().getChildren() != null) { + private boolean findIndexColumnFilter( + Collection> operators) { + for (Operator op : operators) { + if (op instanceof FilterOperator && + ((FilterOperator)op).getConf().getPredicate().getChildren() != null) { // Is this the target if (findIndexColumnExprNodeDesc(((FilterOperator)op).getConf().getPredicate())) { ((FilterOperator)op).getConf().setSortedFilter(true); Index: ql/src/java/org/apache/hadoop/hive/ql/Driver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/Driver.java (revision 1377741) +++ ql/src/java/org/apache/hadoop/hive/ql/Driver.java (working copy) @@ -99,6 +99,7 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.ql.util.SerializableCloneable; import org.apache.hadoop.hive.serde2.ByteStream; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.mapred.ClusterStatus; @@ -583,9 +584,9 @@ ParseContext parseCtx = querySem.getParseContext(); Map tsoTopMap = parseCtx.getTopToTable(); - for (Map.Entry> topOpMap : querySem + for (Map.Entry> topOpMap : querySem .getParseContext().getTopOps().entrySet()) { - Operator topOp = 
topOpMap.getValue();
+      Operator topOp = topOpMap.getValue();
       if (topOp instanceof TableScanOperator && tsoTopMap.containsKey(topOp)) {
         TableScanOperator tableScanOp = (TableScanOperator) topOp;
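Tying the utility pieces together: ParseUtils.getTableFromParseContext(), added earlier in this patch, gives optimizer code a null-safe route from a table alias to its Table object, and the corrected Table.getSkewedColNames() call (see the QB.java hunk) exposes the skew metadata this change is built around. A usage sketch in which only those two calls come from the patch; the surrounding method and the alias handling are illustrative:

    import java.util.List;

    import org.apache.hadoop.hive.ql.metadata.Table;
    import org.apache.hadoop.hive.ql.parse.ParseContext;
    import org.apache.hadoop.hive.ql.parse.ParseUtils;

    public final class SkewLookupExample {
      private SkewLookupExample() {
      }

      // True when the alias resolves to a table declared with SKEWED BY columns.
      public static boolean hasSkewedColumns(ParseContext ctx, String alias) {
        Table tbl = ParseUtils.getTableFromParseContext(ctx, alias);
        if (tbl == null) {
          return false;
        }
        List<String> skewedCols = tbl.getSkewedColNames();
        return skewedCols != null && !skewedCols.isEmpty();
      }
    }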