Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_4.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketsortoptimize_insert_4.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_4.q.out (working copy) @@ -0,0 +1,446 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key2) SORTED BY (key2) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key2) SORTED BY (key2) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: 
default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the insert is happening on the bucketing position +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the insert is happening on the bucketing position +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = 
'1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. 
(TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col0 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: 
type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, 
type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +2 2 val_2val_2 1 +4 4 val_4val_4 1 +8 8 val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE 
[(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +9 9 val_9val_9 1 +PREHOOK: query: DROP TABLE test_table3 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_table3 +PREHOOK: Output: default@test_table3 +POSTHOOK: query: DROP TABLE test_table3 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_table3 +POSTHOOK: Output: default@test_table3 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (value) SORTED BY (value) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (value) SORTED BY 
(value) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation, since the insert is happening on a non-bucketing position +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation, since the insert is happening on a non-bucketing position +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: 
test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. 
(TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE 
TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: 
default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +0 val_0 1 +0 val_0 1 +0 val_0 1 +0 val_0 1 +0 val_0 1 +0 val_0 1 +0 val_0 1 +0 val_0 1 +0 val_0 1 +2 val_2 1 +4 val_4 1 +8 val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, 
comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +5 val_5 1 +5 val_5 1 +5 val_5 1 +5 val_5 1 +5 val_5 1 +5 val_5 1 +5 val_5 1 +5 val_5 1 +5 val_5 1 +9 val_9 1 +PREHOOK: query: DROP TABLE test_table3 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_table3 +PREHOOK: Output: default@test_table3 +POSTHOOK: query: DROP TABLE test_table3 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_table3 +POSTHOOK: Output: default@test_table3 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, 
comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_8.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketsortoptimize_insert_8.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_8.q.out (working copy) @@ -0,0 +1,389 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: CREATE TABLE test_table3 (key INT, key2 INT, value STRING) 
PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another 
bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, b.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, b.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. 
(TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col5, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col5 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, b.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, b.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = 
'1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION 
[(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +2 2 val_2val_2 1 +4 4 val_4val_4 1 +8 8 val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 
PARTITION(ds=1).key2 SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +9 9 val_9val_9 1 +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT b.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT b.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 
PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col5, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col5 + type: int + expr: _col0 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT b.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT b.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 
PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: 
test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +0 0 val_0val_0 1 +2 2 val_2val_2 1 +4 4 val_4val_4 1 +8 8 val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 
PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +5 5 val_5val_5 1 +9 9 val_9val_9 1 Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_3.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketsortoptimize_insert_3.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_3.q.out (working copy) @@ -0,0 +1,326 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (value STRING, key INT) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (value STRING, key INT) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM 
src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- The bucketing positions dont match - although the actual bucketing do. +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.value, x.key from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- The bucketing positions dont match - although the actual bucketing do. +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.value, x.key from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL a) ds) '1')))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table2) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) value)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL x) key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x:a + TableScan + alias: a + Select Operator + expressions: + expr: value + type: string + expr: key + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.value, x.key from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.value, x.key from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, 
type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select count(*) from test_table2 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +500 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value 
SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +247 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +253 +PREHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (value) SORTED BY (value) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (value) SORTED BY (value) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: -- 
Insert data into the bucketed table by selecting from another bucketed table +-- The bucketing positions dont match - this should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- The bucketing positions dont match - this should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL a) ds) '1')))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table2) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL x) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x:a + TableScan + alias: a + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: UDFToInteger(_col1) + type: int + sort order: + + Map-reduce partition columns: + expr: UDFToInteger(_col1) + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Reduce Operator Tree: + Extract + Select Operator + expressions: + expr: _col0 + type: int + expr: UDFToInteger(_col1) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: 
Input: default@test_table1@ds=1 +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select count(*) from test_table2 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, 
type:string, comment:null), ] +500 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +500 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, 
type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +0 Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_7.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketsortoptimize_insert_7.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_7.q.out (working copy) @@ -0,0 +1,592 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds 
STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON 
a.key = b.key WHERE a.ds = '1' and b.ds = '1' +and (a.key = 0 or a.key = 5) +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +and (a.key = 0 or a.key = 5) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1')) (or (= (. (TOK_TABLE_OR_COL a) key) 0) (= (. 
(TOK_TABLE_OR_COL a) key) 5)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key = 0) or (key = 5)) + type: boolean + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +and (a.key = 0 or a.key = 5) +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 
b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +and (a.key = 0 or a.key = 5) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] 
+POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 
val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +PREHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and (key = 0 or key = 5)) a +JOIN +(select key, value from test_table2 where ds = '1' and (key = 0 or key = 5)) b +ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and (key = 0 or key = 5)) a +JOIN +(select key, value from test_table2 where ds = '1' and (key = 0 or key = 5)) b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (and (= (TOK_TABLE_OR_COL ds) '1') (or (= (TOK_TABLE_OR_COL key) 0) (= (TOK_TABLE_OR_COL key) 
5)))))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (and (= (TOK_TABLE_OR_COL ds) '1') (or (= (TOK_TABLE_OR_COL key) 0) (= (TOK_TABLE_OR_COL key) 5)))))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:test_table1 + TableScan + alias: test_table1 + Filter Operator + predicate: + expr: ((key = 0) or (key = 5)) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col3) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and (key = 0 or key = 5)) a +JOIN +(select key, value from test_table2 where ds = '1' and (key = 0 or key = 5)) b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and (key = 0 or key = 5)) a +JOIN +(select key, value from test_table2 where ds = '1' and (key = 0 or key = 5)) b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION 
[(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value 
EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 
+5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +PREHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and key < 8) a +JOIN +(select key, value from test_table2 where ds = '1' and key < 8) b +ON a.key = b.key +WHERE a.key = 0 or a.key = 5 +PREHOOK: type: QUERY +POSTHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and key < 8) a +JOIN +(select key, value from test_table2 where ds = '1' and key < 8) b +ON a.key = b.key +WHERE a.key = 0 or a.key = 5 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), 
(test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (and (= (TOK_TABLE_OR_COL ds) '1') (< (TOK_TABLE_OR_COL key) 8))))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (and (= (TOK_TABLE_OR_COL ds) '1') (< (TOK_TABLE_OR_COL key) 8))))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (or (= (. (TOK_TABLE_OR_COL a) key) 0) (= (. 
(TOK_TABLE_OR_COL a) key) 5))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:test_table1 + TableScan + alias: test_table1 + Filter Operator + predicate: + expr: ((key < 8) and ((key = 0) or (key = 5))) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col3) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and key < 8) a +JOIN +(select key, value from test_table2 where ds = '1' and key < 8) b +ON a.key = b.key +WHERE a.key = 0 or a.key = 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: 
default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and key < 8) a +JOIN +(select key, value from test_table2 where ds = '1' and key < 8) b +ON a.key = b.key +WHERE a.key = 0 or a.key = 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] 
+POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), 
(test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_2.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketsortoptimize_insert_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_2.q.out (working copy) @@ -0,0 +1,1384 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds 
STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '2') SELECT * where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=2 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '2') 
SELECT * where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table1@ds=2 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '2') SELECT * where key < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table2@ds=2 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '2') SELECT * where key < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table2@ds=2 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, 
b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION 
[(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 
+0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +2 val_2val_2 1 +4 val_4val_4 1 +8 val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5 1 +5 val_5val_5 1 +5 
val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +9 val_9val_9 1 +PREHOOK: query: -- Since more than one partition of 'a' (the big table) is being selected, +-- it should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds is not null and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Since more than one partition of 'a' (the big table) is being selected, +-- it should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds is not null and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: 
test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (TOK_FUNCTION TOK_ISNOTNULL (. (TOK_TABLE_OR_COL a) ds)) (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + 
ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds is not null and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table1@ds=2 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds is not null and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table1@ds=2 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, 
comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, 
type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +2 val_2val_2 1 +2 val_2val_2 1 +4 val_4val_4 1 +4 val_4val_4 1 +8 val_8val_8 1 +8 val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION 
[(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +9 val_9val_9 1 +9 val_9val_9 1 +PREHOOK: query: -- Since a single partition of the big table 
('a') is being selected, it should be a map-only +-- job even though multiple partitions of 'b' are being selected +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds is not null +PREHOOK: type: QUERY +POSTHOOK: query: -- Since a single partition of the big table ('a') is being selected, it should be a map-only +-- job even though multiple partitions of 'b' are being selected +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds is not null +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, 
type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (TOK_FUNCTION TOK_ISNOTNULL (. (TOK_TABLE_OR_COL b) ds)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + 
table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds is not null +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Input: default@test_table2@ds=2 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds is not null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Input: default@test_table2@ds=2 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] 
+POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 
PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 
1 +2 val_2val_2 1 +2 val_2val_2 1 +4 val_4val_4 1 +4 val_4val_4 1 +8 val_8val_8 1 +8 val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE 
[(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +9 val_9val_9 1 +9 val_9val_9 1 +PREHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR 
TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:test_table1 + TableScan + alias: test_table1 + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col3) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, 
concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, 
type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 
PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, 
comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +2 val_2val_2 1 +4 val_4val_4 1 +8 val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, 
type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +9 val_9val_9 1 +PREHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.v1, b.v2) +FROM +(select key, concat(value, value) as v1 from test_table1 where ds = '1') a +JOIN +(select key, concat(value, value) as v2 from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.v1, b.v2) +FROM +(select key, concat(value, value) as v1 from test_table1 where ds = '1') a +JOIN +(select key, concat(value, value) as v2 from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: 
test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, 
comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL value)) v1)) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL value)) v2)) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) v1) (. 
(TOK_TABLE_OR_COL b) v2)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:test_table1 + TableScan + alias: test_table1 + Select Operator + expressions: + expr: key + type: int + expr: concat(value, value) + type: string + outputColumnNames: _col0, _col1 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col3) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.v1, b.v2) +FROM +(select key, concat(value, value) as v1 from test_table1 where ds = '1') a +JOIN +(select key, concat(value, value) as v2 from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: 
query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.v1, b.v2) +FROM +(select key, concat(value, value) as v1 from test_table1 where ds = '1') a +JOIN +(select key, concat(value, value) as v2 from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: 
Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 
PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, 
type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +0 val_0val_0val_0val_0 1 +2 val_2val_2val_2val_2 1 +4 val_4val_4val_4val_4 1 +8 val_8val_8val_8val_8 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: 
test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +5 val_5val_5val_5val_5 1 +5 val_5val_5val_5val_5 1 +5 
val_5val_5val_5val_5 1 +5 val_5val_5val_5val_5 1 +5 val_5val_5val_5val_5 1 +5 val_5val_5val_5val_5 1 +5 val_5val_5val_5val_5 1 +5 val_5val_5val_5val_5 1 +5 val_5val_5val_5val_5 1 +9 val_9val_9val_9val_9 1 +PREHOOK: query: -- This should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key+a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- This should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key+a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table2))) (TOK_INSERT 
(TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (+ (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL a) key))) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:test_table1 + TableScan + alias: test_table1 + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: (_col0 + _col0) + type: int + expr: concat(_col1, _col3) + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key+a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key+a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: 
Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key EXPRESSION [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: 
test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION 
[(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key EXPRESSION [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 
val_0val_0 1 +4 val_2val_2 1 +8 val_4val_4 1 +10 val_5val_5 1 +10 val_5val_5 1 +10 val_5val_5 1 +10 val_5val_5 1 +10 val_5val_5 1 +10 val_5val_5 1 +10 val_5val_5 1 +10 val_5val_5 1 +10 val_5val_5 1 +16 val_8val_8 1 +18 val_9val_9 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), 
(test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key EXPRESSION [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_6.q.out =================================================================== --- 
ql/src/test/results/clientpositive/bucketsortoptimize_insert_6.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_6.q.out (working copy) @@ -0,0 +1,1254 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT key, key+1, value where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT key, key+1, value where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@src +POSTHOOK: Output: default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT key, key+1, value where key < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT key, key+1, value where key < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the sort-order matches +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key2, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 
= b.key2 WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the sort-order matches +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key2, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) key2) (. (TOK_TABLE_OR_COL b) key2))))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key2)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. 
(TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {key2} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key], Column[key2]] + 1 [Column[key], Column[key2]] + outputColumnNames: _col0, _col1, _col2, _col8 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: concat(_col2, _col8) + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key2, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key2, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 
b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: 
Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +2 3 val_2val_2 1 +4 5 val_4val_4 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +8 9 val_8val_8 1 +9 10 val_9val_9 1 +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the sort-order matches +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key, subq1.key2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the sort-order matches +EXPLAIN +INSERT OVERWRITE TABLE test_table3 
PARTITION (ds = '1') +SELECT subq1.key, subq1.key2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) key2) (. (TOK_TABLE_OR_COL b) key2))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key2)) (TOK_SELEXPR (TOK_FUNCTION concat (. 
(TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)) value)) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key2)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {key2} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key], Column[key2]] + 1 [Column[key], Column[key2]] + outputColumnNames: _col0, _col1, _col2, _col8 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: concat(_col2, _col8) + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key, subq1.key2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN 
test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key, subq1.key2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), 
(test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, 
comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE 
[(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +2 3 val_2val_2 1 +4 5 val_4val_4 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +8 9 val_8val_8 1 +9 10 val_9val_9 1 +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key2, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key2, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: 
type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) key2) (. 
(TOK_TABLE_OR_COL b) key2))))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key2)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {key2} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key], Column[key2]] + 1 [Column[key], Column[key2]] + outputColumnNames: _col0, _col1, _col2, _col8 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: int + expr: concat(_col2, _col8) + type: string + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: +- + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE 
[(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) key2) (. (TOK_TABLE_OR_COL b) key2))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key2)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)) value)) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key2)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL subq1) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {key2} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key], Column[key2]] + 1 [Column[key], Column[key2]] + outputColumnNames: _col0, _col1, _col2, _col8 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: int + expr: concat(_col2, _col8) + type: string + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: +- + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.key, subq2.key2, subq2.value from +( 
+SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.key, subq2.key2, subq2.value from +( +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: 
Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) key2) (. (TOK_TABLE_OR_COL b) key2))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key2)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)) value)) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key2)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) value))))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) key2)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL subq2) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {key2} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key], Column[key2]] + 1 [Column[key], Column[key2]] + outputColumnNames: _col0, _col1, _col2, _col8 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: concat(_col2, _col8) + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.key, subq2.key2, subq2.value from +( +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: 
INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.key, subq2.key2, subq2.value from +( +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE 
[(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 
PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION 
[(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 
val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +2 3 val_2val_2 1 +4 5 val_4val_4 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +8 9 val_8val_8 1 +9 10 val_9val_9 1 +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, 
comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) key2) (. (TOK_TABLE_OR_COL b) key2))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL a) key2)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)) value)) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key2) k1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key) k2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) value))))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) k2)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) k1)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {key2} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key], Column[key2]] + 1 [Column[key], Column[key2]] + outputColumnNames: _col0, _col1, _col2, _col8 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: concat(_col2, _col8) + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 
PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' 
+PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), 
(test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 
PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), 
(test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +0 1 val_0val_0 1 +2 3 val_2val_2 1 +4 5 val_4val_4 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +5 6 val_5val_5 1 +8 9 val_8val_8 1 +9 10 val_9val_9 1 +PREHOOK: query: CREATE TABLE test_table4 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key DESC, key2 DESC) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table4 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key DESC, key2 DESC) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table4 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION 
[(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table4 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 
+PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table4 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, 
comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key2 SIMPLE [(test_table1)a.FieldSchema(name:key2, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) key2) (. (TOK_TABLE_OR_COL b) key2))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key2)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)) value)) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key2) k1) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL subq1) key) k2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) value))))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table4) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) k2)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) k1)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {key2} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key], Column[key2]] + 1 [Column[key], Column[key2]] + outputColumnNames: _col0, _col1, _col2, _col8 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: concat(_col2, _col8) + type: string + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: -- + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table4 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: 
default.test_table4 + + Stage: Stage-2 + Stats-Aggr Operator + + Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_1.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketsortoptimize_insert_1.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_1.q.out (working copy) @@ -0,0 +1,525 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should 
be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL a) ds) '1')))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table2) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL x) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x:a + TableScan + alias: a + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE 
[(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select count(*) from test_table2 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +500 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: 
test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +247 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +253 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT * from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT * from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] 
+ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL a) ds) '1')))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table2) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x:a + TableScan + alias: a + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT * from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT * from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: 
Input: default@test_table1@ds=1 +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select count(*) from test_table2 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, 
type:string, comment:null), ] +500 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +247 +PREHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, 
comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +253 +PREHOOK: query: -- it should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, concat(x.value, x.value) from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +PREHOOK: type: QUERY +POSTHOOK: query: -- it should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, concat(x.value, x.value) from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR 
(. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL a) ds) '1')))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table2) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL x) value) (. (TOK_TABLE_OR_COL x) value)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x:a + TableScan + alias: a + Select Operator + expressions: + expr: key + type: int + expr: concat(value, value) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: -- it should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key+x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +PREHOOK: type: QUERY +POSTHOOK: query: -- it should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key+x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, 
comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL a) ds) '1')))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table2) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (+ (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL x) key))) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL x) value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x:a + TableScan + alias: a + Select Operator + expressions: + expr: (key + key) + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: -- it should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.k1, concat(x.v1, x.v1) from +( +SELECT a.key as k1, a.value as v1 FROM test_table1 a WHERE a.ds = '1' +)x +PREHOOK: type: QUERY +POSTHOOK: query: -- it should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.k1, concat(x.v1, x.v1) from +( +SELECT a.key as k1, a.value as v1 FROM test_table1 a WHERE a.ds = '1' +)x +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(test_table1)a.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) k1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) v1)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL a) ds) '1')))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table2) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) k1)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL x) v1) (. 
(TOK_TABLE_OR_COL x) v1)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + x:a + TableScan + alias: a + Select Operator + expressions: + expr: key + type: int + expr: concat(value, value) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table2 + + Stage: Stage-2 + Stats-Aggr Operator + + Index: ql/src/test/results/clientpositive/bucketsortoptimize_insert_5.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketsortoptimize_insert_5.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketsortoptimize_insert_5.q.out (working copy) @@ -0,0 +1,424 @@ +PREHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table1 +PREHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: 
type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table2 +PREHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key desc) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key desc) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_table3 +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table1@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table1@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_table2@ds=1 +POSTHOOK: query: FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_table2@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 
PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation, since the sort-order does not match +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +POSTHOOK: query: -- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation, since the sort-order does not match +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME test_table1) a) (TOK_TABREF (TOK_TABNAME test_table2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. 
(TOK_TABLE_OR_COL b) value)))) (TOK_WHERE (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '1'))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col6 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col6) + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: - + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: 
default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 
PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +8 val_8val_8 1 +4 val_4val_4 1 +2 val_2val_2 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: 
test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +9 val_9val_9 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +PREHOOK: query: -- This should be a map-reduce job since the sort order does not match +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- This should be a map-reduce job since the sort order does not match +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF 
(TOK_TABNAME test_table1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_table2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')))) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_table3) (TOK_PARTSPEC (TOK_PARTVAL ds '1')))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL b) value)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:test_table1 + TableScan + alias: test_table1 + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + outputColumnNames: _col0, _col1 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: concat(_col1, _col3) + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: - + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 1 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table3 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table1 +PREHOOK: Input: default@test_table1@ds=1 +PREHOOK: Input: default@test_table2 +PREHOOK: Input: default@test_table2@ds=1 +PREHOOK: Output: default@test_table3@ds=1 +POSTHOOK: query: INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table1 +POSTHOOK: Input: default@test_table1@ds=1 +POSTHOOK: Input: default@test_table2 +POSTHOOK: Input: default@test_table2@ds=1 +POSTHOOK: Output: default@test_table3@ds=1 +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: 
test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] 
+POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +8 val_8val_8 1 +4 val_4val_4 1 +2 val_2val_2 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +0 val_0val_0 1 +PREHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table3 +PREHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table3 +POSTHOOK: Input: default@test_table3@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_table2 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)test_table1.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)test_table1.FieldSchema(name:value, type:string, comment:null), (test_table2)test_table2.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: test_table3 PARTITION(ds=1).key SIMPLE [(test_table1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: 
test_table3 PARTITION(ds=1).value EXPRESSION [(test_table1)a.FieldSchema(name:value, type:string, comment:null), (test_table2)b.FieldSchema(name:value, type:string, comment:null), ] +9 val_9val_9 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 +5 val_5val_5 1 Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_2.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_2.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketsortoptimize_insert_2.q (working copy) @@ -0,0 +1,141 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.sortmerge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.auto.convert.sortmerge.join.bigtable.selection.policy=org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10; + +FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '2') SELECT * where key < 10; + +FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '2') SELECT * where key < 100; + +-- Insert data into the bucketed table by 
selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- Since more than one partition of 'a' (the big table) is being selected, +-- it should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds is not null and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds is not null and b.ds = '1'; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- Since a single partition of the big table ('a') is being selected, it should be a map-only +-- job even though multiple partitions of 'b' are being selected +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds is not null; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds is not null; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; 
+ +-- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.v1, b.v2) +FROM +(select key, concat(value, value) as v1 from test_table1 where ds = '1') a +JOIN +(select key, concat(value, value) as v2 from test_table2 where ds = '1') b +ON a.key = b.key; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.v1, b.v2) +FROM +(select key, concat(value, value) as v1 from test_table1 where ds = '1') a +JOIN +(select key, concat(value, value) as v2 from test_table2 where ds = '1') b +ON a.key = b.key; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- This should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key+a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key+a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key; + +select * from test_table3 
tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_6.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_6.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketsortoptimize_insert_6.q (working copy) @@ -0,0 +1,154 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.sortmerge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.auto.convert.sortmerge.join.bigtable.selection.policy=org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS; +CREATE TABLE test_table2 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS; +CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key ASC, key2 DESC) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT key, key+1, value where key < 10; + +FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT key, key+1, value where key < 100; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the sort-order matches +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key2, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and 
a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key2, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1'; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the sort-order matches +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key, subq1.key2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key, subq1.key2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key2, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1'; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON 
a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.key, subq2.key2, subq2.value from +( +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.key, subq2.key2, subq2.value from +( +SELECT subq1.key2, subq1.key, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 
tablesample (bucket 2 out of 2) s where ds = '1'; + +CREATE TABLE test_table4 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key, key2) SORTED BY (key DESC, key2 DESC) INTO 2 BUCKETS; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table4 PARTITION (ds = '1') +SELECT subq2.k2, subq2.k1, subq2.value from +( +SELECT subq1.key2 as k1, subq1.key as k2, subq1.value from +( +SELECT a.key, a.key2, concat(a.value, b.value) as value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key and a.key2 = b.key2 WHERE a.ds = '1' and b.ds = '1' +)subq1 +)subq2; Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_3.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_3.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketsortoptimize_insert_3.q (working copy) @@ -0,0 +1,50 @@ +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table2 (value STRING, key INT) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT *; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- The bucketing positions dont match - although the actual bucketing do. 
+-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.value, x.key from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x; + +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.value, x.key from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x; + +select count(*) from test_table2 where ds = '1'; +select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1'; +select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1'; + +CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (value) SORTED BY (value) INTO 2 BUCKETS; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- The bucketing positions dont match - this should be a map-reduce operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x; + +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +(SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1')x; + +select count(*) from test_table2 where ds = '1'; +select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1'; +select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1'; Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_7.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_7.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketsortoptimize_insert_7.q (working copy) @@ -0,0 +1,86 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.sortmerge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set 
hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.auto.convert.sortmerge.join.bigtable.selection.policy=org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10; + +FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +and (a.key = 0 or a.key = 5); + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1' +and (a.key = 0 or a.key = 5); + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and (key = 0 or key = 5)) a +JOIN +(select key, value from test_table2 where ds = '1' and (key = 0 or key = 5)) b +ON a.key = b.key; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value 
from test_table1 where ds = '1' and (key = 0 or key = 5)) a +JOIN +(select key, value from test_table2 where ds = '1' and (key = 0 or key = 5)) b +ON a.key = b.key; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- This should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and key < 8) a +JOIN +(select key, value from test_table2 where ds = '1' and key < 8) b +ON a.key = b.key +WHERE a.key = 0 or a.key = 5; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1' and key < 8) a +JOIN +(select key, value from test_table2 where ds = '1' and key < 8) b +ON a.key = b.key +WHERE a.key = 0 or a.key = 5; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_4.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_4.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketsortoptimize_insert_4.q (working copy) @@ -0,0 +1,63 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.sortmerge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.auto.convert.sortmerge.join.bigtable.selection.policy=org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) 
PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key2) SORTED BY (key2) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10; + +FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation, since the insert is happening on the bucketing position +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +DROP TABLE test_table3; + +CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (value) SORTED BY (value) INTO 2 BUCKETS; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation, since the insert is happening on a non-bucketing position +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, a.value +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +select * from test_table3 
tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +DROP TABLE test_table3; Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_8.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_8.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketsortoptimize_insert_8.q (working copy) @@ -0,0 +1,56 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.sortmerge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.auto.convert.sortmerge.join.bigtable.selection.policy=org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table3 (key INT, key2 INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10; + +FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, b.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, b.key, 
concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT b.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT b.key, a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_1.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_1.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketsortoptimize_insert_1.q (working copy) @@ -0,0 +1,76 @@ +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT *; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- 
This should be a map-only operation +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x; + +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x; + +select count(*) from test_table2 where ds = '1'; +select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1'; +select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1'; + +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT * from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x; + +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT * from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x; + +select count(*) from test_table2 where ds = '1'; +select count(*) from test_table2 tablesample (bucket 1 out of 2) s where ds = '1'; +select count(*) from test_table2 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- it should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key, concat(x.value, x.value) from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x; + +-- it should be a map-reduce job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.key+x.key, x.value from +( +SELECT a.key, a.value FROM test_table1 a WHERE a.ds = '1' +)x; + +-- it should be a map-only job +EXPLAIN +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') +SELECT x.k1, concat(x.v1, x.v1) from +( +SELECT a.key as k1, a.value as v1 FROM test_table1 a WHERE a.ds = '1' +)x; Index: ql/src/test/queries/clientpositive/bucketsortoptimize_insert_5.q =================================================================== --- ql/src/test/queries/clientpositive/bucketsortoptimize_insert_5.q (revision 0) +++ 
ql/src/test/queries/clientpositive/bucketsortoptimize_insert_5.q (working copy) @@ -0,0 +1,61 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.sortmerge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.auto.convert.sortmerge.join.bigtable.selection.policy=org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ; + +-- Create two bucketed and sorted tables +CREATE TABLE test_table1 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table2 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) +CLUSTERED BY (key) SORTED BY (key desc) INTO 2 BUCKETS; + +FROM src +INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT * where key < 10; + +FROM src +INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT * where key < 100; + +-- Insert data into the bucketed table by selecting from another bucketed table +-- This should be a map-reduce operation, since the sort-order does not match +EXPLAIN +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM test_table1 a JOIN test_table2 b +ON a.key = b.key WHERE a.ds = '1' and b.ds = '1'; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; + +-- This should be a map-reduce job since the sort order does not match +EXPLAIN +INSERT OVERWRITE 
TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key; + +INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') +SELECT a.key, concat(a.value, b.value) +FROM +(select key, value from test_table1 where ds = '1') a +JOIN +(select key, value from test_table2 where ds = '1') b +ON a.key = b.key; + +select * from test_table3 tablesample (bucket 1 out of 2) s where ds = '1'; +select * from test_table3 tablesample (bucket 2 out of 2) s where ds = '1'; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketingSortingReduceSinkOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketingSortingReduceSinkOptimizer.java (revision 1467159) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketingSortingReduceSinkOptimizer.java (working copy) @@ -25,11 +25,8 @@ import java.util.Map; import java.util.Stack; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hive.common.ObjectPair; -import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.ql.exec.ExtractOperator; @@ -37,6 +34,7 @@ import org.apache.hadoop.hive.ql.exec.FilterOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; @@ -56,6 +54,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import 
org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; /** @@ -64,12 +63,15 @@ * insert overwrite table T1 select * from T2; * where T1 and T2 are bucketized/sorted on the same keys, we don't need a reducer to * enforce bucketing and sorting. + * + * It also optimizes queries of the form: + * insert overwrite table T3 + * select * from T1 join T2 on T1.key = T2.key + * where T1, T2 and T3 are bucketized/sorted on the same key 'key', we don't need a reducer + * to enforce bucketing and sorting */ public class BucketingSortingReduceSinkOptimizer implements Transform { - private static final Log LOG = LogFactory.getLog(BucketingSortingReduceSinkOptimizer.class - .getName()); - public BucketingSortingReduceSinkOptimizer() { } @@ -77,7 +79,6 @@ public ParseContext transform(ParseContext pctx) throws SemanticException { Map opRules = new LinkedHashMap(); - HiveConf conf = pctx.getConf(); // process reduce sink added by hive.enforce.bucketing or hive.enforce.sorting opRules.put(new RuleRegExp("R1", @@ -90,7 +91,7 @@ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null); GraphWalker ogw = new DefaultGraphWalker(disp); - // Create a list of topop nodes + // Create a list of top nodes ArrayList topNodes = new ArrayList(); topNodes.addAll(pctx.getTopOps().values()); ogw.startWalking(topNodes, null); @@ -117,7 +118,6 @@ * */ public class BucketSortReduceSinkProcessor implements NodeProcessor { - protected ParseContext pGraphContext; public BucketSortReduceSinkProcessor(ParseContext pGraphContext) { @@ -142,28 +142,33 @@ } // Get the sort positions and sort order for the table - private List> getSortPositions(List tabSortCols, + // The sort order contains whether the sorting is happening ascending or descending + private ObjectPair, List> getSortPositionsOrder( + List tabSortCols, List tabCols) { - List> posns = new ArrayList>(); + List sortPositions = new
ArrayList(); + List sortOrders = new ArrayList(); for (Order sortCol : tabSortCols) { int pos = 0; for (FieldSchema tabCol : tabCols) { if (sortCol.getCol().equals(tabCol.getName())) { - posns.add(new ObjectPair(pos, sortCol.getOrder())); + sortPositions.add(pos); + sortOrders.add(sortCol.getOrder()); break; } pos++; } } - return posns; + return new ObjectPair, List>(sortPositions, sortOrders); } - // Return true if the parition is bucketed/sorted by the specified positions + // Return true if the partition is bucketed/sorted by the specified positions // The number of buckets, the sort order should also match along with the // columns which are bucketed/sorted private boolean checkPartition(Partition partition, List bucketPositionsDest, - List> sortPositionsDest, + List sortPositionsDest, + List sortOrderDest, int numBucketsDest) { // The bucketing and sorting positions should exactly match int numBuckets = partition.getBucketCount(); @@ -173,10 +178,11 @@ List partnBucketPositions = getBucketPositions(partition.getBucketCols(), partition.getTable().getCols()); - List> partnSortPositions = - getSortPositions(partition.getSortCols(), partition.getTable().getCols()); + ObjectPair, List> partnSortPositionsOrder = + getSortPositionsOrder(partition.getSortCols(), partition.getTable().getCols()); return bucketPositionsDest.equals(partnBucketPositions) && - sortPositionsDest.equals(partnSortPositions); + sortPositionsDest.equals(partnSortPositionsOrder.getFirst()) && + sortOrderDest.equals(partnSortPositionsOrder.getSecond()); } // Return true if the table is bucketed/sorted by the specified positions @@ -184,7 +190,8 @@ // columns which are bucketed/sorted private boolean checkTable(Table table, List bucketPositionsDest, - List> sortPositionsDest, + List sortPositionsDest, + List sortOrderDest, int numBucketsDest) { // The bucketing and sorting positions should exactly match int numBuckets = table.getNumBuckets(); @@ -194,12 +201,17 @@ List tableBucketPositions = 
getBucketPositions(table.getBucketCols(), table.getCols()); - List> tableSortPositions = - getSortPositions(table.getSortCols(), table.getCols()); + ObjectPair, List> tableSortPositionsOrder = + getSortPositionsOrder(table.getSortCols(), table.getCols()); return bucketPositionsDest.equals(tableBucketPositions) && - sortPositionsDest.equals(tableSortPositions); + sortPositionsDest.equals(tableSortPositionsOrder.getFirst()) && + sortOrderDest.equals(tableSortPositionsOrder.getSecond()); } + // Store the bucket path to bucket number mapping in the table scan operator. + // Although one mapper per file is used (BucketizedHiveInputFormat), it is possible that + // any mapper can pick up any file (depending on the size of the files). The bucket number + // corresponding to the input file is stored to name the output bucket file appropriately. private void storeBucketPathMapping(TableScanOperator tsOp, FileStatus[] srcs) { Map bucketFileNameMapping = new HashMap(); for (int pos = 0; pos < srcs.length; pos++) { @@ -222,12 +234,12 @@ // Store the mapping -> path, bucket number // This is needed since for the map-only job, any mapper can process any file. // For eg: if mapper 1 is processing the file corresponding to bucket 2, it should - // also output the file correspodning to bucket 2 of the output. + // also output the file corresponding to bucket 2 of the output.
storeBucketPathMapping(tsOp, srcs); } // Remove the reduce sink operator - // Use bucketized hive input format so that one mapper processes exactly one file + // Use BucketizedHiveInputFormat so that one mapper processes exactly one file private void removeReduceSink(ReduceSinkOperator rsOp, TableScanOperator tsOp, FileSinkOperator fsOp) { @@ -251,6 +263,97 @@ return -1; } + // The output columns for the destination table should match with the join keys + // This is to handle queries of the form: + // insert overwrite table T3 + // select T1.key, T1.key2, UDF(T1.value, T2.value) + // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2 + // where T1, T2 and T3 are bucketized/sorted on key and key2 + // Assuming T1 is the table on which the mapper is run, the following is true: + // . The number of buckets for T1 and T3 should be same + // . The bucketing/sorting columns for T1, T2 and T3 should be same + // . The sort order of T1 should match with the sort order for T3. + // . If T1 is partitioned, only a single partition of T1 can be selected. + // . The select list should contain either (T1.key, T1.key2) or (T2.key, T2.key2) + // . After the join, only selects and filters are allowed. + private boolean validateSMBJoinKeys(SMBJoinDesc smbJoinDesc, + List sourceTableBucketCols, + List sourceTableSortCols, + List sortOrder) { + // The sort-merge join creates the output sorted and bucketized by the same columns. + // This can be relaxed in the future if there is a requirement. + if (!sourceTableBucketCols.equals(sourceTableSortCols)) { + return false; + } + + // Get the total number of columns selected, and for each output column, store the + // base table it points to.
For + // insert overwrite table T3 + // select T1.key, T1.key2, UDF(T1.value, T2.value) + // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2 + // the following arrays are created + // [0, 0, 0, 1] --> [T1, T1, T1, T2] (table mapping) + // [0, 1, 2, 0] --> [T1.0, T1.1, T1.2, T2.0] (table columns mapping) + Byte[] tagOrder = smbJoinDesc.getTagOrder(); + Map> retainList = smbJoinDesc.getRetainList(); + int totalNumberColumns = 0; + for (Byte tag : tagOrder) { + totalNumberColumns += retainList.get(tag).size(); + } + + byte[] columnTableMappings = new byte[totalNumberColumns]; + int[] columnNumberMappings = new int[totalNumberColumns]; + int currentColumnPosition = 0; + for (Byte tag : tagOrder) { + for (int pos = 0; pos < retainList.get(tag).size(); pos++) { + columnTableMappings[currentColumnPosition] = tag; + columnNumberMappings[currentColumnPosition] = pos; + currentColumnPosition++; + } + } + + // All output columns used for bucketing/sorting of the destination table should + // belong to the same input table + // insert overwrite table T3 + // select T1.key, T2.key2, UDF(T1.value, T2.value) + // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2 + // is not optimized, whereas the insert is optimized if the select list is either changed to + // (T1.key, T1.key2, UDF(T1.value, T2.value)) or (T2.key, T2.key2, UDF(T1.value, T2.value)) + // Get the input table and make sure the keys match + List outputColumnNames = smbJoinDesc.getOutputColumnNames(); + byte tableTag = -1; + int[] columnNumbersExprList = new int[sourceTableBucketCols.size()]; + int currentColPosition = 0; + for (ExprNodeColumnDesc bucketCol : sourceTableBucketCols) { + String colName = bucketCol.getColumn(); + int colNumber = outputColumnNames.indexOf(colName); + if (colNumber < 0) { + return false; + } + if (tableTag < 0) { + tableTag = columnTableMappings[colNumber]; + } + else if (tableTag != columnTableMappings[colNumber]) { + return false; + } + 
columnNumbersExprList[currentColPosition++] = columnNumberMappings[colNumber]; + } + + List allExprs = smbJoinDesc.getExprs().get(tableTag); + List keysSelectedTable = smbJoinDesc.getKeys().get(tableTag); + currentColPosition = 0; + for (ExprNodeDesc keySelectedTable : keysSelectedTable) { + if (!(keySelectedTable instanceof ExprNodeColumnDesc)) { + return false; + } + if (!allExprs.get(columnNumbersExprList[currentColPosition++]).isSame(keySelectedTable)) { + return false; + } + } + + return true; + } + @Override public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { @@ -283,14 +386,21 @@ if (destTable == null) { return null; } + int numBucketsDestination = destTable.getNumBuckets(); // Get the positions for sorted and bucketed columns // For sorted columns, also get the order (ascending/descending) - that should // also match for this to be converted to a map-only job. + // Get the positions for sorted and bucketed columns + // For sorted columns, also get the order (ascending/descending) - that should + // also match for this to be converted to a map-only job. 
List bucketPositions = getBucketPositions(destTable.getBucketCols(), destTable.getCols()); - List> sortPositions = - getSortPositions(destTable.getSortCols(), destTable.getCols()); + ObjectPair, List> sortOrderPositions = + getSortPositionsOrder(destTable.getSortCols(), destTable.getCols()); + List sortPositions = sortOrderPositions.getFirst(); + List sortOrder = sortOrderPositions.getSecond(); + boolean useBucketSortPositions = true; // Only selects and filters are allowed Operator op = rsOp; @@ -298,119 +408,179 @@ // bucketed/sorted columns for the destination table List sourceTableBucketCols = new ArrayList(); List sourceTableSortCols = new ArrayList(); + op = op.getParentOperators().get(0); while (true) { - if (op.getParentOperators().size() > 1) { - return null; - } - - op = op.getParentOperators().get(0); if (!(op instanceof TableScanOperator) && !(op instanceof FilterOperator) && - !(op instanceof SelectOperator)) { + !(op instanceof SelectOperator) && + !(op instanceof SMBMapJoinOperator)) { return null; } - // nothing to be done for filters - the output schema does not change. - if (op instanceof TableScanOperator) { - Table srcTable = pGraphContext.getTopToTable().get(op); + if (op instanceof SMBMapJoinOperator) { + // Bucketing and sorting keys should exactly match + if (!(bucketPositions.equals(sortPositions))) { + return null; + } + SMBMapJoinOperator smbOp = (SMBMapJoinOperator) op; + SMBJoinDesc smbJoinDesc = smbOp.getConf(); + int posBigTable = smbJoinDesc.getPosBigTable(); - // Find the positions of the bucketed columns in the table corresponding - // to the select list. - // Consider the following scenario: - // T1(key, value1, value2) bucketed/sorted by key into 2 buckets - // T2(dummy, key, value1, value2) bucketed/sorted by key into 2 buckets - // A query like: insert overwrite table T2 select 1, key, value1, value2 from T1 - // should be optimized. 
+ // bail out if the number of join keys doesn't match the number of bucketing keys + List keysBigTable = smbJoinDesc.getKeys().get((byte) posBigTable); + if (keysBigTable.size() != bucketPositions.size()) { + return null; + } - // Start with the destination: T2, bucketed/sorted position is [1] - // At the source T1, the column corresponding to that position is [key], which - // maps to column [0] of T1, which is also bucketed/sorted into the same - // number of buckets - List newBucketPositions = new ArrayList(); - for (int pos = 0; pos < bucketPositions.size(); pos++) { - ExprNodeColumnDesc col = sourceTableBucketCols.get(pos); - String colName = col.getColumn(); - int bucketPos = findColumnPosition(srcTable.getCols(), colName); - if (bucketPos < 0) { - return null; - } - newBucketPositions.add(bucketPos); + if (!validateSMBJoinKeys(smbJoinDesc, sourceTableBucketCols, + sourceTableSortCols, sortOrder)) { + return null; } - // Find the positions/order of the sorted columns in the table corresponding - // to the select list. - List> newSortPositions = - new ArrayList>(); - for (int pos = 0; pos < sortPositions.size(); pos++) { - ExprNodeColumnDesc col = sourceTableSortCols.get(pos); - String colName = col.getColumn(); - int sortPos = findColumnPosition(srcTable.getCols(), colName); - if (sortPos < 0) { + sourceTableBucketCols.clear(); + sourceTableSortCols.clear(); + useBucketSortPositions = false; + + for (ExprNodeDesc keyBigTable : keysBigTable) { + if (!(keyBigTable instanceof ExprNodeColumnDesc)) { return null; } - newSortPositions.add( - new ObjectPair(sortPos, sortPositions.get(pos).getSecond())); + sourceTableBucketCols.add((ExprNodeColumnDesc) keyBigTable); + sourceTableSortCols.add((ExprNodeColumnDesc) keyBigTable); } + // since it is a sort-merge join, only follow the big table + op = op.getParentOperators().get(posBigTable); + } else { + // nothing to be done for filters - the output schema does not change.
+ if (op instanceof TableScanOperator) { + assert !useBucketSortPositions; + Table srcTable = pGraphContext.getTopToTable().get(op); - if (srcTable.isPartitioned()) { - PrunedPartitionList prunedParts = pGraphContext.getOpToPartList().get(op); - List partitions = prunedParts.getNotDeniedPartns(); + // Find the positions of the bucketed columns in the table corresponding + // to the select list. + // Consider the following scenario: + // T1(key, value1, value2) bucketed/sorted by key into 2 buckets + // T2(dummy, key, value1, value2) bucketed/sorted by key into 2 buckets + // A query like: insert overwrite table T2 select 1, key, value1, value2 from T1 + // should be optimized. - // Support for dynamic partitions can be added later - // The following is not optimized: - // insert overwrite table T1(ds='1', hr) select key, value, hr from T2 where ds = '1'; - // where T1 and T2 are bucketed by the same keys and partitioned by ds. hr - if ((partitions == null) || (partitions.isEmpty()) || (partitions.size() > 1)) { - return null; + // Start with the destination: T2, bucketed/sorted position is [1] + // At the source T1, the column corresponding to that position is [key], which + // maps to column [0] of T1, which is also bucketed/sorted into the same + // number of buckets + List newBucketPositions = new ArrayList(); + for (int pos = 0; pos < bucketPositions.size(); pos++) { + ExprNodeColumnDesc col = sourceTableBucketCols.get(pos); + String colName = col.getColumn(); + int bucketPos = findColumnPosition(srcTable.getCols(), colName); + if (bucketPos < 0) { + return null; + } + newBucketPositions.add(bucketPos); } - for (Partition partition : partitions) { - if (!checkPartition(partition, newBucketPositions, newSortPositions, - pGraphContext.getFsopToTable().get(fsOp).getNumBuckets())) { + + // Find the positions/order of the sorted columns in the table corresponding + // to the select list. 
+ List newSortPositions = new ArrayList(); + for (int pos = 0; pos < sortPositions.size(); pos++) { + ExprNodeColumnDesc col = sourceTableSortCols.get(pos); + String colName = col.getColumn(); + int sortPos = findColumnPosition(srcTable.getCols(), colName); + if (sortPos < 0) { return null; } + newSortPositions.add(sortPos); } - removeReduceSink(rsOp, (TableScanOperator) op, fsOp, - partitions.get(0).getSortedPaths()); - return null; - } - else { - if (!checkTable(srcTable, newBucketPositions, newSortPositions, - pGraphContext.getFsopToTable().get(fsOp).getNumBuckets())) { + if (srcTable.isPartitioned()) { + PrunedPartitionList prunedParts = pGraphContext.getOpToPartList().get(op); + List partitions = prunedParts.getNotDeniedPartns(); + + // Support for dynamic partitions can be added later + // The following is not optimized: + // insert overwrite table T1(ds='1', hr) select key, value, hr from T2 where ds = '1'; + // where T1 and T2 are bucketed by the same keys and partitioned by ds. 
hr + if ((partitions == null) || (partitions.isEmpty()) || (partitions.size() > 1)) { + return null; + } + for (Partition partition : partitions) { + if (!checkPartition(partition, newBucketPositions, newSortPositions, sortOrder, + numBucketsDestination)) { + return null; + } + } + + removeReduceSink(rsOp, (TableScanOperator) op, fsOp, + partitions.get(0).getSortedPaths()); return null; } + else { + if (!checkTable(srcTable, newBucketPositions, newSortPositions, sortOrder, + numBucketsDestination)) { + return null; + } - removeReduceSink(rsOp, (TableScanOperator) op, fsOp, srcTable.getSortedPaths()); - return null; + removeReduceSink(rsOp, (TableScanOperator) op, fsOp, srcTable.getSortedPaths()); + return null; + } } - } - // None of the operators is changing the positions - else if (op instanceof SelectOperator) { - SelectOperator selectOp = (SelectOperator) op; - SelectDesc selectDesc = selectOp.getConf(); + // None of the operators is changing the positions + else if (op instanceof SelectOperator) { + SelectOperator selectOp = (SelectOperator) op; + SelectDesc selectDesc = selectOp.getConf(); - // There may be multiple selects - chose the one closest to the table - sourceTableBucketCols.clear(); - sourceTableSortCols.clear(); + // Iterate backwards, from the destination table to the top of the tree + // Based on the output column names, get the new columns. 
+ if (!useBucketSortPositions) { + bucketPositions.clear(); + sortPositions.clear(); + List outputColumnNames = selectDesc.getOutputColumnNames(); - // Only columns can be selected for both sorted and bucketed positions - for (int pos : bucketPositions) { - ExprNodeDesc selectColList = selectDesc.getColList().get(pos); - if (!(selectColList instanceof ExprNodeColumnDesc)) { - return null; + for (ExprNodeColumnDesc col : sourceTableBucketCols) { + String colName = col.getColumn(); + int colPos = outputColumnNames.indexOf(colName); + if (colPos < 0) { + return null; + } + bucketPositions.add(colPos); + } + + for (ExprNodeColumnDesc col : sourceTableSortCols) { + String colName = col.getColumn(); + int colPos = outputColumnNames.indexOf(colName); + if (colPos < 0) { + return null; + } + sortPositions.add(colPos); + } } - sourceTableBucketCols.add((ExprNodeColumnDesc) selectColList); - } - for (ObjectPair pos : sortPositions) { - ExprNodeDesc selectColList = selectDesc.getColList().get(pos.getFirst()); - if (!(selectColList instanceof ExprNodeColumnDesc)) { - return null; + // There may be multiple selects - chose the one closest to the table + sourceTableBucketCols.clear(); + sourceTableSortCols.clear(); + + // Only columns can be selected for both sorted and bucketed positions + for (int pos : bucketPositions) { + ExprNodeDesc selectColList = selectDesc.getColList().get(pos); + if (!(selectColList instanceof ExprNodeColumnDesc)) { + return null; + } + sourceTableBucketCols.add((ExprNodeColumnDesc) selectColList); } - sourceTableSortCols.add((ExprNodeColumnDesc) selectColList); + + for (int pos : sortPositions) { + ExprNodeDesc selectColList = selectDesc.getColList().get(pos); + if (!(selectColList instanceof ExprNodeColumnDesc)) { + return null; + } + sourceTableSortCols.add((ExprNodeColumnDesc) selectColList); + } + + useBucketSortPositions = false; } + op = op.getParentOperators().get(0); } } } Index: 
ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (revision 1467159) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (working copy) @@ -98,10 +98,11 @@ // in the execution context. This is needed for the following scenario: // insert overwrite table T1 select * from T2; // where T1 and T2 are sorted/bucketed by the same keys into the same number of buckets - // Although one mapper per file is used (bucketizedinputhiveinput), it is possible that + // Although one mapper per file is used (BucketizedHiveInputFormat), it is possible that // any mapper can pick up any file (depending on the size of the files). The bucket number // corresponding to the input file is stored to name the output bucket file appropriately. - Map bucketNameMapping = conf != null ? conf.getBucketFileNameMapping() : null; + Map bucketNameMapping = + (conf != null) ? conf.getBucketFileNameMapping() : null; if ((bucketNameMapping != null) && (!bucketNameMapping.isEmpty())) { String currentInputFile = getExecContext().getCurrentInputFile(); getExecContext().setFileId(Integer.toString(bucketNameMapping.get(