Index: conf/hive-default.xml.template =================================================================== --- conf/hive-default.xml.template (revision 1447702) +++ conf/hive-default.xml.template (working copy) @@ -944,7 +944,7 @@ hive.auto.convert.sortmerge.join.bigtable.selection.policy - org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ + org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeBasedBigTableSelectorForAutoSMJ The policy to choose the big table for automatic conversion to sort-merge join. By default, the leftmost table is assigned the big table. Other policies are based on size: . based on total size (all the partitions selected in the query) of the table @@ -956,6 +956,20 @@ + hive.auto.convert.sortmerge.join.noconditionaltask + true + If hive.auto.convert.sortmerge.join is set to true, and a join was converted to a sort-merge join, + this parameter decides whether each table should be tried as a big table, and effectively a map-join should be + tried. That would create a conditional task with n+1 children for an n-way join (1 child for each table as the + big table), and the backup task will be the sort-merge join. In some cases, a map-join would be faster than a + sort-merge join, if there is no advantage of having the output bucketed and sorted. For example, if a very big sorted + and bucketed table with few files (say 10 files) is being joined with a very small sorted and bucketed table + with few files (10 files), the sort-merge join will only use 10 mappers, and a simple map-only join might be faster + if the complete small table can fit in memory, and a map-join can be performed. + + + + hive.metastore.ds.connection.url.hook Name of the hook to use for retriving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java =================================================================== --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1447702) +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy) @@ -503,7 +503,9 @@ HIVE_AUTO_SORTMERGE_JOIN("hive.auto.convert.sortmerge.join", false), HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR( "hive.auto.convert.sortmerge.join.bigtable.selection.policy", - "org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ"), + "org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeBasedBigTableSelectorForAutoSMJ"), + HIVE_AUTO_SORTMERGE_JOIN_NOCONDITIONALTASK( + "hive.auto.convert.sortmerge.join.noconditionaltask", true), HIVESCRIPTOPERATORTRUST("hive.exec.script.trust", false), HIVEROWOFFSET("hive.exec.rowoffset", false), Index: ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (working copy) @@ -824,3 +824,1046 @@ POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] 2654 +PREHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on a different key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value +PREHOOK: type: QUERY +POSTHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on a 
different key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME src) c) (= (. (TOK_TABLE_OR_COL c) value) (. (TOK_TABLE_OR_COL a) value)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {value} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col1 + Position of Big Table: 0 + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: 0 + c + TableScan + alias: c + Reduce Output Operator + key expressions: + expr: value + type: string + sort order: + + Map-reduce partition columns: + expr: value + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@tbl1 +PREHOOK: 
Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +2654 +PREHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on the same key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key +PREHOOK: type: QUERY +POSTHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on the same key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME src) c) (= (. (TOK_TABLE_OR_COL c) key) (. 
(TOK_TABLE_OR_COL a) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-8 is a root stage , consists of Stage-9, Stage-10, Stage-11, Stage-1 + Stage-9 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-9 + Stage-2 depends on stages: Stage-1, Stage-5, Stage-6, Stage-7 + Stage-10 has a backup stage: Stage-1 + Stage-6 depends on stages: Stage-10 + Stage-11 has a backup stage: Stage-1 + Stage-7 depends on stages: Stage-11 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-8 + Conditional Operator + + Stage: Stage-9 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + c + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 0 + c + TableScan + alias: c + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 0 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-10 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + c + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class 
org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 1 + c + TableScan + alias: c + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 1 + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-11 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 2 + b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 2 + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + c + TableScan + alias: c + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()] + Position of Big Table: 2 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Reduce Output Operator + key expressions: + expr: UDFToDouble(key) + type: double + sort order: + + Map-reduce partition columns: + expr: UDFToDouble(key) + type: 
double + tag: 0 + b + TableScan + alias: b + Reduce Output Operator + key expressions: + expr: UDFToDouble(key) + type: double + sort order: + + Map-reduce partition columns: + expr: UDFToDouble(key) + type: double + tag: 1 + c + TableScan + alias: c + Reduce Output Operator + key expressions: + expr: UDFToDouble(key) + type: double + sort order: + + Map-reduce partition columns: + expr: UDFToDouble(key) + type: double + tag: 2 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +2654 +PREHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on the same key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key +PREHOOK: type: QUERY +POSTHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on the same key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME tbl3) c) (= (. (TOK_TABLE_OR_COL c) key) (. (TOK_TABLE_OR_COL a) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-7 is a root stage , consists of Stage-8, Stage-9, Stage-10, Stage-1 + Stage-8 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-8 + Stage-9 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-9 + Stage-10 has a backup stage: Stage-1 + Stage-6 depends on stages: Stage-10 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-7 + Conditional Operator + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + c + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 0 + c + TableScan + alias: c + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-9 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + c + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 1 + c + TableScan + alias: c + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + 
sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-10 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 2 + b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 2 + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: + c + TableScan + alias: c + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 2 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 2 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +PREHOOK: Input: default@tbl3 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON 
a.key = b.key join tbl3 c on c.key = a.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +POSTHOOK: Input: default@tbl3 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +2654 +PREHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on a different key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value +PREHOOK: type: QUERY +POSTHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on a different key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME tbl4) c) (= (. (TOK_TABLE_OR_COL c) value) (. 
(TOK_TABLE_OR_COL a) value)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {value} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col1 + Position of Big Table: 0 + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: 0 + c + TableScan + alias: c + Reduce Output Operator + key expressions: + expr: value + type: string + sort order: + + Map-reduce partition columns: + expr: value + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +PREHOOK: Input: default@tbl4 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +POSTHOOK: Input: default@tbl4 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +2654 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out 
=================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (working copy) @@ -1,10 +1,12 @@ PREHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket -CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE PREHOOK: type: CREATETABLE POSTHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket -CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE POSTHOOK: type: CREATETABLE POSTHOOK: Output: default@bucket_small PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') @@ -337,6 +339,679 @@ #### A masked pattern was here #### name default.bucket_big numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + 
serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 +PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + 
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: 
Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 numRows 0 partition_columns ds rawDataSize 0 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out 
=================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (working copy) @@ -330,3 +330,500 @@ POSTHOOK: Input: default@bucket_small #### A masked pattern was here #### 19 +PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A 
masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big [a] + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A 
masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_small [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + 
bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_small +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_small +#### A masked pattern was here #### +19 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out (working copy) @@ -2234,3 +2234,4534 @@ POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] 20 +PREHOOK: query: -- The join is being 
performed as part of sub-query. It should be converted to a sort-merge join +explain +select count(*) from ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- The join is being performed as part of sub-query. It should be converted to a sort-merge join +explain +select count(*) from ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map 
Operator Tree: + subq1:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +22 +PREHOOK: query: -- The join is being performed as part of sub-query. It should be converted to a sort-merge join +-- Add a order by at the end to make the results deterministic. +explain +select key, count(*) from +( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +PREHOOK: type: QUERY +POSTHOOK: query: -- The join is being performed as part of sub-query. It should be converted to a sort-merge join +-- Add a order by at the end to make the results deterministic. 
+explain +select key, count(*) from +( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce 
Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + subq1:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + 
+ Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select key, count(*) from +( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select key, count(*) from +( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +0 9 +2 1 +4 1 +5 9 +8 1 +9 1 +PREHOOK: query: -- The join is being performed as part of more than one sub-query. It should be converted to a sort-merge join +explain +select count(*) from +( + select key, count(*) from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The join is being performed as part of more than one sub-query. It should be converted to a sort-merge join +explain +select count(*) from +( + select key, count(*) from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq2:subq1:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq2:subq1:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + subq2:subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq2:subq1:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map 
Reduce + Alias -> Map Operator Tree: + subq2:subq1:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from +( + select key, count(*) from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from +( + select key, count(*) from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### 
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +6 +PREHOOK: query: -- A join is being performed across different sub-queries, where a join is being performed in each of them. +-- Each sub-query should be converted to a sort-merge join. +explain +select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +order by src1.key, src1.cnt1, src2.cnt1 +PREHOOK: type: QUERY +POSTHOOK: query: -- A join is being performed across different sub-queries, where a join is being performed in each of them. +-- Each sub-query should be converted to a sort-merge join. +explain +select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +order by src1.key, src1.cnt1, src2.cnt1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) src1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) src2) (= (. (TOK_TABLE_OR_COL src1) key) (. 
(TOK_TABLE_OR_COL src2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) cnt1)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src2) cnt1))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL src1) key)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL src1) cnt1)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL src2) cnt1))))) + +STAGE DEPENDENCIES: + Stage-12 is a root stage , consists of Stage-18, Stage-19, Stage-1 + Stage-18 has a backup stage: Stage-1 + Stage-10 depends on stages: Stage-18 + Stage-9 depends on stages: Stage-1, Stage-5, Stage-10, Stage-11, Stage-13, Stage-14 , consists of Stage-16, Stage-17, Stage-2 + Stage-16 has a backup stage: Stage-2 + Stage-7 depends on stages: Stage-16 + Stage-3 depends on stages: Stage-2, Stage-7, Stage-8 + Stage-17 has a backup stage: Stage-2 + Stage-8 depends on stages: Stage-17 + Stage-2 + Stage-19 has a backup stage: Stage-1 + Stage-11 depends on stages: Stage-19 + Stage-1 + Stage-15 is a root stage , consists of Stage-20, Stage-21, Stage-5 + Stage-20 has a backup stage: Stage-5 + Stage-13 depends on stages: Stage-20 + Stage-21 has a backup stage: Stage-5 + Stage-14 depends on stages: Stage-21 + Stage-5 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-12 + Conditional Operator + + Stage: Stage-18 + Map Reduce Local Work + Alias -> Map Local Tables: + src1:subq1:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + src1:subq1:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-10 + Map Reduce + Alias -> Map Operator Tree: + src1:subq1:a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-9 + Conditional Operator + + Stage: Stage-16 + Map Reduce Local Work + Alias -> Map Local Tables: + $INTNAME1 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $INTNAME1 + HashTable Sink Operator + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + + Stage: Stage-7 + Map Reduce + Alias -> Map 
Operator Tree: + $INTNAME + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col2 + type: bigint + sort order: +++ + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col2 + type: bigint + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-17 + Map Reduce Local Work + Alias -> Map Local Tables: + $INTNAME + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $INTNAME + HashTable Sink Operator + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {_col1} + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0, _col1, _col3 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: 0 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: 1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col3 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-19 + Map Reduce Local Work + Alias -> Map Local Tables: + src1:subq1:a + 
Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + src1:subq1:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-11 + Map Reduce + Alias -> Map Operator Tree: + src1:subq1:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src1:subq1:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-15 + Conditional Operator + + Stage: Stage-20 + Map Reduce Local Work + Alias -> Map Local Tables: + src2:subq2:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + src2:subq2:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-13 + Map Reduce + Alias -> Map Operator Tree: + 
src2:subq2:a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-21 + Map Reduce Local Work + Alias -> Map Local Tables: + src2:subq2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + src2:subq2:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-14 + Map Reduce + Alias -> Map Operator Tree: + src2:subq2:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + src2:subq2:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + 
Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +order by src1.key, src1.cnt1, src2.cnt1 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +order by src1.key, src1.cnt1, src2.cnt1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +0 9 9 +2 1 1 +4 1 1 +5 9 9 +8 1 1 +9 1 1 +PREHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should +-- be converted to a sort-merge join. +explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should +-- be converted to a sort-merge join. 
+explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + subq2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + Alias -> Map 
Local Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + subq2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, 
type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +20 +PREHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should +-- be converted to a sort-merge join, although there is more than one level of sub-query +explain +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join tbl2 b + on subq2.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should +-- be converted to a sort-merge join, although there is more than one level of sub-query +explain +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join tbl2 b + on subq2.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL subq2) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq2:subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join tbl2 b + on subq2.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join tbl2 b + on subq2.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +20 +PREHOOK: query: -- Both the tables are nested sub-queries i.e more then 1 level of sub-query. +-- The join should be converted to a sort-merge join +explain +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq3 + where key < 6 + ) subq4 + on subq2.key = subq4.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Both the tables are nested sub-queries i.e more then 1 level of sub-query. 
+-- The join should be converted to a sort-merge join +explain +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq3 + where key < 6 + ) subq4 + on subq2.key = subq4.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq3)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq4) (= (. (TOK_TABLE_OR_COL subq2) key) (. 
(TOK_TABLE_OR_COL subq4) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + subq4:subq3:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq4:subq3:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq2:subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + subq4:subq3:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + 
bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq3 + where key < 6 + ) subq4 + on subq2.key = subq4.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +#### A masked pattern was here #### +Execution failed with exit status: 2 +Obtaining error information + +Task failed! 
+Task ID: + Stage-7 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq3 + where key < 6 + ) subq4 + on subq2.key = subq4.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +20 +PREHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters and the join key +-- is not getting modified, it should be converted to a sort-merge join. Note that the sub-query modifies one +-- item, but that is not part of the join key. +explain +select count(*) from + (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1 + join + (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2 + on subq1.key = subq2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters and the join key +-- is not getting modified, it should be converted to a sort-merge join. Note that the sub-query modifies one +-- item, but that is not part of the join key. +explain +select count(*) from + (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1 + join + (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2 + on subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL a) value)) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL a) value)) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + subq2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 8) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 8) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 8) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + subq2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 8) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output 
Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 8) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from + (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1 + join + (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2 + on subq1.key = subq2.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from + (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1 + join + (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2 + on subq1.key = subq2.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +20 +PREHOOK: query: -- The left table is a sub-query and the right table is not. +-- It should be converted to a sort-merge join. +explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join tbl2 a on subq1.key = a.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The left table is a sub-query and the right table is not. +-- It should be converted to a sort-merge join. 
+explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join tbl2 a on subq1.key = a.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (TOK_TABREF (TOK_TABNAME tbl2) a) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL a) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Map Join Operator + condition map: + Inner 
Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join tbl2 a on subq1.key = a.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join tbl2 a on subq1.key = a.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +20 +PREHOOK: query: -- The right table is a sub-query and the left table is not. +-- It should be converted to a sort-merge join. +explain +select count(*) from tbl1 a + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq1 + on a.key = subq1.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The right table is a sub-query and the left table is not. +-- It should be converted to a sort-merge join. 
+explain +select count(*) from tbl1 a + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq1 + on a.key = subq1.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL subq1) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[_col0]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[_col0]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + 
Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[_col0]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from tbl1 a + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq1 + on a.key = subq1.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tbl1 a + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq1 + on a.key = subq1.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +20 +PREHOOK: query: -- There are more than 2 inputs to the join, all of them being sub-queries. +-- It should be converted to to a sort-merge join +explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on (subq1.key = subq2.key) + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq3 + on (subq1.key = subq3.key) +PREHOOK: type: QUERY +POSTHOOK: query: -- There are more than 2 inputs to the join, all of them being sub-queries. 
+-- It should be converted to to a sort-merge join +explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on (subq1.key = subq2.key) + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq3 + on (subq1.key = subq3.key) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key))) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq3) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq3) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-7 is a root stage , consists of Stage-8, Stage-9, Stage-10, Stage-1 + Stage-8 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-8 + Stage-9 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-9 + Stage-10 has a backup stage: Stage-1 + Stage-6 depends on stages: Stage-10 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-7 + Conditional Operator + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + subq2:a + Fetch Operator + limit: -1 + subq3:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 0 + subq3:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-9 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + subq3:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 1 + subq3:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + 
Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + subq2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-10 + Map Reduce Local Work + Alias -> Map Local Tables: + subq1:a + Fetch Operator + limit: -1 + subq2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 2 + subq2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 2 + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: + subq3:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 2 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key < 6) + type: boolean + Select Operator + expressions: + expr: key + type: int + 
outputColumnNames: _col0 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 + 1 + 2 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + 2 [Column[_col0]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq3 + on (subq1.key = subq3.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +Execution failed with exit status: 2 +Obtaining error information + +Task failed! +Task ID: + Stage-10 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq3 + on (subq1.key = subq3.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +56 +PREHOOK: query: -- The join is being performed on a nested sub-query, and an aggregation is performed after that. +-- The join should be converted to a sort-merge join +explain +select count(*) from ( + select subq2.key as key, subq2.value as value1, b.value as value2 from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 +join tbl2 b +on subq2.key = b.key) a +PREHOOK: type: QUERY +POSTHOOK: query: -- The join is being performed on a nested sub-query, and an aggregation is performed after that. 
+-- The join should be converted to a sort-merge join +explain +select count(*) from ( + select subq2.key as key, subq2.value as value1, b.value as value2 from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 +join tbl2 b +on subq2.key = b.key) a +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL subq2) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) value2)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + a:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a:subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 
+ table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a:subq2:subq1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a:subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:subq2:subq1:a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key < 8) and (key < 6)) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from ( + select subq2.key as key, subq2.value as value1, b.value as value2 from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 +join tbl2 b +on subq2.key = b.key) a +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from ( + select subq2.key as key, subq2.value as value1, 
b.value as value2 from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 +join tbl2 b +on subq2.key = b.key) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +20 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (working copy) @@ -373,3 +373,625 @@ POSTHOOK: Input: default@bucket_small@ds=2008-04-09 #### A masked pattern was here #### 38 +PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl 
struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + 
numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here 
#### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + 
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + /bucket_small/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: 
default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out (working copy) @@ -0,0 +1,776 @@ +PREHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tbl1 +PREHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tbl2 +PREHOOK: query: insert overwrite table tbl1 select * from src where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tbl1 +POSTHOOK: query: insert overwrite table tbl1 select * from src where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tbl1 +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table tbl2 select * from src where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tbl2 +POSTHOOK: query: insert overwrite table tbl2 select * from src where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tbl2 +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: CREATE TABLE dest1(k1 int, k2 int) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest1(k1 int, k2 int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: CREATE TABLE dest2(k1 string, k2 string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE dest2(k1 string, k2 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@dest2 +POSTHOOK: 
Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +PREHOOK: type: QUERY +POSTHOOK: query: -- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key) key2) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) value) value2)))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key1)) (TOK_SELEXPR (TOK_TABLE_OR_COL key2)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL value1)) (TOK_SELEXPR (TOK_TABLE_OR_COL value2))))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + subq:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col4, _col5 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col4 + type: int + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col3 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-4 + Stats-Aggr Operator + + +PREHOOK: query: from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +PREHOOK: Output: default@dest1 +PREHOOK: Output: default@dest2 +POSTHOOK: query: from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +POSTHOOK: Output: default@dest1 +POSTHOOK: Output: default@dest2 +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: select * from dest1 order by k1, k2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: query: select * from dest1 order by k1, k2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +2 2 +4 4 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +8 8 +9 9 +PREHOOK: query: select * from dest2 order by k1, k2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: query: select * from dest2 order by k1, k2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_2 val_2 +val_4 val_4 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_8 val_8 +val_9 val_9 +PREHOOK: query: -- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +PREHOOK: type: QUERY +POSTHOOK: query: -- 
A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key) key2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) value2)))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key1)) (TOK_SELEXPR (TOK_TABLE_OR_COL key2)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL value1)) (TOK_SELEXPR (TOK_TABLE_OR_COL value2))))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + subq:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col4, _col5 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col4 + type: int + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col3 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-0 + Move Operator + tables: + replace: 
true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-4 + Stats-Aggr Operator + + +PREHOOK: query: from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +PREHOOK: Output: default@dest1 +PREHOOK: Output: default@dest2 +POSTHOOK: query: from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +POSTHOOK: Output: default@dest1 +POSTHOOK: Output: default@dest2 +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: select * from dest1 order by k1, k2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: query: select * from dest1 order by k1, k2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: 
dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +2 2 +4 4 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +8 8 +9 9 +PREHOOK: query: select * from dest2 order by k1, k2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: query: select * from dest2 order by k1, k2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_2 val_2 +val_4 val_4 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_8 val_8 +val_9 val_9 +PREHOOK: query: -- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +PREHOOK: type: QUERY +POSTHOOK: query: -- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, 
type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key) key2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) value2)))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key1)) (TOK_SELEXPR (TOK_TABLE_OR_COL key2)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL value1)) (TOK_SELEXPR (TOK_TABLE_OR_COL value2))))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + subq:a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1, _col4, _col5 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col4 + type: int + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + Select Operator + expressions: + expr: _col1 + type: string + expr: _col3 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 + + Stage: Stage-4 + Stats-Aggr Operator + + +PREHOOK: query: from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +PREHOOK: Output: default@dest1 +PREHOOK: Output: default@dest2 +POSTHOOK: query: from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +POSTHOOK: Output: default@dest1 +POSTHOOK: Output: default@dest2 +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: select * from dest1 order by k1, k2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: query: select * from dest1 order by k1, k2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE 
[(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +2 2 +4 4 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +5 5 +8 8 +9 9 +PREHOOK: query: select * from dest2 order by k1, k2 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: query: select * from dest2 order by k1, k2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest2 +#### A masked pattern was here #### +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_0 val_0 +val_2 val_2 +val_4 val_4 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_5 val_5 +val_8 val_8 +val_9 val_9 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (working copy) @@ -488,3 +488,775 @@ POSTHOOK: Input: default@bucket_small@ds=2008-04-09 #### A masked pattern was here 
#### 76 +PREHOOK: query: -- The mapjoin should fail resulting in the sort-merge join +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The mapjoin should fail resulting in the sort-merge join +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern 
was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + /bucket_small/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { 
string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: 
default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (working copy) @@ -361,3 +361,625 @@ POSTHOOK: Input: default@bucket_small@ds=2008-04-09 #### A masked pattern was here #### 38 +PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + 
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + 
serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: 
default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + 
properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + /bucket_small/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A 
masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (working copy) @@ -488,3 +488,773 @@ POSTHOOK: Input: default@bucket_small@ds=2008-04-09 #### A masked pattern was here #### 76 +PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + 
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + /bucket_small/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + 
sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A 
masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (revision 1447702) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (working copy) @@ -261,3 +261,678 @@ POSTHOOK: Input: default@bucket_small@ds=2008-04-08 #### A masked pattern was here #### 38 +PREHOOK: query: -- The mapjoin should fail resulting in the sort-merge join +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- The mapjoin should fail resulting in the sort-merge join +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1 + Stage-6 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-6 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Conditional Operator + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + 
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: 
Stage-4 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + 
serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: 
Input: default@bucket_small +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q (working copy) @@ -0,0 +1,82 @@ +set hive.enforce.bucketing = true; +set hive.enforce.sorting = true; +set hive.exec.reducers.max = 1; + +CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; +CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS; + +insert overwrite table tbl1 select * from src where key < 10; +insert overwrite table tbl2 select * from src where key < 10; + +CREATE TABLE dest1(k1 int, k2 int); +CREATE TABLE dest2(k1 string, k2 string); + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.auto.convert.sortmerge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.auto.convert.join=true; + +-- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2; + +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2; + +select * from dest1 order by k1, k2; +select * from dest2 order by k1, k2; + +set hive.auto.convert.join.noconditionaltask=true; +set hive.auto.convert.join.noconditionaltask.size=200; + +-- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2; + +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2; + +select * from dest1 order by k1, k2; +select * from dest2 order by k1, k2; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; + +-- A SMB join followed by a mutli-insert +explain +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2; + +from ( + SELECT a.key key1, a.value value1, b.key key2, b.value value2 + FROM tbl1 a JOIN tbl2 b + ON a.key = b.key ) subq +INSERT OVERWRITE TABLE dest1 select key1, key2 +INSERT OVERWRITE TABLE dest2 select value1, value2; + +select * from dest1 order by k1, k2; +select * from dest2 order by k1, k2; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q 
=================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (working copy) @@ -25,3 +25,7 @@ explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (working copy) @@ -30,3 +30,7 @@ explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (working copy) @@ -27,3 +27,7 @@ explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (working copy) @@ -30,3 +30,11 @@ explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; +set hive.mapjoin.localtask.max.memory.usage = 0.0001; +set hive.mapjoin.check.memory.rows = 2; + +-- The mapjoin should fail resulting in the sort-merge join +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (working copy) @@ -1,6 +1,7 @@ -- small 1 part, 2 bucket & big 2 part, 4 bucket -CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) +CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS 
TEXTFILE; load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); @@ -28,3 +29,7 @@ explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (working copy) @@ -21,3 +21,8 @@ explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.convert.join=true; +set hive.auto.convert.sortmerge.join.noconditionaltask=false; +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q (working copy) @@ -281,3 +281,256 @@ ) subq2 join tbl2 b on subq2.key = b.key) a; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; + +-- The join is being performed as part of sub-query. It should be converted to a sort-merge join +explain +select count(*) from ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1; + +select count(*) from ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1; + +-- The join is being performed as part of sub-query. It should be converted to a sort-merge join +-- Add a order by at the end to make the results deterministic. +explain +select key, count(*) from +( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key; + +select key, count(*) from +( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key; + +-- The join is being performed as part of more than one sub-query. It should be converted to a sort-merge join +explain +select count(*) from +( + select key, count(*) from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2; + +select count(*) from +( + select key, count(*) from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2; + +-- A join is being performed across different sub-queries, where a join is being performed in each of them. +-- Each sub-query should be converted to a sort-merge join. 
+explain +select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +order by src1.key, src1.cnt1, src2.cnt1; + +select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +order by src1.key, src1.cnt1, src2.cnt1; + +-- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should +-- be converted to a sort-merge join. +explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key; + +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key; + +-- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should +-- be converted to a sort-merge join, although there is more than one level of sub-query +explain +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join tbl2 b + on subq2.key = b.key; + +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join tbl2 b + on subq2.key = b.key; + +-- Both the tables are nested sub-queries i.e more then 1 level of sub-query. +-- The join should be converted to a sort-merge join +explain +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq3 + where key < 6 + ) subq4 + on subq2.key = subq4.key; + +select count(*) from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 + join + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq3 + where key < 6 + ) subq4 + on subq2.key = subq4.key; + +-- The subquery itself is being joined. Since the sub-query only contains selects and filters and the join key +-- is not getting modified, it should be converted to a sort-merge join. Note that the sub-query modifies one +-- item, but that is not part of the join key. 
+explain +select count(*) from + (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1 + join + (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2 + on subq1.key = subq2.key; + +select count(*) from + (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1 + join + (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2 + on subq1.key = subq2.key; + +-- The left table is a sub-query and the right table is not. +-- It should be converted to a sort-merge join. +explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join tbl2 a on subq1.key = a.key; + +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join tbl2 a on subq1.key = a.key; + +-- The right table is a sub-query and the left table is not. +-- It should be converted to a sort-merge join. +explain +select count(*) from tbl1 a + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq1 + on a.key = subq1.key; + +select count(*) from tbl1 a + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq1 + on a.key = subq1.key; + +-- There are more than 2 inputs to the join, all of them being sub-queries. +-- It should be converted to to a sort-merge join +explain +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on (subq1.key = subq2.key) + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq3 + on (subq1.key = subq3.key); + +select count(*) from + (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 + on subq1.key = subq2.key + join + (select a.key as key, a.value as value from tbl2 a where key < 6) subq3 + on (subq1.key = subq3.key); + +-- The join is being performed on a nested sub-query, and an aggregation is performed after that. 
+-- The join should be converted to a sort-merge join +explain +select count(*) from ( + select subq2.key as key, subq2.value as value1, b.value as value2 from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 +join tbl2 b +on subq2.key = b.key) a; + +select count(*) from ( + select subq2.key as key, subq2.value as value1, b.value as value2 from + ( + select * from + ( + select a.key as key, a.value as value from tbl1 a where key < 8 + ) subq1 + where key < 6 + ) subq2 +join tbl2 b +on subq2.key = b.key) a; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (working copy) @@ -22,3 +22,11 @@ -- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; +set hive.mapjoin.localtask.max.memory.usage = 0.0001; +set hive.mapjoin.check.memory.rows = 2; + +-- The mapjoin should fail resulting in the sort-merge join +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (revision 1447702) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (working copy) @@ -34,3 +34,21 @@ -- A SMB join is being followed by a regular join on a bucketed table on a different key explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value; select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value; + +set hive.auto.convert.sortmerge.join.noconditionaltask=false; + +-- A SMB join is being followed by a regular join on a non-bucketed table on a different key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value; +select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value; + +-- A SMB join is being followed by a regular join on a non-bucketed table on the same key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key; +select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key; + +-- A SMB join is being followed by a regular join on a bucketed table on the same key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key; +select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key; + +-- A SMB join is being followed by a regular join on a bucketed table on a different key +explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value; +select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (revision 
1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (working copy) @@ -44,6 +44,7 @@ import org.apache.hadoop.hive.ql.exec.OperatorFactory; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.ScriptOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; @@ -74,6 +75,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.serde.serdeConstants; @@ -220,14 +222,20 @@ } public static String genMapJoinOpAndLocalWork(MapredWork newWork, JoinOperator op, int mapJoinPos) - throws SemanticException { + throws SemanticException { + LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap = + newWork.getOpParseCtxMap(); + QBJoinTree newJoinTree = newWork.getJoinTree(); + // generate the map join operator; already checked the map join + MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(opParseCtxMap, op, + newJoinTree, mapJoinPos, true, false); + return genMapJoinOpAndLocalWork(newWork, newMapJoinOp, mapJoinPos); + } + + public static String genMapJoinOpAndLocalWork(MapredWork newWork, MapJoinOperator newMapJoinOp, + int mapJoinPos) + throws SemanticException { try { - LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap = - newWork.getOpParseCtxMap(); - QBJoinTree newJoinTree = newWork.getJoinTree(); - // generate the map join operator; already checked the map join - MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(opParseCtxMap, op, - newJoinTree, mapJoinPos, true, false); // generate the local work and return the big table alias String bigTableAlias = MapJoinProcessor .genMapJoinLocalWork(newWork, newMapJoinOp, mapJoinPos); @@ -502,6 +510,64 @@ return mapJoinOp; } + /** + * convert a sort-merge join to a map-side join. + * + * @param opParseCtxMap + * @param smbJoinOp + * join operator + * @param joinTree + * qb join tree + * @param mapJoinPos + * position of the source to be read as part of map-reduce framework.
All other sources + * are cached in memory + * @param noCheckOuterJoin + */ + public static MapJoinOperator convertSMBJoinToMapJoin( + Map, OpParseContext> opParseCtxMap, + SMBMapJoinOperator smbJoinOp, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin) + throws SemanticException { + // Create a new map join operator + SMBJoinDesc smbJoinDesc = smbJoinOp.getConf(); + List keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0)); + TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(PlanUtils + .getFieldSchemasFromColumnList(keyCols, "mapjoinkey")); + MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(), + keyTableDesc, smbJoinDesc.getExprs(), + smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(), + smbJoinDesc.getOutputColumnNames(), + mapJoinPos, smbJoinDesc.getConds(), + smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix()); + + RowResolver joinRS = opParseCtxMap.get(smbJoinOp).getRowResolver(); + MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild( + mapJoinDesc, joinRS.getRowSchema(), + new ArrayList>()); + + OpParseContext ctx = new OpParseContext(joinRS); + opParseCtxMap.put(mapJoinOp, ctx); + + // change the children of the original join operator to point to the map + // join operator + List> childOps = smbJoinOp.getChildOperators(); + for (Operator childOp : childOps) { + childOp.replaceParent(smbJoinOp, mapJoinOp); + } + mapJoinOp.setChildOperators(childOps); + smbJoinOp.setChildOperators(null); + + // change the parent of the original SMBjoin operator to point to the map + // join operator + List> parentOps = smbJoinOp.getParentOperators(); + for (Operator parentOp : parentOps) { + parentOp.replaceChild(smbJoinOp, mapJoinOp); + } + mapJoinOp.setParentOperators(parentOps); + smbJoinOp.setParentOperators(null); + + return mapJoinOp; + } + public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator op, QBJoinTree joinTree, int mapJoinPos) throws SemanticException { HiveConf hiveConf = pctx.getConf(); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (working copy) @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
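The convertSMBJoinToMapJoin method added above builds a MapJoinDesc from the SMBJoinDesc and then splices the new MapJoinOperator into the operator tree by handing it the SMB operator's parent and child links. The fragment below is a minimal, self-contained sketch of just that rewiring step; the Op class and the splice helper are illustrative stand-ins, not Hive's Operator API.

import java.util.ArrayList;
import java.util.List;

// Simplified stand-in for an operator node in a plan DAG (illustration only).
class Op {
  final String name;
  List<Op> parents = new ArrayList<>();
  List<Op> children = new ArrayList<>();

  Op(String name) { this.name = name; }

  void replaceChild(Op oldChild, Op newChild) {
    children.set(children.indexOf(oldChild), newChild);
  }

  void replaceParent(Op oldParent, Op newParent) {
    parents.set(parents.indexOf(oldParent), newParent);
  }

  // Splice newOp into the DAG in place of oldOp: newOp takes over oldOp's
  // parents and children, and oldOp is fully detached from the plan.
  static Op splice(Op oldOp, Op newOp) {
    for (Op child : oldOp.children) {
      child.replaceParent(oldOp, newOp);
    }
    newOp.children = oldOp.children;
    oldOp.children = null;

    for (Op parent : oldOp.parents) {
      parent.replaceChild(oldOp, newOp);
    }
    newOp.parents = oldOp.parents;
    oldOp.parents = null;
    return newOp;
  }

  public static void main(String[] args) {
    Op scanA = new Op("TableScan a");
    Op scanB = new Op("TableScan b");
    Op smbJoin = new Op("SMBMapJoin");
    Op select = new Op("Select");
    scanA.children.add(smbJoin); scanB.children.add(smbJoin);
    smbJoin.parents.add(scanA); smbJoin.parents.add(scanB);
    smbJoin.children.add(select); select.parents.add(smbJoin);

    Op mapJoin = Op.splice(smbJoin, new Op("MapJoin"));
    System.out.println(select.parents.get(0).name);   // MapJoin
    System.out.println(scanA.children.get(0).name);   // MapJoin
    System.out.println(mapJoin.parents.size());       // 2
  }
}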
+ */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/* + * If a join has been automatically converted into a sort-merge join, create a conditional + * task to try map-side join with each table as the big table. It is similar to + * hive.auto.convert.join, but is only applicable to joins which have been automatically + * converted to sort-merge joins. For hive.auto.convert.join, the backup task is the + * map-reduce join, whereas here, the backup task is the sort-merge join. + * + * Depending on the inputs, a sort-merge join may be faster or slower than the map-side join. + * The other advantage of sort-merge join is that the output is also bucketed and sorted. + * Consider a very big table, say 1TB with 10 buckets being joined with a very small table, say + * 10MB with 10 buckets, the sort-merge join may perform slower since it will be restricted to + * 10 mappers. + */ +public class SortMergeJoinResolver implements PhysicalPlanResolver { + @Override + public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { + + // create dispatcher and graph walker + Dispatcher disp = new SortMergeJoinTaskDispatcher(pctx); + TaskGraphWalker ogw = new TaskGraphWalker(disp); + + // get all the tasks nodes from root task + List topNodes = new ArrayList(); + topNodes.addAll(pctx.rootTasks); + + // begin to walk through the task tree. + ogw.startWalking(topNodes, null); + return pctx; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java (working copy) @@ -0,0 +1,191 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
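As the SortMergeJoinResolver comment above describes, the resolver turns an auto-converted n-way sort-merge join into a conditional task whose children are one candidate map-join per table tried as the big table plus the original sort-merge join kept as the backup (the Stage-5/6/7/1 layout in the explain output earlier in this patch). The toy sketch below only enumerates that n+1 structure; the names are placeholders, not Hive's Task or ConditionalTask classes.

import java.util.ArrayList;
import java.util.List;

class ConditionalSketch {
  // One map-join attempt per table as the big table, plus the sort-merge
  // join retained as the backup child of the conditional task.
  static List<String> children(List<String> tables) {
    List<String> out = new ArrayList<>();
    for (String big : tables) {
      out.add("map-join, big table = " + big);
    }
    out.add("sort-merge join (backup)");
    return out;
  }

  public static void main(String[] args) {
    // A 2-way join over a and b yields 3 children, matching the plan shown above.
    children(List.of("a", "b")).forEach(System.out::println);
  }
}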
+ */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.common.ObjectPair; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.MapredWork; + +/** + * Common iteration methods for converting joins and sort-merge joins. + */ +public abstract class AbstractJoinTaskDispatcher implements Dispatcher { + + protected final PhysicalContext physicalContext; + + public AbstractJoinTaskDispatcher(PhysicalContext context) { + physicalContext = context; + } + + public abstract Task processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException; + + protected void replaceTaskWithConditionalTask( + Task currTask, ConditionalTask cndTsk, + PhysicalContext physicalContext) { + // add this task into task tree + // set all parent tasks + List> parentTasks = currTask.getParentTasks(); + currTask.setParentTasks(null); + if (parentTasks != null) { + for (Task tsk : parentTasks) { + // make new generated task depends on all the parent tasks of current task. + tsk.addDependentTask(cndTsk); + // remove the current task from its original parent task's dependent task + tsk.removeDependentTask(currTask); + } + } else { + // remove from current root task and add conditional task to root tasks + physicalContext.removeFromRootTask(currTask); + physicalContext.addToRootTask(cndTsk); + } + // set all child tasks + List> oldChildTasks = currTask.getChildTasks(); + if (oldChildTasks != null) { + for (Task tsk : cndTsk.getListTasks()) { + if (tsk.equals(currTask)) { + continue; + } + for (Task oldChild : oldChildTasks) { + tsk.addDependentTask(oldChild); + } + } + } + } + + // Replace the task with the new task. Copy the children and parents of the old + // task to the new task. + protected void replaceTask( + Task currTask, Task newTask, + PhysicalContext physicalContext) { + // add this task into task tree + // set all parent tasks + List> parentTasks = currTask.getParentTasks(); + currTask.setParentTasks(null); + if (parentTasks != null) { + for (Task tsk : parentTasks) { + // make new generated task depends on all the parent tasks of current task. + tsk.addDependentTask(newTask); + // remove the current task from its original parent task's dependent task + tsk.removeDependentTask(currTask); + } + } else { + // remove from current root task and add conditional task to root tasks + physicalContext.removeFromRootTask(currTask); + physicalContext.addToRootTask(newTask); + } + + // set all child tasks + List> oldChildTasks = currTask.getChildTasks(); + currTask.setChildTasks(null); + if (oldChildTasks != null) { + for (Task tsk : oldChildTasks) { + // make new generated task depends on all the parent tasks of current task. 
+ newTask.addDependentTask(tsk); + // remove the current task from its original parent task's dependent task + tsk.getParentTasks().remove(currTask); + } + } + } + + public ObjectPair, Long> + getAliasToSize(Context context, MapredWork currWork, + Map> pathToAliases) throws SemanticException { + HashMap aliasToSize = new HashMap(); + + try { + // go over all the input paths, and calculate a known total size, known + // size for each input alias. + Utilities.getInputSummary(context, currWork, null).getLength(); + + // set alias to size mapping, this can be used to determine if one table + // is chosen as big table, what's the total size of left tables, which + // are going to be small tables. + Long aliasTotalKnownInputSize = 0L; + for (Map.Entry> entry : pathToAliases.entrySet()) { + String path = entry.getKey(); + List aliasList = entry.getValue(); + ContentSummary cs = context.getCS(path); + if (cs != null) { + long size = cs.getLength(); + for (String alias : aliasList) { + aliasTotalKnownInputSize += size; + Long es = aliasToSize.get(alias); + if (es == null) { + es = new Long(0); + } + es += size; + aliasToSize.put(alias, es); + } + } + } + return new ObjectPair, Long>(aliasToSize, aliasTotalKnownInputSize); + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + } + + @Override + public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) + throws SemanticException { + if (nodeOutputs == null || nodeOutputs.length == 0) { + throw new SemanticException("No Dispatch Context"); + } + + TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0]; + + Task currTask = (Task) nd; + // not map reduce task or not conditional task, just skip + if (currTask.isMapRedTask()) { + if (currTask instanceof ConditionalTask) { + // get the list of task + List> taskList = ((ConditionalTask) currTask).getListTasks(); + for (Task tsk : taskList) { + if (tsk.isMapRedTask()) { + Task newTask = this.processCurrentTask((MapRedTask) tsk, + ((ConditionalTask) currTask), physicalContext.getContext()); + walkerCtx.addToDispatchList(newTask); + } + } + } else { + Task newTask = + this.processCurrentTask((MapRedTask) currTask, null, physicalContext.getContext()); + walkerCtx.addToDispatchList(newTask); + } + } + return null; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (working copy) @@ -0,0 +1,457 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
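getAliasToSize above folds each input path's length into a running total for every alias that reads that path; those per-alias totals later drive big-table selection. A minimal, self-contained version of that bookkeeping is sketched below, with plain maps standing in for Hive's pathToAliases and ContentSummary plumbing and sample sizes taken from the totalSize values of the partitions listed earlier in this patch.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

class AliasSizeSketch {
  // Every path contributes its length to each alias that scans it.
  static Map<String, Long> aliasToSize(Map<String, List<String>> pathToAliases,
                                       Map<String, Long> pathToLength) {
    Map<String, Long> aliasToSize = new HashMap<>();
    for (Map.Entry<String, List<String>> e : pathToAliases.entrySet()) {
      Long len = pathToLength.get(e.getKey());
      if (len == null) {
        continue;                     // size unknown for this path
      }
      for (String alias : e.getValue()) {
        aliasToSize.merge(alias, len, Long::sum);
      }
    }
    return aliasToSize;
  }

  public static void main(String[] args) {
    Map<String, List<String>> pathToAliases = Map.of(
        "/bucket_big/ds=2008-04-08", List.of("a"),
        "/bucket_big/ds=2008-04-09", List.of("a"),
        "/bucket_small/ds=2008-04-08", List.of("b"));
    Map<String, Long> pathToLength = Map.of(
        "/bucket_big/ds=2008-04-08", 2750L,
        "/bucket_big/ds=2008-04-09", 2750L,
        "/bucket_small/ds=2008-04-08", 226L);
    // a=5500, b=226 (map iteration order may vary)
    System.out.println(aliasToSize(pathToAliases, pathToLength));
  }
}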
+ */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.Serializable; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.ObjectPair; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin; +import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; +import org.apache.hadoop.hive.ql.plan.ConditionalWork; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.MapredLocalWork; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; + +/* + * Convert tasks involving JOIN into MAPJOIN. + * If hive.auto.convert.join is true, the tasks involving join are converted. + * Consider the query: + * select .... from T1 join T2 on T1.key = T2.key join T3 on T1.key = T3.key + * + * There is a map-reduce task which performs a 3-way join (T1, T2, T3). + * The task would be converted to a conditional task which would have 4 children + * a. Mapjoin considering T1 as the big table + * b. Mapjoin considering T2 as the big table + * c. Mapjoin considering T3 as the big table + * d. Map-reduce join (the original task). + * + * Note that the sizes of all the inputs may not be available at compile time. At runtime, it is + * determined which branch we want to pick up from the above. + * + * However, if hive.auto.convert.join.noconditionaltask is set to true, and + * the sum of any n-1 tables is smaller than hive.auto.convert.join.noconditionaltask.size, + * then a mapjoin is created instead of the conditional task. For the above, if the size of + * T1 + T2 is less than the threshold, then the task is converted to a mapjoin task with T3 as + * the big table. + * + * In this case, further optimization is performed by merging 2 consecutive map-only jobs. + * Consider the query: + * select ... from T1 join T2 on T1.key1 = T2.key1 join T3 on T1.key2 = T3.key2 + * + * Initially, the plan would consist of 2 Map-reduce jobs (1 to perform join for T1 and T2) + * followed by another map-reduce job (to perform join of the result with T3). After the + * optimization, both these tasks would be converted to map-only tasks. These 2 map-only jobs + * are then merged into a single map-only job. As a followup (HIVE-3952), it would be possible to + * merge a map-only task with a map-reduce task. 
+ * Consider the query: + * select T1.key2, count(*) from T1 join T2 on T1.key1 = T2.key1 group by T1.key2; + * Initially, the plan would consist of 2 Map-reduce jobs (1 to perform join for T1 and T2) + * followed by another map-reduce job (to perform groupby of the result). After the + * optimization, the join task would be converted to map-only tasks. After HIVE-3952, the map-only + * task would be merged with the map-reduce task to create a single map-reduce task. + */ + +/** + * Iterator each tasks. If this task has a local work,create a new task for this local work, named + * MapredLocalTask. then make this new generated task depends on current task's parent task, and + * make current task depends on this new generated task + */ +public class CommonJoinTaskDispatcher extends AbstractJoinTaskDispatcher implements Dispatcher { + + public CommonJoinTaskDispatcher(PhysicalContext context) { + super(context); + } + + // Get the position of the big table for this join operator and the given alias + private int getPosition(MapredWork work, Operator joinOp, + String alias) { + Operator parentOp = work.getAliasToWork().get(alias); + + // reduceSinkOperator's child is null, but joinOperator's parents is reduceSink + while ((parentOp.getChildOperators() != null) && + (!parentOp.getChildOperators().isEmpty())) { + parentOp = parentOp.getChildOperators().get(0); + } + + return joinOp.getParentOperators().indexOf(parentOp); + } + + /* + * A task and its child task has been converted from join to mapjoin. + * See if the two tasks can be merged. + */ + private void mergeMapJoinTaskWithChildMapJoinTask(MapRedTask task) { + MapRedTask childTask = (MapRedTask) task.getChildTasks().get(0); + MapredWork work = task.getWork(); + MapredLocalWork localWork = work.getMapLocalWork(); + MapredWork childWork = childTask.getWork(); + MapredLocalWork childLocalWork = childWork.getMapLocalWork(); + + // Can this be merged + Map> aliasToWork = work.getAliasToWork(); + if (aliasToWork.size() > 1) { + return; + } + + Operator op = aliasToWork.values().iterator().next(); + while (op.getChildOperators() != null) { + // Dont perform this optimization for multi-table inserts + if (op.getChildOperators().size() > 1) { + return; + } + op = op.getChildOperators().get(0); + } + + if (!(op instanceof FileSinkOperator)) { + return; + } + + FileSinkOperator fop = (FileSinkOperator) op; + String workDir = fop.getConf().getDirName(); + + Map> childPathToAliases = childWork.getPathToAliases(); + if (childPathToAliases.size() > 1) { + return; + } + + // The filesink writes to a different directory + if (!childPathToAliases.keySet().iterator().next().equals(workDir)) { + return; + } + + // Either of them should not be bucketed + if ((localWork.getBucketMapjoinContext() != null) || + (childLocalWork.getBucketMapjoinContext() != null)) { + return; + } + + // Merge the trees + if (childWork.getAliasToWork().size() > 1) { + return; + } + + Operator childAliasOp = + childWork.getAliasToWork().values().iterator().next(); + if (fop.getParentOperators().size() > 1) { + return; + } + + // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the + // top of the second + Operator parentFOp = fop.getParentOperators().get(0); + parentFOp.getChildOperators().remove(fop); + parentFOp.getChildOperators().add(childAliasOp); + List> parentOps = + new ArrayList>(); + parentOps.add(parentFOp); + childAliasOp.setParentOperators(parentOps); + + work.getAliasToPartnInfo().putAll(childWork.getAliasToPartnInfo()); + for (Map.Entry 
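mergeMapJoinTaskWithChildMapJoinTask, the method in progress here, collapses a converted map-join task and its converted map-join child into a single map-only job after checking that they share the intermediate directory and that neither side is bucketed. The graph surgery reduces to re-parenting the child's successors onto the surviving task, sketched below with a placeholder TaskNode class rather than Hive's Task class.

import java.util.ArrayList;
import java.util.List;

// Placeholder task node in the task DAG (illustration only).
class TaskNode {
  final String name;
  List<TaskNode> parents = new ArrayList<>();
  List<TaskNode> children = new ArrayList<>();

  TaskNode(String name) { this.name = name; }

  // Merge a direct child into this task: the child's children become this
  // task's children, and the child is detached from the graph.
  void absorbChild(TaskNode child) {
    children.remove(child);
    for (TaskNode grandChild : child.children) {
      grandChild.parents.remove(child);
      grandChild.parents.add(this);
      children.add(grandChild);
    }
    child.children.clear();
    child.parents.clear();
  }

  public static void main(String[] args) {
    TaskNode first = new TaskNode("map-join (T1 x T2)");
    TaskNode second = new TaskNode("map-join (result x T3)");
    TaskNode sink = new TaskNode("downstream task");
    first.children.add(second); second.parents.add(first);
    second.children.add(sink); sink.parents.add(second);

    first.absorbChild(second);
    System.out.println(first.children.get(0).name);  // downstream task
    System.out.println(sink.parents.get(0).name);    // map-join (T1 x T2)
  }
}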
childWorkEntry : childWork.getPathToPartitionInfo() + .entrySet()) { + if (childWork.getAliasToPartnInfo().containsValue(childWorkEntry.getKey())) { + work.getPathToPartitionInfo().put(childWorkEntry.getKey(), childWorkEntry.getValue()); + } + } + + localWork.getAliasToFetchWork().putAll(childLocalWork.getAliasToFetchWork()); + localWork.getAliasToWork().putAll(childLocalWork.getAliasToWork()); + + // remove the child task + List> oldChildTasks = childTask.getChildTasks(); + task.setChildTasks(oldChildTasks); + if (oldChildTasks != null) { + for (Task oldChildTask : oldChildTasks) { + oldChildTask.getParentTasks().remove(childTask); + oldChildTask.getParentTasks().add(task); + } + } + } + + // create map join task and set big table as bigTablePosition + private ObjectPair convertTaskToMapJoinTask(String xml, + int bigTablePosition) throws UnsupportedEncodingException, SemanticException { + // deep copy a new mapred work from xml + InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); + MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); + // create a mapred task for this work + MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext + .getParseContext().getConf()); + JoinOperator newJoinOp = getJoinOp(newTask); + + // optimize this newWork given the big table position + String bigTableAlias = + MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newJoinOp, bigTablePosition); + return new ObjectPair(newTask, bigTableAlias); + } + + @Override + public Task processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException { + + // whether it contains common join op; if contains, return this common join op + JoinOperator joinOp = getJoinOp(currTask); + if (joinOp == null) { + return null; + } + currTask.setTaskTag(Task.COMMON_JOIN); + + MapredWork currWork = currTask.getWork(); + + // create conditional work list and task list + List listWorks = new ArrayList(); + List> listTasks = new ArrayList>(); + + // create alias to task mapping and alias to input file mapping for resolver + HashMap> aliasToTask = + new HashMap>(); + HashMap> pathToAliases = currWork.getPathToAliases(); + Map> aliasToWork = currWork.getAliasToWork(); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp); + + // start to generate multiple map join tasks + JoinDesc joinDesc = joinOp.getConf(); + Byte[] order = joinDesc.getTagOrder(); + int numAliases = order.length; + + HashMap aliasToSize; + + try { + ObjectPair, Long> aliasToSizePair = + getAliasToSize(context, currWork, pathToAliases); + long aliasTotalKnownInputSize = aliasToSizePair.getSecond(); + aliasToSize = aliasToSizePair.getFirst(); + + HashSet bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc + .getConds()); + + // no table could be the big table; there is no need to convert + if (bigTableCandidates == null) { + return null; + } + + Configuration conf = context.getConf(); + + // If sizes of atleast n-1 tables in a n-way join is known, and their sum is smaller than + // the threshold size, convert the join into map-join and don't create a conditional task + boolean convertJoinMapJoin = HiveConf.getBoolVar(conf, + HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK); + int bigTablePosition = -1; + if (convertJoinMapJoin) { + // This is the threshold that the user has specified to fit in mapjoin + long mapJoinSize = 
HiveConf.getLongVar(conf, + HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD); + + boolean bigTableFound = false; + long largestTableSize = 0; + long sumTableSizes = 0; + for (String alias : aliasToWork.keySet()) { + Long size = aliasToSize.get(alias); + // The size is not available at compile time if the input is a sub-query. + // If the size of atleast n-1 inputs for a n-way join are available at compile time, + // and the sum of them is less than the specified threshold, then convert the join + // into a map-join without the conditional task. + if ((size == null) || (size > mapJoinSize)) { + sumTableSizes += largestTableSize; + if (bigTableFound || (sumTableSizes > mapJoinSize)) { + convertJoinMapJoin = false; + break; + } + bigTableFound = true; + bigTablePosition = getPosition(currWork, joinOp, alias); + largestTableSize = mapJoinSize + 1; + } else { + if (size > largestTableSize) { + sumTableSizes += largestTableSize; + largestTableSize = size; + bigTablePosition = getPosition(currWork, joinOp, alias); + } else { + sumTableSizes += size; + } + if (sumTableSizes > mapJoinSize) { + convertJoinMapJoin = false; + break; + } + } + } + } + + String bigTableAlias = null; + currWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); + currWork.setJoinTree(joinTree); + String xml = currWork.toXML(); + + if (convertJoinMapJoin) { + // create map join task and set big table as bigTablePosition + MapRedTask newTask = convertTaskToMapJoinTask(xml, bigTablePosition).getFirst(); + + newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP); + replaceTask(currTask, newTask, physicalContext); + + // Can this task be merged with the child task. This can happen if a big table is being + // joined with multiple small tables on different keys + // Further optimizations are possible here, a join which has been converted to a mapjoin + // followed by a mapjoin can be performed in a single MR job. + if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1) + && (newTask.getChildTasks().get(0).getTaskTag() == Task.MAPJOIN_ONLY_NOBACKUP)) { + mergeMapJoinTaskWithChildMapJoinTask(newTask); + } + + return newTask; + } + + long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, + HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); + for (int i = 0; i < numAliases; i++) { + // this table cannot be big table + if (!bigTableCandidates.contains(i)) { + continue; + } + + // create map join task and set big table as i + ObjectPair newTaskAlias = convertTaskToMapJoinTask(xml, i); + MapRedTask newTask = newTaskAlias.getFirst(); + bigTableAlias = newTaskAlias.getSecond(); + + Long aliasKnownSize = aliasToSize.get(bigTableAlias); + if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { + long smallTblTotalKnownSize = aliasTotalKnownInputSize + - aliasKnownSize.longValue(); + if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { + // this table is not good to be a big table. 
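The convertTaskToMapJoinTask method above clones the whole plan by serializing currWork to XML (currWork.toXML()) and deserializing it again with Utilities.deserializeMapRedWork, so that each big-table candidate gets an independent operator tree to rewrite. The snippet below only illustrates that clone-by-serialization idea with standard Java serialization; it is not the Hive utility, and the Plan class is a made-up stand-in for MapredWork.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

public class CloneBySerialization {

  // Stand-in for a query plan; MapredWork plays this role in the patch.
  static class Plan implements Serializable {
    private static final long serialVersionUID = 1L;
    String bigTableAlias;
    Plan(String bigTableAlias) { this.bigTableAlias = bigTableAlias; }
  }

  // Round-trip through a byte stream to obtain a deep, independent copy.
  @SuppressWarnings("unchecked")
  static <T extends Serializable> T deepCopy(T original) throws IOException, ClassNotFoundException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
      oos.writeObject(original);
    }
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray()))) {
      return (T) ois.readObject();
    }
  }

  public static void main(String[] args) throws Exception {
    Plan original = new Plan("a");
    Plan copy = deepCopy(original);
    copy.bigTableAlias = "b";  // mutate the copy for one big-table candidate
    System.out.println(original.bigTableAlias + " " + copy.bigTableAlias);  // prints "a b"
  }
}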
+ continue; + } + } + + // add into conditional task + listWorks.add(newTask.getWork()); + listTasks.add(newTask); + newTask.setTaskTag(Task.CONVERTED_MAPJOIN); + + // set up backup task + newTask.setBackupTask(currTask); + newTask.setBackupChildrenTasks(currTask.getChildTasks()); + + // put the mapping alias to task + aliasToTask.put(bigTableAlias, newTask); + } + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + + // insert current common join task to conditional task + listWorks.add(currTask.getWork()); + listTasks.add(currTask); + // clear JoinTree and OP Parse Context + currWork.setOpParseCtxMap(null); + currWork.setJoinTree(null); + + // create conditional task and insert conditional task into task tree + ConditionalWork cndWork = new ConditionalWork(listWorks); + ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf()); + cndTsk.setListTasks(listTasks); + + // set resolver and resolver context + cndTsk.setResolver(new ConditionalResolverCommonJoin()); + ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx(); + resolverCtx.setPathToAliases(pathToAliases); + resolverCtx.setAliasToKnownSize(aliasToSize); + resolverCtx.setAliasToTask(aliasToTask); + resolverCtx.setCommonJoinTask(currTask); + resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false)); + resolverCtx.setHdfsTmpDir(context.getMRScratchDir()); + cndTsk.setResolverCtx(resolverCtx); + + // replace the current task with the new generated conditional task + replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext); + return cndTsk; + } + + /* + * If any operator which does not allow map-side conversion is present in the mapper, dont + * convert it into a conditional task. + */ + private boolean checkOperatorOKMapJoinConversion(Operator op) { + if (!op.opAllowedConvertMapJoin()) { + return false; + } + + if (op.getChildOperators() == null) { + return true; + } + + for (Operator childOp : op.getChildOperators()) { + if (!checkOperatorOKMapJoinConversion(childOp)) { + return false; + } + } + + return true; + } + + private JoinOperator getJoinOp(MapRedTask task) throws SemanticException { + MapredWork work = task.getWork(); + if (work == null) { + return null; + } + Operator reducerOp = work.getReducer(); + if (reducerOp instanceof JoinOperator) { + /* Is any operator present, which prevents the conversion */ + Map> aliasToWork = work.getAliasToWork(); + for (Operator op : aliasToWork.values()) { + if (!checkOperatorOKMapJoinConversion(op)) { + return null; + } + } + return (JoinOperator) reducerOp; + } else { + return null; + } + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (working copy) @@ -51,7 +51,15 @@ } if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) { resolvers.add(new CommonJoinResolver()); + + // The joins have been automatically converted to map-joins. + // However, if the joins were converted to sort-merge joins automatically, + // they should also be tried as map-joins. 
+ if (!hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_NOCONDITIONALTASK)) { + resolvers.add(new SortMergeJoinResolver()); + } } + if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER)) { resolvers.add(new IndexWhereResolver()); } @@ -61,7 +69,7 @@ } // Physical optimizers which follow this need to be careful not to invalidate the inferences - // made by this optimizer. Only optimizers which depend on the results of this one should + // made by this optimizer. Only optimizers which depend on the results of this one should // follow it. if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_INFER_BUCKET_SORT)) { resolvers.add(new BucketingSortingInferenceOptimizer()); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java (working copy) @@ -0,0 +1,421 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.hadoop.hive.ql.optimizer.physical;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.exec.ConditionalTask;
+import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor;
+import org.apache.hadoop.hive.ql.parse.OpParseContext;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx;
+import org.apache.hadoop.hive.ql.plan.ConditionalWork;
+import org.apache.hadoop.hive.ql.plan.FetchWork;
+import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
+import org.apache.hadoop.hive.ql.plan.MapredWork;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
+
+/**
+ * Iterates over each task. If the task has an SMB join, convert the task to a conditional task.
+ * The conditional task will first try all map-join possibilities, and fall back to the SMB join
+ * if the map-join fails. The SMB join will be a backup task for all the map-join tasks.
+ */
+public class SortMergeJoinTaskDispatcher extends AbstractJoinTaskDispatcher implements Dispatcher {
+
+  public SortMergeJoinTaskDispatcher(PhysicalContext context) {
+    super(context);
+  }
+
+  // Convert the work in the SMB plan to a regular join.
+  // Note that the operator tree is not fixed, only the path/alias mappings in the
+  // plan are fixed.
+  // The operator tree will still contain the SMBJoinOperator.
+  private void genSMBJoinWork(MapredWork currWork, SMBMapJoinOperator smbJoinOp) {
+    // Remove the paths which are not part of aliasToPartitionInfo
+    Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
+    List<String> removePaths = new ArrayList<String>();
+
+    for (Map.Entry<String, ArrayList<String>> entry : currWork.getPathToAliases().entrySet()) {
+      boolean keepPath = false;
+      for (String alias : entry.getValue()) {
+        if (aliasToPartitionInfo.containsKey(alias)) {
+          keepPath = true;
+          break;
+        }
+      }
+
+      // Remove the path if none of its aliases is present
+      if (!keepPath) {
+        removePaths.add(entry.getKey());
+      }
+    }
+
+    List<String> removeAliases = new ArrayList<String>();
+    for (String removePath : removePaths) {
+      removeAliases.addAll(currWork.getPathToAliases().get(removePath));
+      currWork.getPathToAliases().remove(removePath);
+      currWork.getPathToPartitionInfo().remove(removePath);
+    }
+
+    for (String alias : removeAliases) {
+      currWork.getAliasToPartnInfo().remove(alias);
+      currWork.getAliasToWork().remove(alias);
+    }
+
+    // Get the MapredLocalWork
+    MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
+
+    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : localWork.getAliasToWork()
+        .entrySet()) {
+      String alias = entry.getKey();
+      Operator<? extends OperatorDesc> op = entry.getValue();
+      FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
+
+      // Add the entry to the MapredWork
+      currWork.getAliasToWork().put(alias, op);
+
+      PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
+      if (fetchWork.getTblDir() != null) {
+        ArrayList<String> aliases = new ArrayList<String>();
+        aliases.add(alias);
+        currWork.getPathToAliases().put(fetchWork.getTblDir(), aliases);
+        currWork.getPathToPartitionInfo().put(fetchWork.getTblDir(), partitionInfo);
+      }
+      else {
+        for (String pathDir : fetchWork.getPartDir()) {
+          ArrayList<String> aliases = new ArrayList<String>();
+          aliases.add(alias);
+          currWork.getPathToAliases().put(pathDir, aliases);
+          currWork.getPathToPartitionInfo().put(pathDir, partitionInfo);
+        }
+      }
+    }
+
+    // Remove the dummy store operator from the tree
+    for (Operator<? extends OperatorDesc> parentOp : smbJoinOp.getParentOperators()) {
+      if (parentOp instanceof DummyStoreOperator) {
+        Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
+        smbJoinOp.replaceParent(parentOp, grandParentOp);
+        grandParentOp.setChildOperators(parentOp.getChildOperators());
+        parentOp.setParentOperators(null);
+      }
+    }
+  }
+
+  /*
+   * Convert the work containing the sort-merge join into a work as if it had a regular join.
+   * Note that the operator tree is not changed - it still contains the SMB join, but the
+   * plan is changed (aliasToWork etc.) to contain all the paths as if it was a regular join.
+ */ + private MapredWork convertSMBWorkToJoinWork(MapredWork currWork, SMBMapJoinOperator oldSMBJoinOp) + throws SemanticException { + try { + String xml = currWork.toXML(); + + // deep copy a new mapred work + InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); + MapredWork currJoinWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); + SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork); + + // Add the row resolver for the new operator + Map, OpParseContext> opParseContextMap = + physicalContext.getParseContext().getOpParseCtx(); + opParseContextMap.put(newSMBJoinOp, opParseContextMap.get(oldSMBJoinOp)); + // change the newly created map-red plan as if it was a join operator + genSMBJoinWork(currJoinWork, newSMBJoinOp); + return currJoinWork; + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + } + + // create map join task and set big table as bigTablePosition + private ObjectPair convertSMBTaskToMapJoinTask(String xml, + int bigTablePosition, + SMBMapJoinOperator smbJoinOp, + QBJoinTree joinTree) + throws UnsupportedEncodingException, SemanticException { + // deep copy a new mapred work from xml + InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); + MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); + // create a mapred task for this work + MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext + .getParseContext().getConf()); + // generate the map join operator; already checked the map join + MapJoinOperator newMapJoinOp = + getMapJoinOperator(newTask, newWork, smbJoinOp, joinTree, bigTablePosition); + // The reducer needs to be restored - Consider a query like: + // select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + // The reducer contains a groupby, which needs to be restored. + Operator reducer = newWork.getReducer(); + + // create the local work for this plan + String bigTableAlias = + MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newMapJoinOp, bigTablePosition); + + // restore the reducer + newWork.setReducer(reducer); + return new ObjectPair(newTask, bigTableAlias); + } + + @Override + public Task processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException { + + // whether it contains a sort merge join operator + MapredWork currWork = currTask.getWork(); + SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork); + if (originalSMBJoinOp == null) { + return null; + } + + // Only create a map-join if the user explicitly gave a join (without a mapjoin hint) + if (!originalSMBJoinOp.isConvertedAutomaticallySMBJoin()) { + return null; + } + currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + QBJoinTree joinTree = parseCtx.getSmbMapJoinContext().get(originalSMBJoinOp); + + // Convert the work containing to sort-merge join into a work, as if it had a regular join. + // Note that the operator tree is not changed - is still contains the SMB join, but the + // plan is changed (aliasToWork etc.) to contain all the paths as if it was a regular join. + // This is used to convert the plan to a map-join, and then the original SMB join plan is used + // as a backup task. 
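The comment above describes the fallback scheme: every generated map-join task gets the original sort-merge join task registered as its backup (via setBackupTask further down), so a runtime failure of the map-join falls back to the SMB plan. Below is a generic sketch of that pattern; the classes are invented for illustration and are not Hive's.

import java.util.concurrent.Callable;

public class BackupTaskSketch {

  // A tiny stand-in for a task that may have a backup to run if it fails.
  static class SimpleTask {
    final String name;
    final Callable<Boolean> body;   // returns true on success
    SimpleTask backup;

    SimpleTask(String name, Callable<Boolean> body) {
      this.name = name;
      this.body = body;
    }
  }

  // Run the task; on failure, fall back to its backup task (if any).
  static boolean run(SimpleTask task) {
    try {
      if (task.body.call()) {
        System.out.println(task.name + " succeeded");
        return true;
      }
    } catch (Exception ignored) {
      // fall through to the backup
    }
    System.out.println(task.name + " failed");
    return task.backup != null && run(task.backup);
  }

  public static void main(String[] args) {
    SimpleTask smbJoin = new SimpleTask("sort-merge join", () -> true);
    SimpleTask mapJoin = new SimpleTask("map-join (big table = a)",
        () -> { throw new RuntimeException("small tables did not fit in memory"); });
    mapJoin.backup = smbJoin;   // analogous to newTask.setBackupTask(currTask)
    run(mapJoin);               // prints: map-join ... failed, then: sort-merge join succeeded
  }
}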
+ MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp); + SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork); + + currWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); + currWork.setJoinTree(joinTree); + currJoinWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); + currJoinWork.setJoinTree(joinTree); + + // create conditional work list and task list + List listWorks = new ArrayList(); + List> listTasks = new ArrayList>(); + + // create alias to task mapping and alias to input file mapping for resolver + HashMap> aliasToTask = + new HashMap>(); + // Note that pathToAlias will behave as if the original plan was a join plan + HashMap> pathToAliases = currJoinWork.getPathToAliases(); + + // generate a map join task for the big table + SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf(); + Byte[] order = originalSMBJoinDesc.getTagOrder(); + int numAliases = order.length; + Set bigTableCandidates = + MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds()); + + // no table could be the big table; there is no need to convert + if (bigTableCandidates == null) { + return null; + } + + HashMap aliasToSize; + Configuration conf = context.getConf(); + try { + ObjectPair, Long> aliasToSizePair = + getAliasToSize(context, currJoinWork, pathToAliases); + long aliasTotalKnownInputSize = aliasToSizePair.getSecond(); + aliasToSize = aliasToSizePair.getFirst(); + + String xml = currJoinWork.toXML(); + long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, + HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); + + for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) { + // this table cannot be big table + if (!bigTableCandidates.contains(bigTablePosition)) { + continue; + } + + // create map join task for the given big table position + ObjectPair newTaskAlias = + convertSMBTaskToMapJoinTask(xml, bigTablePosition, newSMBJoinOp, joinTree); + MapRedTask newTask = newTaskAlias.getFirst(); + String bigTableAlias = newTaskAlias.getSecond(); + + Long aliasKnownSize = aliasToSize.get(bigTableAlias); + if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { + long smallTblTotalKnownSize = aliasTotalKnownInputSize + - aliasKnownSize.longValue(); + if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { + // this table is not good to be a big table. 
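The size check above treats everything other than the chosen big table as small tables that must fit under the HIVESMALLTABLESFILESIZE threshold: the small-table total is the total known input size minus the big-table candidate's known size. A short worked example of that arithmetic follows (the sizes and the threshold are made up for illustration, not values from the patch).

public class SmallTableSumCheck {
  public static void main(String[] args) {
    long aliasTotalKnownInputSize = 120L << 20;  // all known inputs together: 120 MB
    long bigTableKnownSize = 100L << 20;         // candidate big table: 100 MB
    long thresholdOfSmallTblSizeSum = 25L << 20; // threshold for the small tables: 25 MB

    // Everything that is not the big table must be small enough to load into memory.
    long smallTblTotalKnownSize = aliasTotalKnownInputSize - bigTableKnownSize;  // 20 MB
    boolean keepCandidate = smallTblTotalKnownSize <= thresholdOfSmallTblSizeSum;
    System.out.println("small tables = " + (smallTblTotalKnownSize >> 20)
        + " MB, candidate kept = " + keepCandidate);  // candidate kept = true
  }
}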
+ continue; + } + } + + // add into conditional task + listWorks.add(newTask.getWork()); + listTasks.add(newTask); + newTask.setTaskTag(Task.CONVERTED_MAPJOIN); + + // set up backup task + newTask.setBackupTask(currTask); + newTask.setBackupChildrenTasks(currTask.getChildTasks()); + + // put the mapping alias to task + aliasToTask.put(bigTableAlias, newTask); + } + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + + // insert current common join task to conditional task + listWorks.add(currTask.getWork()); + listTasks.add(currTask); + // clear JoinTree and OP Parse Context + currWork.setOpParseCtxMap(null); + currWork.setJoinTree(null); + + // create conditional task and insert conditional task into task tree + ConditionalWork cndWork = new ConditionalWork(listWorks); + ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf()); + cndTsk.setListTasks(listTasks); + + // set resolver and resolver context + cndTsk.setResolver(new ConditionalResolverCommonJoin()); + ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx(); + resolverCtx.setPathToAliases(pathToAliases); + resolverCtx.setAliasToKnownSize(aliasToSize); + resolverCtx.setAliasToTask(aliasToTask); + resolverCtx.setCommonJoinTask(currTask); + resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false)); + resolverCtx.setHdfsTmpDir(context.getMRScratchDir()); + cndTsk.setResolverCtx(resolverCtx); + + // replace the current task with the new generated conditional task + replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext); + return cndTsk; + } + + /** + * If a join/union is followed by a SMB join, this cannot be converted to a conditional task. + */ + private boolean reducerAllowedSMBJoinOp(Operator reducer) { + while (reducer != null) { + if (!reducer.opAllowedConvertSortMergeJoin()) { + return false; + } + + List> childOps = reducer.getChildOperators(); + if ((childOps == null) || (childOps.isEmpty())) { + return true; + } + + // multi-table inserts not supported + if (childOps.size() > 1) { + return false; + } + reducer = childOps.get(0); + } + + return true; + } + + private SMBMapJoinOperator getSMBMapJoinOp(Operator currOp, + Operator reducer) { + SMBMapJoinOperator ret = null; + while (true) { + if (currOp instanceof SMBMapJoinOperator) { + if (ret != null) { + return null; + } + ret = (SMBMapJoinOperator) currOp; + } + + // Does any operator in the tree stop the task from being converted to a conditional task + if (!currOp.opAllowedConvertSortMergeJoin()) { + return null; + } + + List> childOps = currOp.getChildOperators(); + if ((childOps == null) || (childOps.isEmpty())) { + return reducerAllowedSMBJoinOp(reducer) ? 
ret : null; + } + + // multi-table inserts not supported + if (childOps.size() > 1) { + return null; + } + currOp = childOps.get(0); + } + } + + private SMBMapJoinOperator getSMBMapJoinOp(MapredWork work) throws SemanticException { + if (work != null) { + Operator reducer = work.getReducer(); + for (Operator op : work.getAliasToWork().values()) { + SMBMapJoinOperator smbMapJoinOp = getSMBMapJoinOp(op, reducer); + if (smbMapJoinOp != null) { + return smbMapJoinOp; + } + } + } + return null; + } + + private MapJoinOperator getMapJoinOperator(MapRedTask task, + MapredWork work, + SMBMapJoinOperator oldSMBJoinOp, + QBJoinTree joinTree, + int mapJoinPos) throws SemanticException { + SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(task.getWork()); + + // Add the row resolver for the new operator + Map, OpParseContext> opParseContextMap = + physicalContext.getParseContext().getOpParseCtx(); + opParseContextMap.put(newSMBJoinOp, opParseContextMap.get(oldSMBJoinOp)); + + // generate the map join operator + return MapJoinProcessor.convertSMBJoinToMapJoin(opParseContextMap, newSMBJoinOp, + joinTree, mapJoinPos, true); + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (working copy) @@ -17,46 +17,13 @@ */ package org.apache.hadoop.hive.ql.optimizer.physical; -import java.io.ByteArrayInputStream; -import java.io.InputStream; -import java.io.Serializable; -import java.io.UnsupportedEncodingException; import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; import java.util.List; -import java.util.Map; -import java.util.Stack; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.ContentSummary; -import org.apache.hadoop.hive.common.ObjectPair; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.Context; -import org.apache.hadoop.hive.ql.exec.ConditionalTask; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator; -import org.apache.hadoop.hive.ql.exec.JoinOperator; -import org.apache.hadoop.hive.ql.exec.MapRedTask; -import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.exec.TaskFactory; -import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; -import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext; -import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; -import org.apache.hadoop.hive.ql.parse.ParseContext; -import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin; -import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; -import org.apache.hadoop.hive.ql.plan.ConditionalWork; -import org.apache.hadoop.hive.ql.plan.JoinDesc; -import org.apache.hadoop.hive.ql.plan.MapredLocalWork; -import org.apache.hadoop.hive.ql.plan.MapredWork; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.PartitionDesc; /* * Convert tasks involving JOIN into MAPJOIN. 
@@ -105,494 +72,11 @@ TaskGraphWalker ogw = new TaskGraphWalker(disp); // get all the tasks nodes from root task - ArrayList topNodes = new ArrayList(); + List topNodes = new ArrayList(); topNodes.addAll(pctx.rootTasks); // begin to walk through the task tree. ogw.startWalking(topNodes, null); return pctx; } - - /** - * Iterator each tasks. If this task has a local work,create a new task for this local work, named - * MapredLocalTask. then make this new generated task depends on current task's parent task, and - * make current task depends on this new generated task - */ - class CommonJoinTaskDispatcher implements Dispatcher { - - private final PhysicalContext physicalContext; - - public CommonJoinTaskDispatcher(PhysicalContext context) { - super(); - physicalContext = context; - } - - // Get the position of the big table for this join operator and the given alias - private int getPosition(MapredWork work, Operator joinOp, - String alias) { - Operator parentOp = work.getAliasToWork().get(alias); - - // reduceSinkOperator's child is null, but joinOperator's parents is reduceSink - while ((parentOp.getChildOperators() != null) && - (!parentOp.getChildOperators().isEmpty())) { - parentOp = parentOp.getChildOperators().get(0); - } - - return joinOp.getParentOperators().indexOf(parentOp); - } - - /* - * A task and its child task has been converted from join to mapjoin. - * See if the two tasks can be merged. - */ - private void mergeMapJoinTaskWithChildMapJoinTask(MapRedTask task) { - MapRedTask childTask = (MapRedTask)task.getChildTasks().get(0); - MapredWork work = task.getWork(); - MapredLocalWork localWork = work.getMapLocalWork(); - MapredWork childWork = childTask.getWork(); - MapredLocalWork childLocalWork = childWork.getMapLocalWork(); - - // Can this be merged - Map> aliasToWork = work.getAliasToWork(); - if (aliasToWork.size() > 1) { - return; - } - - Operator op = aliasToWork.values().iterator().next(); - while (op.getChildOperators() != null) { - // Dont perform this optimization for multi-table inserts - if (op.getChildOperators().size() > 1) { - return; - } - op = op.getChildOperators().get(0); - } - - if (!(op instanceof FileSinkOperator)) { - return; - } - - FileSinkOperator fop = (FileSinkOperator)op; - String workDir = fop.getConf().getDirName(); - - Map> childPathToAliases = childWork.getPathToAliases(); - if (childPathToAliases.size() > 1) { - return; - } - - // The filesink writes to a different directory - if (!childPathToAliases.keySet().iterator().next().equals(workDir)) { - return; - } - - // Either of them should not be bucketed - if ((localWork.getBucketMapjoinContext() != null) || - (childLocalWork.getBucketMapjoinContext() != null)) { - return; - } - - // Merge the trees - if (childWork.getAliasToWork().size() > 1) { - return; - } - - Operator childAliasOp = - childWork.getAliasToWork().values().iterator().next(); - if (fop.getParentOperators().size() > 1) { - return; - } - - // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the - // top of the second - Operator parentFOp = fop.getParentOperators().get(0); - parentFOp.getChildOperators().remove(fop); - parentFOp.getChildOperators().add(childAliasOp); - List> parentOps = - new ArrayList>(); - parentOps.add(parentFOp); - childAliasOp.setParentOperators(parentOps); - - work.getAliasToPartnInfo().putAll(childWork.getAliasToPartnInfo()); - for (Map.Entry childWorkEntry : - childWork.getPathToPartitionInfo().entrySet()) { - if 
(childWork.getAliasToPartnInfo().containsValue(childWorkEntry.getKey())) { - work.getPathToPartitionInfo().put(childWorkEntry.getKey(), childWorkEntry.getValue()); - } - } - - localWork.getAliasToFetchWork().putAll(childLocalWork.getAliasToFetchWork()); - localWork.getAliasToWork().putAll(childLocalWork.getAliasToWork()); - - // remove the child task - List> oldChildTasks = childTask.getChildTasks(); - task.setChildTasks(oldChildTasks); - if (oldChildTasks != null) { - for (Task oldChildTask : oldChildTasks) { - oldChildTask.getParentTasks().remove(childTask); - oldChildTask.getParentTasks().add(task); - } - } - } - - // create map join task and set big table as bigTablePosition - private ObjectPair convertTaskToMapJoinTask(String xml, - int bigTablePosition) throws UnsupportedEncodingException, SemanticException { - // deep copy a new mapred work from xml - InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); - MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); - // create a mapred task for this work - MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext - .getParseContext().getConf()); - JoinOperator newJoinOp = getJoinOp(newTask); - - // optimize this newWork and assume big table position is i - String bigTableAlias = - MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newJoinOp, bigTablePosition); - return new ObjectPair(newTask, bigTableAlias); - } - - private Task processCurrentTask(MapRedTask currTask, - ConditionalTask conditionalTask, Context context) - throws SemanticException { - - // whether it contains common join op; if contains, return this common join op - JoinOperator joinOp = getJoinOp(currTask); - if (joinOp == null) { - return null; - } - currTask.setTaskTag(Task.COMMON_JOIN); - - MapredWork currWork = currTask.getWork(); - - // create conditional work list and task list - List listWorks = new ArrayList(); - List> listTasks = new ArrayList>(); - - // create alias to task mapping and alias to input file mapping for resolver - HashMap> aliasToTask = new HashMap>(); - HashMap> pathToAliases = currWork.getPathToAliases(); - Map> aliasToWork = currWork.getAliasToWork(); - - // get parseCtx for this Join Operator - ParseContext parseCtx = physicalContext.getParseContext(); - QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp); - - // start to generate multiple map join tasks - JoinDesc joinDesc = joinOp.getConf(); - Byte[] order = joinDesc.getTagOrder(); - int numAliases = order.length; - - long aliasTotalKnownInputSize = 0; - HashMap aliasToSize = new HashMap(); - try { - // go over all the input paths, and calculate a known total size, known - // size for each input alias. - Utilities.getInputSummary(context, currWork, null).getLength(); - - // set alias to size mapping, this can be used to determine if one table - // is choosen as big table, what's the total size of left tables, which - // are going to be small tables. 
- for (Map.Entry> entry : pathToAliases.entrySet()) { - String path = entry.getKey(); - List aliasList = entry.getValue(); - ContentSummary cs = context.getCS(path); - if (cs != null) { - long size = cs.getLength(); - for (String alias : aliasList) { - aliasTotalKnownInputSize += size; - Long es = aliasToSize.get(alias); - if (es == null) { - es = new Long(0); - } - es += size; - aliasToSize.put(alias, es); - } - } - } - - HashSet bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds()); - - // no table could be the big table; there is no need to convert - if (bigTableCandidates == null) { - return null; - } - - Configuration conf = context.getConf(); - - // If sizes of atleast n-1 tables in a n-way join is known, and their sum is smaller than - // the threshold size, convert the join into map-join and don't create a conditional task - boolean convertJoinMapJoin = HiveConf.getBoolVar(conf, - HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK); - int bigTablePosition = -1; - if (convertJoinMapJoin) { - // This is the threshold that the user has specified to fit in mapjoin - long mapJoinSize = HiveConf.getLongVar(conf, - HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD); - - boolean bigTableFound = false; - long largestTableSize = 0; - long sumTableSizes = 0; - for (String alias : aliasToWork.keySet()) { - Long size = aliasToSize.get(alias); - // The size is not available at compile time if the input is a sub-query. - // If the size of atleast n-1 inputs for a n-way join are available at compile time, - // and the sum of them is less than the specified threshold, then convert the join - // into a map-join without the conditional task. - if ((size == null) || (size > mapJoinSize)) { - sumTableSizes += largestTableSize; - if (bigTableFound || (sumTableSizes > mapJoinSize)) { - convertJoinMapJoin = false; - break; - } - bigTableFound = true; - bigTablePosition = getPosition(currWork, joinOp, alias); - largestTableSize = mapJoinSize + 1; - } else { - if (size > largestTableSize) { - sumTableSizes += largestTableSize; - largestTableSize = size; - bigTablePosition = getPosition(currWork, joinOp, alias); - } else { - sumTableSizes += size; - } - if (sumTableSizes > mapJoinSize) { - convertJoinMapJoin = false; - break; - } - } - } - } - - String bigTableAlias = null; - currWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); - currWork.setJoinTree(joinTree); - String xml = currWork.toXML(); - - if (convertJoinMapJoin) { - // create map join task and set big table as bigTablePosition - MapRedTask newTask = convertTaskToMapJoinTask(xml, bigTablePosition).getFirst(); - - newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP); - replaceTask(currTask, newTask, physicalContext); - - // Can this task be merged with the child task. This can happen if a big table is being - // joined with multiple small tables on different keys - // Further optimizations are possible here, a join which has been converted to a mapjoin - // followed by a mapjoin can be performed in a single MR job. 
- if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1) - && (newTask.getChildTasks().get(0).getTaskTag() == Task.MAPJOIN_ONLY_NOBACKUP)) { - mergeMapJoinTaskWithChildMapJoinTask(newTask); - } - - return newTask; - } - - long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, - HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); - for (int i = 0; i < numAliases; i++) { - // this table cannot be big table - if (!bigTableCandidates.contains(i)) { - continue; - } - - // create map join task and set big table as i - ObjectPair newTaskAlias = convertTaskToMapJoinTask(xml, i); - MapRedTask newTask = newTaskAlias.getFirst(); - bigTableAlias = newTaskAlias.getSecond(); - - Long aliasKnownSize = aliasToSize.get(bigTableAlias); - if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { - long smallTblTotalKnownSize = aliasTotalKnownInputSize - - aliasKnownSize.longValue(); - if(smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { - //this table is not good to be a big table. - continue; - } - } - - // add into conditional task - listWorks.add(newTask.getWork()); - listTasks.add(newTask); - newTask.setTaskTag(Task.CONVERTED_MAPJOIN); - - //set up backup task - newTask.setBackupTask(currTask); - newTask.setBackupChildrenTasks(currTask.getChildTasks()); - - // put the mapping alias to task - aliasToTask.put(bigTableAlias, newTask); - } - } catch (Exception e) { - e.printStackTrace(); - throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); - } - - // insert current common join task to conditional task - listWorks.add(currTask.getWork()); - listTasks.add(currTask); - // clear JoinTree and OP Parse Context - currWork.setOpParseCtxMap(null); - currWork.setJoinTree(null); - - // create conditional task and insert conditional task into task tree - ConditionalWork cndWork = new ConditionalWork(listWorks); - ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf()); - cndTsk.setListTasks(listTasks); - - // set resolver and resolver context - cndTsk.setResolver(new ConditionalResolverCommonJoin()); - ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx(); - resolverCtx.setPathToAliases(pathToAliases); - resolverCtx.setAliasToKnownSize(aliasToSize); - resolverCtx.setAliasToTask(aliasToTask); - resolverCtx.setCommonJoinTask(currTask); - resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false)); - resolverCtx.setHdfsTmpDir(context.getMRScratchDir()); - cndTsk.setResolverCtx(resolverCtx); - - //replace the current task with the new generated conditional task - this.replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext); - return cndTsk; - } - - private void replaceTaskWithConditionalTask( - Task currTask, ConditionalTask cndTsk, - PhysicalContext physicalContext) { - // add this task into task tree - // set all parent tasks - List> parentTasks = currTask.getParentTasks(); - currTask.setParentTasks(null); - if (parentTasks != null) { - for (Task tsk : parentTasks) { - // make new generated task depends on all the parent tasks of current task. 
- tsk.addDependentTask(cndTsk); - // remove the current task from its original parent task's dependent task - tsk.removeDependentTask(currTask); - } - } else { - // remove from current root task and add conditional task to root tasks - physicalContext.removeFromRootTask(currTask); - physicalContext.addToRootTask(cndTsk); - } - // set all child tasks - List> oldChildTasks = currTask.getChildTasks(); - if (oldChildTasks != null) { - for (Task tsk : cndTsk.getListTasks()) { - if (tsk.equals(currTask)) { - continue; - } - for (Task oldChild : oldChildTasks) { - tsk.addDependentTask(oldChild); - } - } - } - } - - // Replace the task with the new task. Copy the children and parents of the old - // task to the new task. - private void replaceTask( - Task currTask, Task newTask, - PhysicalContext physicalContext) { - // add this task into task tree - // set all parent tasks - List> parentTasks = currTask.getParentTasks(); - currTask.setParentTasks(null); - if (parentTasks != null) { - for (Task tsk : parentTasks) { - // make new generated task depends on all the parent tasks of current task. - tsk.addDependentTask(newTask); - // remove the current task from its original parent task's dependent task - tsk.removeDependentTask(currTask); - } - } else { - // remove from current root task and add conditional task to root tasks - physicalContext.removeFromRootTask(currTask); - physicalContext.addToRootTask(newTask); - } - - // set all child tasks - List> oldChildTasks = currTask.getChildTasks(); - currTask.setChildTasks(null); - if (oldChildTasks != null) { - for (Task tsk : oldChildTasks) { - // make new generated task depends on all the parent tasks of current task. - newTask.addDependentTask(tsk); - // remove the current task from its original parent task's dependent task - tsk.getParentTasks().remove(currTask); - } - } - } - - @Override - public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) - throws SemanticException { - if (nodeOutputs == null || nodeOutputs.length == 0) { - throw new SemanticException("No Dispatch Context"); - } - - TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0]; - - Task currTask = (Task) nd; - // not map reduce task or not conditional task, just skip - if (currTask.isMapRedTask()) { - if (currTask instanceof ConditionalTask) { - // get the list of task - List> taskList = ((ConditionalTask) currTask).getListTasks(); - for (Task tsk : taskList) { - if (tsk.isMapRedTask()) { - Task newTask = this.processCurrentTask((MapRedTask) tsk, - ((ConditionalTask) currTask), physicalContext.getContext()); - walkerCtx.addToDispatchList(newTask); - } - } - } else { - Task newTask = - this.processCurrentTask((MapRedTask) currTask, null, physicalContext.getContext()); - walkerCtx.addToDispatchList(newTask); - } - } - return null; - } - - /* - * If any operator which does not allow map-side conversion is present in the mapper, dont - * convert it into a conditional task. 
- */ - private boolean checkOperatorOKMapJoinConversion(Operator op) { - if (!op.opAllowedConvertMapJoin()) { - return false; - } - - if (op.getChildOperators() == null) { - return true; - } - - for (Operator childOp : op.getChildOperators()) { - if (!checkOperatorOKMapJoinConversion(childOp)) { - return false; - } - } - - return true; - } - - private JoinOperator getJoinOp(MapRedTask task) throws SemanticException { - MapredWork work = task.getWork(); - if (work == null) { - return null; - } - Operator reducerOp = work.getReducer(); - if (reducerOp instanceof JoinOperator) { - /* Is any operator present, which prevents the conversion */ - Map> aliasToWork = work.getAliasToWork(); - for (Operator op : aliasToWork.values()) { - if (!checkOperatorOKMapJoinConversion(op)) { - return null; - } - } - return (JoinOperator) reducerOp; - } else { - return null; - } - } - } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java (working copy) @@ -274,4 +274,9 @@ // optimizations for now. return false; } + + @Override + public boolean opAllowedConvertSortMergeJoin() { + return false; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java (working copy) @@ -166,5 +166,4 @@ protected boolean hasAnyNulls(AbstractMapJoinKey key) { return key.hasAnyNulls(nullsafes); } - } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (working copy) @@ -80,6 +80,7 @@ // hive.auto.convert.join.noconditionaltask is set to true. No conditional task was // created in case the mapjoin failed. public static final int MAPJOIN_ONLY_NOBACKUP = 6; + public static final int CONVERTED_SORTMERGEJOIN = 7; // Descendants tasks who subscribe feeds from this task protected transient List> feedSubscribers; Index: ql/src/java/org/apache/hadoop/hive/ql/exec/UnionOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/UnionOperator.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/UnionOperator.java (working copy) @@ -163,4 +163,9 @@ public boolean opAllowedAfterMapJoin() { return false; } + + @Override + public boolean opAllowedConvertSortMergeJoin() { + return false; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy) @@ -1514,4 +1514,13 @@ public boolean opAllowedConvertMapJoin() { return true; } + + /* + * If this task contains a sortmergejoin, it can be converted to a map-join task if this operator + * is present in the mapper. For eg. if a sort-merge join operator is present followed by a + * regular join, it cannot be converted to a auto map-join. 
+ */ + public boolean opAllowedConvertSortMergeJoin() { + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverCommonJoin.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverCommonJoin.java (revision 1447702) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverCommonJoin.java (working copy) @@ -71,7 +71,7 @@ } public HashMap getAliasToKnownSize() { - return aliasToKnownSize; + return aliasToKnownSize == null ? new HashMap() : aliasToKnownSize; } public void setAliasToKnownSize(HashMap aliasToKnownSize) {