Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1447595)
+++ conf/hive-default.xml.template (working copy)
@@ -956,6 +956,20 @@
+<property>
+  <name>hive.auto.convert.sortmerge.join.noconditionaltask</name>
+  <value>true</value>
+  <description>If hive.auto.convert.sortmerge.join is set to true, and a join was converted to a sort-merge join,
+    this parameter decides whether each table should be tried as a big table, and effectively a map-join should be
+    tried. That would create a conditional task with n+1 children for an n-way join (1 child for each table as the
+    big table), and the backup task will be the sort-merge join. In some cases, a map-join would be faster than a
+    sort-merge join, if there is no advantage of having the output bucketed and sorted. For example, if a very big sorted
+    and bucketed table with few files (say 10 files) is being joined with a very small sorted and bucketed table
+    with few files (10 files), the sort-merge join will only use 10 mappers, and a simple map-only join might be faster
+    if the complete small table can fit in memory, and a map-join can be performed.
+  </description>
+</property>
+
   <name>hive.metastore.ds.connection.url.hook</name>
   <description>Name of the hook to use for retriving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used</description>
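For illustration only (not part of the patch): a minimal HiveQL sketch of how the new flag would typically be exercised from a session, assuming tables bucketed and sorted on the join key like tbl1/tbl2/tbl3 in the auto_sortmerge_join_6.q output below.

  -- Prerequisites for automatic sort-merge join conversion (existing settings).
  set hive.optimize.bucketmapjoin = true;
  set hive.optimize.bucketmapjoin.sortedmerge = true;
  set hive.auto.convert.sortmerge.join = true;
  -- New flag added by this patch (default true): also try each table as the big
  -- table of a map-join, keeping the sort-merge join as the backup task.
  set hive.auto.convert.sortmerge.join.noconditionaltask = true;
  -- For an n-way join this produces a conditional task with n+1 children, as in
  -- the Stage-8 / Stage-9 / Stage-10 / Stage-11 / Stage-1 plan further down.
  explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key;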
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1447595)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -504,6 +504,8 @@
HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR(
"hive.auto.convert.sortmerge.join.bigtable.selection.policy",
"org.apache.hadoop.hive.ql.optimizer.LeftmostBigTableSelectorForAutoSMJ"),
+ HIVE_AUTO_SORTMERGE_JOIN_NOCONDITIONALTASK(
+ "hive.auto.convert.sortmerge.join.noconditionaltask", true),
HIVESCRIPTOPERATORTRUST("hive.exec.script.trust", false),
HIVEROWOFFSET("hive.exec.rowoffset", false),
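Also for illustration (not part of the patch): since HIVE_AUTO_SORTMERGE_JOIN_NOCONDITIONALTASK above defaults to true, a session that wants the plain sort-merge plan with no per-table map-join attempts would opt out explicitly; a sketch:

  -- With the flag off, the converted join stays a Sorted Merge Bucket Map Join,
  -- with no conditional task and no map-join children.
  set hive.auto.convert.sortmerge.join.noconditionaltask = false;
  explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key;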
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (working copy)
@@ -824,3 +824,1046 @@
POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
2654
+PREHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on a different key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on a different key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME src) c) (= (. (TOK_TABLE_OR_COL c) value) (. (TOK_TABLE_OR_COL a) value)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {value}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col1
+ Position of Big Table: 0
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: string
+ tag: 0
+ c
+ TableScan
+ alias: c
+ Reduce Output Operator
+ key expressions:
+ expr: value
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: value
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+2654
+PREHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on the same key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A SMB join is being followed by a regular join on a non-bucketed table on the same key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME src) c) (= (. (TOK_TABLE_OR_COL c) key) (. (TOK_TABLE_OR_COL a) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-8 is a root stage , consists of Stage-9, Stage-10, Stage-11, Stage-1
+ Stage-9 has a backup stage: Stage-1
+ Stage-5 depends on stages: Stage-9
+ Stage-2 depends on stages: Stage-1, Stage-5, Stage-6, Stage-7
+ Stage-10 has a backup stage: Stage-1
+ Stage-6 depends on stages: Stage-10
+ Stage-11 has a backup stage: Stage-1
+ Stage-7 depends on stages: Stage-11
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-8
+ Conditional Operator
+
+ Stage: Stage-9
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ c
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 0
+ c
+ TableScan
+ alias: c
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 0
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-10
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ c
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 1
+ c
+ TableScan
+ alias: c
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 1
+
+ Stage: Stage-6
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-11
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 2
+ b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 2
+
+ Stage: Stage-7
+ Map Reduce
+ Alias -> Map Operator Tree:
+ c
+ TableScan
+ alias: c
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 1 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ 2 [class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[key]()]
+ Position of Big Table: 2
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Reduce Output Operator
+ key expressions:
+ expr: UDFToDouble(key)
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: UDFToDouble(key)
+ type: double
+ tag: 0
+ b
+ TableScan
+ alias: b
+ Reduce Output Operator
+ key expressions:
+ expr: UDFToDouble(key)
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: UDFToDouble(key)
+ type: double
+ tag: 1
+ c
+ TableScan
+ alias: c
+ Reduce Output Operator
+ key expressions:
+ expr: UDFToDouble(key)
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: UDFToDouble(key)
+ type: double
+ tag: 2
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+2654
+PREHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on the same key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on the same key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME tbl3) c) (= (. (TOK_TABLE_OR_COL c) key) (. (TOK_TABLE_OR_COL a) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-7 is a root stage , consists of Stage-8, Stage-9, Stage-10, Stage-1
+ Stage-8 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-8
+ Stage-9 has a backup stage: Stage-1
+ Stage-5 depends on stages: Stage-9
+ Stage-10 has a backup stage: Stage-1
+ Stage-6 depends on stages: Stage-10
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-7
+ Conditional Operator
+
+ Stage: Stage-8
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ c
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 0
+ c
+ TableScan
+ alias: c
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-9
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ c
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 1
+ c
+ TableScan
+ alias: c
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-10
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 2
+ b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 2
+
+ Stage: Stage-6
+ Map Reduce
+ Alias -> Map Operator Tree:
+ c
+ TableScan
+ alias: c
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 2
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ 2 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+PREHOOK: Input: default@tbl3
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl3
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+2654
+PREHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on a different key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A SMB join is being followed by a regular join on a bucketed table on a different key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME tbl4) c) (= (. (TOK_TABLE_OR_COL c) value) (. (TOK_TABLE_OR_COL a) value)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {value}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col1
+ Position of Big Table: 0
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: string
+ tag: 0
+ c
+ TableScan
+ alias: c
+ Reduce Output Operator
+ key expressions:
+ expr: value
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: value
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+PREHOOK: Input: default@tbl4
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl4
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl3.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl4.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+2654
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (working copy)
@@ -1,10 +1,12 @@
PREHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket
-CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
PREHOOK: type: CREATETABLE
POSTHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket
-CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: default@bucket_small
PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
@@ -337,6 +339,679 @@
#### A masked pattern was here ####
name default.bucket_big
numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_small/ds=2008-04-08 [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
numRows 0
partition_columns ds
rawDataSize 0
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (working copy)
@@ -330,3 +330,500 @@
POSTHOOK: Input: default@bucket_small
#### A masked pattern was here ####
19
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: bucket_big
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big [a]
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: bucket_big
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_small [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: bucket_big
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big [a]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+19
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out (working copy)
@@ -2234,3 +2234,4534 @@
POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
20
+PREHOOK: query: -- The join is being performed as part of a sub-query. It should be converted to a sort-merge join
+explain
+select count(*) from (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The join is being performed as part of a sub-query. It should be converted to a sort-merge join
+explain
+select count(*) from (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+22
+PREHOOK: query: -- The join is being performed as part of a sub-query. It should be converted to a sort-merge join
+-- Add an order by at the end to make the results deterministic.
+explain
+select key, count(*) from
+(
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The join is being performed as part of a sub-query. It should be converted to a sort-merge join
+-- Add an order by at the end to make the results deterministic.
+explain
+select key, count(*) from
+(
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-2 depends on stages: Stage-1, Stage-4, Stage-5
+ Stage-8 has a backup stage: Stage-1
+ Stage-5 depends on stages: Stage-8
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-6
+ Conditional Operator
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-8
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select key, count(*) from
+(
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select key, count(*) from
+(
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0 9
+2 1
+4 1
+5 9
+8 1
+9 1
+PREHOOK: query: -- The join is being performed as part of more than one sub-query. It should be converted to a sort-merge join
+explain
+select count(*) from
+(
+ select key, count(*) from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1
+ group by key
+) subq2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The join is being performed as part of more than one sub-query. It should be converted to a sort-merge join
+explain
+select count(*) from
+(
+ select key, count(*) from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1
+ group by key
+) subq2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-2 depends on stages: Stage-1, Stage-4, Stage-5
+ Stage-8 has a backup stage: Stage-1
+ Stage-5 depends on stages: Stage-8
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-6
+ Conditional Operator
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq2:subq1:b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq2:subq1:b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-8
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq2:subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:subq1:b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from
+(
+ select key, count(*) from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1
+ group by key
+) subq2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from
+(
+ select key, count(*) from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1
+ group by key
+) subq2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6
+PREHOOK: query: -- A join is being performed across different sub-queries, where a join is being performed in each of them.
+-- Each sub-query should be converted to a sort-merge join.
+explain
+select src1.key, src1.cnt1, src2.cnt1 from
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1 group by key
+) src1
+join
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq2 group by key
+) src2
+on src1.key = src2.key
+order by src1.key, src1.cnt1, src2.cnt1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A join is being performed across different sub-queries, where a join is being performed in each of them.
+-- Each sub-query should be converted to a sort-merge join.
+explain
+select src1.key, src1.cnt1, src2.cnt1 from
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1 group by key
+) src1
+join
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq2 group by key
+) src2
+on src1.key = src2.key
+order by src1.key, src1.cnt1, src2.cnt1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) src1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) src2) (= (. (TOK_TABLE_OR_COL src1) key) (. (TOK_TABLE_OR_COL src2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) cnt1)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src2) cnt1))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL src1) key)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL src1) cnt1)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL src2) cnt1)))))
+
+STAGE DEPENDENCIES:
+ Stage-12 is a root stage , consists of Stage-18, Stage-19, Stage-1
+ Stage-18 has a backup stage: Stage-1
+ Stage-10 depends on stages: Stage-18
+ Stage-9 depends on stages: Stage-1, Stage-5, Stage-10, Stage-11, Stage-13, Stage-14 , consists of Stage-16, Stage-17, Stage-2
+ Stage-16 has a backup stage: Stage-2
+ Stage-7 depends on stages: Stage-16
+ Stage-3 depends on stages: Stage-2, Stage-7, Stage-8
+ Stage-17 has a backup stage: Stage-2
+ Stage-8 depends on stages: Stage-17
+ Stage-2
+ Stage-19 has a backup stage: Stage-1
+ Stage-11 depends on stages: Stage-19
+ Stage-1
+ Stage-15 is a root stage , consists of Stage-20, Stage-21, Stage-5
+ Stage-20 has a backup stage: Stage-5
+ Stage-13 depends on stages: Stage-20
+ Stage-21 has a backup stage: Stage-5
+ Stage-14 depends on stages: Stage-21
+ Stage-5
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-12
+ Conditional Operator
+
+ Stage: Stage-18
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ src1:subq1:b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ src1:subq1:b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-10
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src1:subq1:a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-9
+ Conditional Operator
+
+ Stage: Stage-16
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ $INTNAME1
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ $INTNAME1
+ HashTable Sink Operator
+ condition expressions:
+ 0 {_col0} {_col1}
+ 1 {_col1}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+
+ Stage: Stage-7
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {_col0} {_col1}
+ 1 {_col1}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ outputColumnNames: _col0, _col1, _col3
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ sort order: +++
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-17
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ $INTNAME
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ $INTNAME
+ HashTable Sink Operator
+ condition expressions:
+ 0 {_col0} {_col1}
+ 1 {_col1}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+
+ Stage: Stage-8
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME1
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {_col0} {_col1}
+ 1 {_col1}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ outputColumnNames: _col0, _col1, _col3
+ Position of Big Table: 1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ $INTNAME1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: 1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ expr: _col3
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-19
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ src1:subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ src1:subq1:a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-11
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src1:subq1:b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src1:subq1:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-15
+ Conditional Operator
+
+ Stage: Stage-20
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ src2:subq2:b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ src2:subq2:b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-13
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src2:subq2:a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-21
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ src2:subq2:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ src2:subq2:a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-14
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src2:subq2:b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src2:subq2:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select src1.key, src1.cnt1, src2.cnt1 from
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1 group by key
+) src1
+join
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq2 group by key
+) src2
+on src1.key = src2.key
+order by src1.key, src1.cnt1, src2.cnt1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select src1.key, src1.cnt1, src2.cnt1 from
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1 group by key
+) src1
+join
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq2 group by key
+) src2
+on src1.key = src2.key
+order by src1.key, src1.cnt1, src2.cnt1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0 9 9
+2 1 1
+4 1 1
+5 9 9
+8 1 1
+9 1 1
+PREHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should
+-- be converted to a sort-merge join.
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should
+-- be converted to a sort-merge join.
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq2:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+20
+PREHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should
+-- be converted to a sort-merge join, although there is more than one level of sub-query
+explain
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join tbl2 b
+ on subq2.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should
+-- be converted to a sort-merge join, although there is more than one level of sub-query
+explain
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join tbl2 b
+ on subq2.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL subq2) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq2:subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join tbl2 b
+ on subq2.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join tbl2 b
+ on subq2.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+20
+PREHOOK: query: -- Both tables are nested sub-queries, i.e. more than 1 level of sub-query.
+-- The join should be converted to a sort-merge join
+explain
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq3
+ where key < 6
+ ) subq4
+ on subq2.key = subq4.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Both tables are nested sub-queries, i.e. more than 1 level of sub-query.
+-- The join should be converted to a sort-merge join
+explain
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq3
+ where key < 6
+ ) subq4
+ on subq2.key = subq4.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq3)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq4) (= (. (TOK_TABLE_OR_COL subq2) key) (. (TOK_TABLE_OR_COL subq4) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq4:subq3:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq4:subq3:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq2:subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq4:subq3:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq3
+ where key < 6
+ ) subq4
+ on subq2.key = subq4.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+#### A masked pattern was here ####
+Execution failed with exit status: 2
+Obtaining error information
+
+Task failed!
+Task ID:
+ Stage-7
+
+Logs:
+
+#### A masked pattern was here ####
+FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapredLocalTask
+ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask
+POSTHOOK: query: select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq3
+ where key < 6
+ ) subq4
+ on subq2.key = subq4.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+20
+PREHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters and the join key
+-- is not modified, it should be converted to a sort-merge join. Note that the sub-query modifies one
+-- item, but that is not part of the join key.
+explain
+select count(*) from
+ (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1
+ join
+ (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2
+ on subq1.key = subq2.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The subquery itself is being joined. Since the sub-query only contains selects and filters and the join key
+-- is not modified, it should be converted to a sort-merge join. Note that the sub-query modifies one
+-- item, but that is not part of the join key.
+explain
+select count(*) from
+ (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1
+ join
+ (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2
+ on subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL a) value)) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (TOK_FUNCTION concat (. (TOK_TABLE_OR_COL a) value) (. (TOK_TABLE_OR_COL a) value)) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq2:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 8)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 8)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 8)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 8)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 8)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from
+ (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1
+ join
+ (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2
+ on subq1.key = subq2.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from
+ (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1
+ join
+ (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2
+ on subq1.key = subq2.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+20
+PREHOOK: query: -- The left table is a sub-query and the right table is not.
+-- It should be converted to a sort-merge join.
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join tbl2 a on subq1.key = a.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The left table is a sub-query and the right table is not.
+-- It should be converted to a sort-merge join.
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join tbl2 a on subq1.key = a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (TOK_TABREF (TOK_TABNAME tbl2) a) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL a) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join tbl2 a on subq1.key = a.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join tbl2 a on subq1.key = a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+20
+PREHOOK: query: -- The right table is a sub-query and the left table is not.
+-- It should be converted to a sort-merge join.
+explain
+select count(*) from tbl1 a
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq1
+ on a.key = subq1.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The right table is a sub-query and the left table is not.
+-- It should be converted to a sort-merge join.
+explain
+select count(*) from tbl1 a
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq1
+ on a.key = subq1.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL subq1) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[_col0]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from tbl1 a
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq1
+ on a.key = subq1.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from tbl1 a
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq1
+ on a.key = subq1.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+20
+PREHOOK: query: -- There are more than 2 inputs to the join, all of them being sub-queries.
+-- It should be converted to a sort-merge join
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on (subq1.key = subq2.key)
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq3
+ on (subq1.key = subq3.key)
+PREHOOK: type: QUERY
+POSTHOOK: query: -- There are more than 2 inputs to the join, all of them being sub-queries.
+-- It should be converted to a sort-merge join
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on (subq1.key = subq2.key)
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq3
+ on (subq1.key = subq3.key)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key))) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl2) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq3) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq3) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-7 is a root stage , consists of Stage-8, Stage-9, Stage-10, Stage-1
+ Stage-8 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-8
+ Stage-9 has a backup stage: Stage-1
+ Stage-5 depends on stages: Stage-9
+ Stage-10 has a backup stage: Stage-1
+ Stage-6 depends on stages: Stage-10
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-7
+ Conditional Operator
+
+ Stage: Stage-8
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq2:a
+ Fetch Operator
+ limit: -1
+ subq3:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 0
+ subq3:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 0
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-9
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ subq3:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 1
+ subq3:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 1
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-10
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ subq1:a
+ Fetch Operator
+ limit: -1
+ subq2:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 2
+ subq2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 2
+
+ Stage: Stage-6
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq3:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 2
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key < 6)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0
+ 1
+ 2
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[_col0]]
+ 2 [Column[_col0]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq3
+ on (subq1.key = subq3.key)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+Execution failed with exit status: 2
+Obtaining error information
+
+Task failed!
+Task ID:
+ Stage-10
+
+Logs:
+
+#### A masked pattern was here ####
+FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapredLocalTask
+ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask
+POSTHOOK: query: select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq3
+ on (subq1.key = subq3.key)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+56
+PREHOOK: query: -- The join is being performed on a nested sub-query, and an aggregation is performed after that.
+-- The join should be converted to a sort-merge join
+explain
+select count(*) from (
+ select subq2.key as key, subq2.value as value1, b.value as value2 from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+join tbl2 b
+on subq2.key = b.key) a
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The join is being performed on a nested sub-query, and an aggregation is performed after that.
+-- The join should be converted to a sort-merge join
+explain
+select count(*) from (
+ select subq2.key as key, subq2.value as value1, b.value as value2 from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+join tbl2 b
+on subq2.key = b.key) a
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl1) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 8)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 6)))) subq2) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL subq2) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq2) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) value2)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a:b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a:b
+ TableScan
+ alias: b
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a:subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a:subq2:subq1:a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a:subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a:b
+ TableScan
+ alias: b
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a:subq2:subq1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key < 8) and (key < 6))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[_col0]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) from (
+ select subq2.key as key, subq2.value as value1, b.value as value2 from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+join tbl2 b
+on subq2.key = b.key) a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from (
+ select subq2.key as key, subq2.value as value1, b.value as value2 from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+join tbl2 b
+on subq2.key = b.key) a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+20
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (working copy)
@@ -373,3 +373,625 @@
POSTHOOK: Input: default@bucket_small@ds=2008-04-09
#### A masked pattern was here ####
38
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_small/ds=2008-04-08 [b]
+ /bucket_small/ds=2008-04-09 [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out (working copy)
@@ -0,0 +1,776 @@
+PREHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tbl1
+PREHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tbl2
+PREHOOK: query: insert overwrite table tbl1 select * from src where key < 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@tbl1
+POSTHOOK: query: insert overwrite table tbl1 select * from src where key < 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@tbl1
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table tbl2 select * from src where key < 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@tbl2
+POSTHOOK: query: insert overwrite table tbl2 select * from src where key < 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@tbl2
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: CREATE TABLE dest1(k1 int, k2 int)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE dest1(k1 int, k2 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@dest1
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: CREATE TABLE dest2(k1 string, k2 string)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE dest2(k1 string, k2 string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@dest2
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key) key2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) value2)))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key1)) (TOK_SELEXPR (TOK_TABLE_OR_COL key2)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL value1)) (TOK_SELEXPR (TOK_TABLE_OR_COL value2)))))
+
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-0 depends on stages: Stage-2
+ Stage-3 depends on stages: Stage-0
+ Stage-1 depends on stages: Stage-2
+ Stage-4 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: int
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col2
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: string
+ expr: _col3
+ type: string
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 2
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest2
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest1
+
+ Stage: Stage-3
+ Stats-Aggr Operator
+
+ Stage: Stage-1
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest2
+
+ Stage: Stage-4
+ Stats-Aggr Operator
+
+
+PREHOOK: query: from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+PREHOOK: Output: default@dest1
+PREHOOK: Output: default@dest2
+POSTHOOK: query: from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Output: default@dest1
+POSTHOOK: Output: default@dest2
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: select * from dest1 order by k1, k2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dest1 order by k1, k2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+2 2
+4 4
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+8 8
+9 9
+PREHOOK: query: select * from dest2 order by k1, k2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dest2 order by k1, k2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_2 val_2
+val_4 val_4
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_8 val_8
+val_9 val_9
+PREHOOK: query: -- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key) key2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) value2)))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key1)) (TOK_SELEXPR (TOK_TABLE_OR_COL key2)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL value1)) (TOK_SELEXPR (TOK_TABLE_OR_COL value2)))))
+
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-0 depends on stages: Stage-2
+ Stage-3 depends on stages: Stage-0
+ Stage-1 depends on stages: Stage-2
+ Stage-4 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: int
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col2
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: string
+ expr: _col3
+ type: string
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 2
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest2
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest1
+
+ Stage: Stage-3
+ Stats-Aggr Operator
+
+ Stage: Stage-1
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest2
+
+ Stage: Stage-4
+ Stats-Aggr Operator
+
+
+PREHOOK: query: from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+PREHOOK: Output: default@dest1
+PREHOOK: Output: default@dest2
+POSTHOOK: query: from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Output: default@dest1
+POSTHOOK: Output: default@dest2
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: select * from dest1 order by k1, k2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dest1 order by k1, k2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+2 2
+4 4
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+8 8
+9 9
+PREHOOK: query: select * from dest2 order by k1, k2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dest2 order by k1, k2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_2 val_2
+val_4 val_4
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_8 val_8
+val_9 val_9
+PREHOOK: query: -- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+PREHOOK: type: QUERY
+POSTHOOK: query: -- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) value1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key) key2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) value2)))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key1)) (TOK_SELEXPR (TOK_TABLE_OR_COL key2)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL value1)) (TOK_SELEXPR (TOK_TABLE_OR_COL value2)))))
+
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-0 depends on stages: Stage-2
+ Stage-3 depends on stages: Stage-0
+ Stage-1 depends on stages: Stage-2
+ Stage-4 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq:a
+ TableScan
+ alias: a
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Position of Big Table: 0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: int
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col2
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: string
+ expr: _col3
+ type: string
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 2
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest2
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest1
+
+ Stage: Stage-3
+ Stats-Aggr Operator
+
+ Stage: Stage-1
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest2
+
+ Stage: Stage-4
+ Stats-Aggr Operator
+
+
+PREHOOK: query: from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl1
+PREHOOK: Input: default@tbl2
+PREHOOK: Output: default@dest1
+PREHOOK: Output: default@dest2
+POSTHOOK: query: from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl1
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Output: default@dest1
+POSTHOOK: Output: default@dest2
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: select * from dest1 order by k1, k2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dest1 order by k1, k2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+2 2
+4 4
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+5 5
+8 8
+9 9
+PREHOOK: query: select * from dest2 order by k1, k2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dest2 order by k1, k2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k1 SIMPLE [(tbl1)a.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest1.k2 SIMPLE [(tbl2)b.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k1 SIMPLE [(tbl1)a.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: dest2.k2 SIMPLE [(tbl2)b.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_0 val_0
+val_2 val_2
+val_4 val_4
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_5 val_5
+val_8 val_8
+val_9 val_9
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (working copy)
@@ -488,3 +488,775 @@
POSTHOOK: Input: default@bucket_small@ds=2008-04-09
#### A masked pattern was here ####
76
+PREHOOK: query: -- The mapjoin should fail resulting in the sort-merge join
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The mapjoin should fail resulting in the sort-merge join
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_small/ds=2008-04-08 [b]
+ /bucket_small/ds=2008-04-09 [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (working copy)
@@ -361,3 +361,625 @@
POSTHOOK: Input: default@bucket_small@ds=2008-04-09
#### A masked pattern was here ####
38
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 114
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 228
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_small/ds=2008-04-08 [b]
+ /bucket_small/ds=2008-04-09 [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (working copy)
@@ -488,3 +488,773 @@
POSTHOOK: Input: default@bucket_small@ds=2008-04-09
#### A masked pattern was here ####
76
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 452
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_small/ds=2008-04-08 [b]
+ /bucket_small/ds=2008-04-09 [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (revision 1447595)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (working copy)
@@ -261,3 +261,678 @@
POSTHOOK: Input: default@bucket_small@ds=2008-04-08
#### A masked pattern was here ####
38
+PREHOOK: query: -- The mapjoin should fail resulting in the sort-merge join
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The mapjoin should fail resulting in the sort-merge join
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-5 is a root stage , consists of Stage-6, Stage-7, Stage-1
+ Stage-6 has a backup stage: Stage-1
+ Stage-3 depends on stages: Stage-6
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-5
+ Conditional Operator
+
+ Stage: Stage-6
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_small/ds=2008-04-08 [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q (working copy)
@@ -0,0 +1,82 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 1;
+
+CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+
+insert overwrite table tbl1 select * from src where key < 10;
+insert overwrite table tbl2 select * from src where key < 10;
+
+CREATE TABLE dest1(k1 int, k2 int);
+CREATE TABLE dest2(k1 string, k2 string);
+
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+
+set hive.auto.convert.sortmerge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+set hive.auto.convert.join=true;
+
+-- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2;
+
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2;
+
+select * from dest1 order by k1, k2;
+select * from dest2 order by k1, k2;
+
+set hive.auto.convert.join.noconditionaltask=true;
+set hive.auto.convert.join.noconditionaltask.size=200;
+
+-- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2;
+
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2;
+
+select * from dest1 order by k1, k2;
+select * from dest2 order by k1, k2;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+
+-- A SMB join followed by a multi-insert
+explain
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2;
+
+from (
+ SELECT a.key key1, a.value value1, b.key key2, b.value value2
+ FROM tbl1 a JOIN tbl2 b
+ ON a.key = b.key ) subq
+INSERT OVERWRITE TABLE dest1 select key1, key2
+INSERT OVERWRITE TABLE dest2 select value1, value2;
+
+select * from dest1 order by k1, k2;
+select * from dest2 order by k1, k2;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (working copy)
@@ -25,3 +25,7 @@
explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (working copy)
@@ -30,3 +30,7 @@
explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (working copy)
@@ -27,3 +27,7 @@
explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (working copy)
@@ -30,3 +30,11 @@
explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+set hive.mapjoin.localtask.max.memory.usage = 0.0001;
+set hive.mapjoin.check.memory.rows = 2;
+
+-- The mapjoin should fail resulting in the sort-merge join
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (working copy)
@@ -1,6 +1,7 @@
-- small 1 part, 2 bucket & big 2 part, 4 bucket
-CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
@@ -28,3 +29,7 @@
explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (working copy)
@@ -21,3 +21,8 @@
explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.convert.join=true;
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q (working copy)
@@ -281,3 +281,256 @@
) subq2
join tbl2 b
on subq2.key = b.key) a;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+
+-- The join is being performed as part of sub-query. It should be converted to a sort-merge join
+explain
+select count(*) from (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1;
+
+select count(*) from (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1;
+
+-- The join is being performed as part of sub-query. It should be converted to a sort-merge join
+-- Add a order by at the end to make the results deterministic.
+explain
+select key, count(*) from
+(
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key;
+
+select key, count(*) from
+(
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key;
+
+-- The join is being performed as part of more than one sub-query. It should be converted to a sort-merge join
+explain
+select count(*) from
+(
+ select key, count(*) from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1
+ group by key
+) subq2;
+
+select count(*) from
+(
+ select key, count(*) from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1
+ group by key
+) subq2;
+
+-- A join is being performed across different sub-queries, where a join is being performed in each of them.
+-- Each sub-query should be converted to a sort-merge join.
+explain
+select src1.key, src1.cnt1, src2.cnt1 from
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1 group by key
+) src1
+join
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq2 group by key
+) src2
+on src1.key = src2.key
+order by src1.key, src1.cnt1, src2.cnt1;
+
+select src1.key, src1.cnt1, src2.cnt1 from
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq1 group by key
+) src1
+join
+(
+ select key, count(*) as cnt1 from
+ (
+ select a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+ ) subq2 group by key
+) src2
+on src1.key = src2.key
+order by src1.key, src1.cnt1, src2.cnt1;
+
+-- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should
+-- be converted to a sort-merge join.
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key;
+
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key;
+
+-- The subquery itself is being joined. Since the sub-query only contains selects and filters, it should
+-- be converted to a sort-merge join, although there is more than one level of sub-query
+explain
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join tbl2 b
+ on subq2.key = b.key;
+
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join tbl2 b
+ on subq2.key = b.key;
+
+-- Both the tables are nested sub-queries i.e. more than 1 level of sub-query.
+-- The join should be converted to a sort-merge join
+explain
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq3
+ where key < 6
+ ) subq4
+ on subq2.key = subq4.key;
+
+select count(*) from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+ join
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq3
+ where key < 6
+ ) subq4
+ on subq2.key = subq4.key;
+
+-- The subquery itself is being joined. Since the sub-query only contains selects and filters and the join key
+-- is not getting modified, it should be converted to a sort-merge join. Note that the sub-query modifies one
+-- item, but that is not part of the join key.
+explain
+select count(*) from
+ (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1
+ join
+ (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2
+ on subq1.key = subq2.key;
+
+select count(*) from
+ (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 8) subq1
+ join
+ (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 8) subq2
+ on subq1.key = subq2.key;
+
+-- The left table is a sub-query and the right table is not.
+-- It should be converted to a sort-merge join.
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join tbl2 a on subq1.key = a.key;
+
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join tbl2 a on subq1.key = a.key;
+
+-- The right table is a sub-query and the left table is not.
+-- It should be converted to a sort-merge join.
+explain
+select count(*) from tbl1 a
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq1
+ on a.key = subq1.key;
+
+select count(*) from tbl1 a
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq1
+ on a.key = subq1.key;
+
+-- There are more than 2 inputs to the join, all of them being sub-queries.
+-- It should be converted to a sort-merge join
+explain
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on (subq1.key = subq2.key)
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq3
+ on (subq1.key = subq3.key);
+
+select count(*) from
+ (select a.key as key, a.value as value from tbl1 a where key < 6) subq1
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq2
+ on subq1.key = subq2.key
+ join
+ (select a.key as key, a.value as value from tbl2 a where key < 6) subq3
+ on (subq1.key = subq3.key);
+
+-- The join is being performed on a nested sub-query, and an aggregation is performed after that.
+-- The join should be converted to a sort-merge join
+explain
+select count(*) from (
+ select subq2.key as key, subq2.value as value1, b.value as value2 from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+join tbl2 b
+on subq2.key = b.key) a;
+
+select count(*) from (
+ select subq2.key as key, subq2.value as value1, b.value as value2 from
+ (
+ select * from
+ (
+ select a.key as key, a.value as value from tbl1 a where key < 8
+ ) subq1
+ where key < 6
+ ) subq2
+join tbl2 b
+on subq2.key = b.key) a;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (working copy)
@@ -22,3 +22,11 @@
-- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly
explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+set hive.mapjoin.localtask.max.memory.usage = 0.0001;
+set hive.mapjoin.check.memory.rows = 2;
+
+-- The mapjoin should fail resulting in the sort-merge join
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (revision 1447595)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (working copy)
@@ -34,3 +34,21 @@
-- A SMB join is being followed by a regular join on a bucketed table on a different key
explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value;
select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value;
+
+set hive.auto.convert.sortmerge.join.noconditionaltask=false;
+
+-- A SMB join is being followed by a regular join on a non-bucketed table on a different key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value;
+select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.value = a.value;
+
+-- A SMB join is being followed by a regular join on a non-bucketed table on the same key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key;
+select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join src c on c.key = a.key;
+
+-- A SMB join is being followed by a regular join on a bucketed table on the same key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key;
+select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl3 c on c.key = a.key;
+
+-- A SMB join is being followed by a regular join on a bucketed table on a different key
+explain select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value;
+select count(*) FROM tbl1 a JOIN tbl2 b ON a.key = b.key join tbl4 c on c.value = a.value;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (working copy)
@@ -44,6 +44,7 @@
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.ScriptOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
@@ -74,6 +75,7 @@
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
@@ -220,14 +222,20 @@
}
public static String genMapJoinOpAndLocalWork(MapredWork newWork, JoinOperator op, int mapJoinPos)
- throws SemanticException {
+ throws SemanticException {
+ LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap =
+ newWork.getOpParseCtxMap();
+ QBJoinTree newJoinTree = newWork.getJoinTree();
+ // generate the map join operator; already checked the map join
+ MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(opParseCtxMap, op,
+ newJoinTree, mapJoinPos, true, false);
+ return genMapJoinOpAndLocalWork(newWork, newMapJoinOp, mapJoinPos);
+ }
+
+ public static String genMapJoinOpAndLocalWork(MapredWork newWork, MapJoinOperator newMapJoinOp,
+ int mapJoinPos)
+ throws SemanticException {
try {
- LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap =
- newWork.getOpParseCtxMap();
- QBJoinTree newJoinTree = newWork.getJoinTree();
- // generate the map join operator; already checked the map join
- MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(opParseCtxMap, op,
- newJoinTree, mapJoinPos, true, false);
// generate the local work and return the big table alias
String bigTableAlias = MapJoinProcessor
.genMapJoinLocalWork(newWork, newMapJoinOp, mapJoinPos);
@@ -502,6 +510,64 @@
return mapJoinOp;
}
+ /**
+ * Convert a sort-merge join to a map-side join.
+ *
+ * @param opParseCtxMap
+ * @param smbJoinOp
+ * join operator
+ * @param joinTree
+ * qb join tree
+ * @param mapJoinPos
+ * position of the source to be read as part of the map-reduce framework. All other sources
+ * are cached in memory
+ * @param noCheckOuterJoin
+ */
+ public static MapJoinOperator convertSMBJoinToMapJoin(
+ Map<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap,
+ SMBMapJoinOperator smbJoinOp, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin)
+ throws SemanticException {
+ // Create a new map join operator
+ SMBJoinDesc smbJoinDesc = smbJoinOp.getConf();
+ List<ExprNodeDesc> keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0));
+ TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(PlanUtils
+ .getFieldSchemasFromColumnList(keyCols, "mapjoinkey"));
+ MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(),
+ keyTableDesc, smbJoinDesc.getExprs(),
+ smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(),
+ smbJoinDesc.getOutputColumnNames(),
+ mapJoinPos, smbJoinDesc.getConds(),
+ smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix());
+
+ RowResolver joinRS = opParseCtxMap.get(smbJoinOp).getRowResolver();
+ MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
+ mapJoinDesc, joinRS.getRowSchema(),
+ new ArrayList<Operator<? extends OperatorDesc>>());
+
+ OpParseContext ctx = new OpParseContext(joinRS);
+ opParseCtxMap.put(mapJoinOp, ctx);
+
+ // change the children of the original join operator to point to the map
+ // join operator
+ List<Operator<? extends OperatorDesc>> childOps = smbJoinOp.getChildOperators();
+ for (Operator<? extends OperatorDesc> childOp : childOps) {
+ childOp.replaceParent(smbJoinOp, mapJoinOp);
+ }
+ mapJoinOp.setChildOperators(childOps);
+ smbJoinOp.setChildOperators(null);
+
+ // change the parent of the original SMBjoin operator to point to the map
+ // join operator
+ List<Operator<? extends OperatorDesc>> parentOps = smbJoinOp.getParentOperators();
+ for (Operator<? extends OperatorDesc> parentOp : parentOps) {
+ parentOp.replaceChild(smbJoinOp, mapJoinOp);
+ }
+ mapJoinOp.setParentOperators(parentOps);
+ smbJoinOp.setParentOperators(null);
+
+ return mapJoinOp;
+ }
+
public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator op,
QBJoinTree joinTree, int mapJoinPos) throws SemanticException {
HiveConf hiveConf = pctx.getConf();
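
For orientation, the two helpers added above are meant to be composed by a physical-plan dispatcher: convertSMBJoinToMapJoin() splices a MapJoinOperator in place of the SMBMapJoinOperator, and the new genMapJoinOpAndLocalWork() overload then generates the local work for it. The sketch below is hypothetical and not part of this patch; the helper class, its method name, and the assumption that the MapredWork has already been cloned are for illustration only.

// Hypothetical sketch of a caller composing the new MapJoinProcessor helpers.
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor;
import org.apache.hadoop.hive.ql.parse.OpParseContext;
import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

public final class SmbToMapJoinSketch {
  private SmbToMapJoinSketch() {
  }

  // Rewrites the SMB join of a (cloned) MapredWork into a map join that streams the
  // table at bigTablePos, then generates the local work and returns the big table alias.
  public static String convert(MapredWork clonedWork, SMBMapJoinOperator smbJoinOp,
      int bigTablePos) throws SemanticException {
    Map<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap =
        clonedWork.getOpParseCtxMap();
    QBJoinTree joinTree = clonedWork.getJoinTree();
    MapJoinOperator mapJoinOp = MapJoinProcessor.convertSMBJoinToMapJoin(
        opParseCtxMap, smbJoinOp, joinTree, bigTablePos, true /* noCheckOuterJoin */);
    return MapJoinProcessor.genMapJoinOpAndLocalWork(clonedWork, mapJoinOp, bigTablePos);
  }
}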
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (working copy)
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.physical;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+/*
+ * If a join has been automatically converted into a sort-merge join, create a conditional
+ * task to try a map-side join with each table as the big table. It is similar to
+ * hive.auto.convert.join, but is only applicable to joins which have been automatically
+ * converted to sort-merge joins. For hive.auto.convert.join, the backup task is the
+ * map-reduce join, whereas here, the backup task is the sort-merge join.
+ *
+ * Depending on the inputs, a sort-merge join may be faster or slower than the map-side join.
+ * The other advantage of a sort-merge join is that its output is also bucketed and sorted.
+ * However, if a very big table (say 1TB with 10 buckets) is joined with a very small table
+ * (say 10MB with 10 buckets), the sort-merge join may perform slower since it is restricted
+ * to 10 mappers.
+ */
+public class SortMergeJoinResolver implements PhysicalPlanResolver {
+ @Override
+ public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
+
+ // create dispatcher and graph walker
+ Dispatcher disp = new SortMergeJoinTaskDispatcher(pctx);
+ TaskGraphWalker ogw = new TaskGraphWalker(disp);
+
+ // get all the tasks nodes from root task
+ List<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pctx.rootTasks);
+
+ // begin to walk through the task tree.
+ ogw.startWalking(topNodes, null);
+ return pctx;
+ }
+}
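
As a rough illustration of where this resolver sits, the sketch below shows a PhysicalPlanResolver being applied as one pass over the PhysicalContext. It is hypothetical and not part of this patch: the driver class and the decision of when to add the resolver (for example based on hive.auto.convert.sortmerge.join.noconditionaltask) are assumptions, and the real wiring lives in the physical optimizer.

package org.apache.hadoop.hive.ql.optimizer.physical;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.parse.SemanticException;

// Hypothetical sketch (not part of this patch): applying SortMergeJoinResolver as one
// pass of a resolver chain over the physical plan.
public final class ResolverChainSketch {
  private ResolverChainSketch() {
  }

  public static PhysicalContext apply(PhysicalContext pctx) throws SemanticException {
    List<PhysicalPlanResolver> resolvers = new ArrayList<PhysicalPlanResolver>();
    // The resolver walks the task tree; for each sort-merge join task it builds a
    // conditional task that tries a map join with each table as the big table.
    resolvers.add(new SortMergeJoinResolver());
    for (PhysicalPlanResolver resolver : resolvers) {
      pctx = resolver.resolve(pctx);
    }
    return pctx;
  }
}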
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java (working copy)
@@ -0,0 +1,191 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.physical;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.exec.ConditionalTask;
+import org.apache.hadoop.hive.ql.exec.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.MapredWork;
+
+/**
+ * Common iteration methods for converting joins and sort-merge joins.
+ */
+public abstract class AbstractJoinTaskDispatcher implements Dispatcher {
+
+ protected final PhysicalContext physicalContext;
+
+ public AbstractJoinTaskDispatcher(PhysicalContext context) {
+ physicalContext = context;
+ }
+
+ public abstract Task<? extends Serializable> processCurrentTask(MapRedTask currTask,
+ ConditionalTask conditionalTask, Context context)
+ throws SemanticException;
+
+ protected void replaceTaskWithConditionalTask(
+ Task<? extends Serializable> currTask, ConditionalTask cndTsk,
+ PhysicalContext physicalContext) {
+ // add this task into the task tree
+ // set all parent tasks
+ List<Task<? extends Serializable>> parentTasks = currTask.getParentTasks();
+ currTask.setParentTasks(null);
+ if (parentTasks != null) {
+ for (Task<? extends Serializable> tsk : parentTasks) {
+ // make the newly generated task depend on all the parent tasks of the current task.
+ tsk.addDependentTask(cndTsk);
+ // remove the current task from its original parent's dependent tasks
+ tsk.removeDependentTask(currTask);
+ }
+ } else {
+ // remove from current root task and add conditional task to root tasks
+ physicalContext.removeFromRootTask(currTask);
+ physicalContext.addToRootTask(cndTsk);
+ }
+ // set all child tasks
+ List<Task<? extends Serializable>> oldChildTasks = currTask.getChildTasks();
+ if (oldChildTasks != null) {
+ for (Task<? extends Serializable> tsk : cndTsk.getListTasks()) {
+ if (tsk.equals(currTask)) {
+ continue;
+ }
+ for (Task<? extends Serializable> oldChild : oldChildTasks) {
+ tsk.addDependentTask(oldChild);
+ }
+ }
+ }
+ }
+
+ // Replace the task with the new task. Copy the children and parents of the old
+ // task to the new task.
+ protected void replaceTask(
+ Task extends Serializable> currTask, Task extends Serializable> newTask,
+ PhysicalContext physicalContext) {
+ // add this task into task tree
+ // set all parent tasks
+ List<Task<? extends Serializable>> parentTasks = currTask.getParentTasks();
+ currTask.setParentTasks(null);
+ if (parentTasks != null) {
+ for (Task<? extends Serializable> tsk : parentTasks) {
+ // make new generated task depends on all the parent tasks of current task.
+ tsk.addDependentTask(newTask);
+ // remove the current task from its original parent task's dependent task
+ tsk.removeDependentTask(currTask);
+ }
+ } else {
+ // remove from current root task and add conditional task to root tasks
+ physicalContext.removeFromRootTask(currTask);
+ physicalContext.addToRootTask(newTask);
+ }
+
+ // set all child tasks
+ List<Task<? extends Serializable>> oldChildTasks = currTask.getChildTasks();
+ currTask.setChildTasks(null);
+ if (oldChildTasks != null) {
+ for (Task<? extends Serializable> tsk : oldChildTasks) {
+ // make new generated task depends on all the parent tasks of current task.
+ newTask.addDependentTask(tsk);
+ // remove the current task from its original parent task's dependent task
+ tsk.getParentTasks().remove(currTask);
+ }
+ }
+ }
+
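+ /**
+ * Walks all input paths of the given work and returns a pair of (alias -> known input size,
+ * total known input size accumulated over all aliases).
+ */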
+ public ObjectPair<HashMap<String, Long>, Long>
+ getAliasToSize(Context context, MapredWork currWork,
+ Map<String, ArrayList<String>> pathToAliases) throws SemanticException {
+ HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
+
+ try {
+ // go over all the input paths, and calculate a known total size, known
+ // size for each input alias.
+ Utilities.getInputSummary(context, currWork, null).getLength();
+
+ // set alias to size mapping, this can be used to determine if one table
+ // is chosen as big table, what's the total size of left tables, which
+ // are going to be small tables.
+ Long aliasTotalKnownInputSize = 0L;
+ for (Map.Entry<String, ArrayList<String>> entry : pathToAliases.entrySet()) {
+ String path = entry.getKey();
+ List<String> aliasList = entry.getValue();
+ ContentSummary cs = context.getCS(path);
+ if (cs != null) {
+ long size = cs.getLength();
+ for (String alias : aliasList) {
+ aliasTotalKnownInputSize += size;
+ Long es = aliasToSize.get(alias);
+ if (es == null) {
+ es = new Long(0);
+ }
+ es += size;
+ aliasToSize.put(alias, es);
+ }
+ }
+ }
+ return new ObjectPair<HashMap<String, Long>, Long>(aliasToSize, aliasTotalKnownInputSize);
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
+ }
+ }
+
+ @Override
+ public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs)
+ throws SemanticException {
+ if (nodeOutputs == null || nodeOutputs.length == 0) {
+ throw new SemanticException("No Dispatch Context");
+ }
+
+ TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0];
+
+ Task<? extends Serializable> currTask = (Task<? extends Serializable>) nd;
+ // not map reduce task or not conditional task, just skip
+ if (currTask.isMapRedTask()) {
+ if (currTask instanceof ConditionalTask) {
+ // get the list of task
+ List<Task<? extends Serializable>> taskList = ((ConditionalTask) currTask).getListTasks();
+ for (Task<? extends Serializable> tsk : taskList) {
+ if (tsk.isMapRedTask()) {
+ Task<? extends Serializable> newTask = this.processCurrentTask((MapRedTask) tsk,
+ ((ConditionalTask) currTask), physicalContext.getContext());
+ walkerCtx.addToDispatchList(newTask);
+ }
+ }
+ } else {
+ Task<? extends Serializable> newTask =
+ this.processCurrentTask((MapRedTask) currTask, null, physicalContext.getContext());
+ walkerCtx.addToDispatchList(newTask);
+ }
+ }
+ return null;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (working copy)
@@ -0,0 +1,457 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.physical;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.exec.ConditionalTask;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx;
+import org.apache.hadoop.hive.ql.plan.ConditionalWork;
+import org.apache.hadoop.hive.ql.plan.JoinDesc;
+import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
+import org.apache.hadoop.hive.ql.plan.MapredWork;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+
+/*
+ * Convert tasks involving JOIN into MAPJOIN.
+ * If hive.auto.convert.join is true, the tasks involving join are converted.
+ * Consider the query:
+ * select .... from T1 join T2 on T1.key = T2.key join T3 on T1.key = T3.key
+ *
+ * There is a map-reduce task which performs a 3-way join (T1, T2, T3).
+ * The task would be converted to a conditional task which would have 4 children
+ * a. Mapjoin considering T1 as the big table
+ * b. Mapjoin considering T2 as the big table
+ * c. Mapjoin considering T3 as the big table
+ * d. Map-reduce join (the original task).
+ *
+ * Note that the sizes of all the inputs may not be available at compile time. At runtime, it is
+ * determined which branch we want to pick up from the above.
+ *
+ * However, if hive.auto.convert.join.noconditionaltask is set to true, and
+ * the sum of any n-1 tables is smaller than hive.auto.convert.join.noconditionaltask.size,
+ * then a mapjoin is created instead of the conditional task. For the above, if the size of
+ * T1 + T2 is less than the threshold, then the task is converted to a mapjoin task with T3 as
+ * the big table.
+ *
+ * In this case, further optimization is performed by merging 2 consecutive map-only jobs.
+ * Consider the query:
+ * select ... from T1 join T2 on T1.key1 = T2.key1 join T3 on T1.key2 = T3.key2
+ *
+ * Initially, the plan would consist of 2 Map-reduce jobs (1 to perform join for T1 and T2)
+ * followed by another map-reduce job (to perform join of the result with T3). After the
+ * optimization, both these tasks would be converted to map-only tasks. These 2 map-only jobs
+ * are then merged into a single map-only job. As a followup (HIVE-3952), it would be possible to
+ * merge a map-only task with a map-reduce task.
+ * Consider the query:
+ * select T1.key2, count(*) from T1 join T2 on T1.key1 = T2.key1 group by T1.key2;
+ * Initially, the plan would consist of 2 Map-reduce jobs (1 to perform join for T1 and T2)
+ * followed by another map-reduce job (to perform groupby of the result). After the
+ * optimization, the join task would be converted to map-only tasks. After HIVE-3952, the map-only
+ * task would be merged with the map-reduce task to create a single map-reduce task.
+ */
+
+/**
+ * Iterate over each task. If a task has local work, create a new task (a MapredLocalTask) for
+ * that local work, make the newly generated task depend on the current task's parent tasks, and
+ * make the current task depend on the newly generated task.
+ */
+public class CommonJoinTaskDispatcher extends AbstractJoinTaskDispatcher implements Dispatcher {
+
+ public CommonJoinTaskDispatcher(PhysicalContext context) {
+ super(context);
+ }
+
+ // Get the position of the big table for this join operator and the given alias
+ private int getPosition(MapredWork work, Operator<? extends OperatorDesc> joinOp,
+ String alias) {
+ Operator<? extends OperatorDesc> parentOp = work.getAliasToWork().get(alias);
+
+ // The ReduceSinkOperator's child is null in the map-side plan, and the JoinOperator's parents are the ReduceSinkOperators
+ while ((parentOp.getChildOperators() != null) &&
+ (!parentOp.getChildOperators().isEmpty())) {
+ parentOp = parentOp.getChildOperators().get(0);
+ }
+
+ return joinOp.getParentOperators().indexOf(parentOp);
+ }
+
+ /*
+ * A task and its child task have been converted from join to mapjoin.
+ * See if the two tasks can be merged.
+ */
+ private void mergeMapJoinTaskWithChildMapJoinTask(MapRedTask task) {
+ MapRedTask childTask = (MapRedTask) task.getChildTasks().get(0);
+ MapredWork work = task.getWork();
+ MapredLocalWork localWork = work.getMapLocalWork();
+ MapredWork childWork = childTask.getWork();
+ MapredLocalWork childLocalWork = childWork.getMapLocalWork();
+
+ // Can this be merged
+ Map<String, Operator<? extends OperatorDesc>> aliasToWork = work.getAliasToWork();
+ if (aliasToWork.size() > 1) {
+ return;
+ }
+
+ Operator<? extends OperatorDesc> op = aliasToWork.values().iterator().next();
+ while (op.getChildOperators() != null) {
+ // Don't perform this optimization for multi-table inserts
+ if (op.getChildOperators().size() > 1) {
+ return;
+ }
+ op = op.getChildOperators().get(0);
+ }
+
+ if (!(op instanceof FileSinkOperator)) {
+ return;
+ }
+
+ FileSinkOperator fop = (FileSinkOperator) op;
+ String workDir = fop.getConf().getDirName();
+
+ Map<String, ArrayList<String>> childPathToAliases = childWork.getPathToAliases();
+ if (childPathToAliases.size() > 1) {
+ return;
+ }
+
+ // The filesink writes to a different directory
+ if (!childPathToAliases.keySet().iterator().next().equals(workDir)) {
+ return;
+ }
+
+ // Neither of them should be bucketed
+ if ((localWork.getBucketMapjoinContext() != null) ||
+ (childLocalWork.getBucketMapjoinContext() != null)) {
+ return;
+ }
+
+ // Merge the trees
+ if (childWork.getAliasToWork().size() > 1) {
+ return;
+ }
+
+ Operator<? extends Serializable> childAliasOp =
+ childWork.getAliasToWork().values().iterator().next();
+ if (fop.getParentOperators().size() > 1) {
+ return;
+ }
+
+ // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the
+ // top of the second
+ Operator<? extends Serializable> parentFOp = fop.getParentOperators().get(0);
+ parentFOp.getChildOperators().remove(fop);
+ parentFOp.getChildOperators().add(childAliasOp);
+ List<Operator<? extends Serializable>> parentOps =
+ new ArrayList<Operator<? extends Serializable>>();
+ parentOps.add(parentFOp);
+ childAliasOp.setParentOperators(parentOps);
+
+ work.getAliasToPartnInfo().putAll(childWork.getAliasToPartnInfo());
+ for (Map.Entry<String, PartitionDesc> childWorkEntry : childWork.getPathToPartitionInfo()
+ .entrySet()) {
+ if (childWork.getAliasToPartnInfo().containsValue(childWorkEntry.getKey())) {
+ work.getPathToPartitionInfo().put(childWorkEntry.getKey(), childWorkEntry.getValue());
+ }
+ }
+
+ localWork.getAliasToFetchWork().putAll(childLocalWork.getAliasToFetchWork());
+ localWork.getAliasToWork().putAll(childLocalWork.getAliasToWork());
+
+ // remove the child task
+ List<Task<? extends Serializable>> oldChildTasks = childTask.getChildTasks();
+ task.setChildTasks(oldChildTasks);
+ if (oldChildTasks != null) {
+ for (Task<? extends Serializable> oldChildTask : oldChildTasks) {
+ oldChildTask.getParentTasks().remove(childTask);
+ oldChildTask.getParentTasks().add(task);
+ }
+ }
+ }
+
+ // create map join task and set big table as bigTablePosition
+ private ObjectPair<MapRedTask, String> convertTaskToMapJoinTask(String xml,
+ int bigTablePosition) throws UnsupportedEncodingException, SemanticException {
+ // deep copy a new mapred work from xml
+ InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8"));
+ MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf());
+ // create a mapred task for this work
+ MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
+ .getParseContext().getConf());
+ JoinOperator newJoinOp = getJoinOp(newTask);
+
+ // optimize this newWork given the big table position
+ String bigTableAlias =
+ MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newJoinOp, bigTablePosition);
+ return new ObjectPair<MapRedTask, String>(newTask, bigTableAlias);
+ }
+
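+ /**
+ * Convert the current common join task. Depending on the known input sizes, either replace the
+ * task with a single map-join task (no backup task), or wrap it in a conditional task whose
+ * children are one map-join task per big-table candidate plus the original map-reduce join as
+ * the last alternative and the backup.
+ */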
+ @Override
+ public Task<? extends Serializable> processCurrentTask(MapRedTask currTask,
+ ConditionalTask conditionalTask, Context context)
+ throws SemanticException {
+
+ // whether it contains common join op; if contains, return this common join op
+ JoinOperator joinOp = getJoinOp(currTask);
+ if (joinOp == null) {
+ return null;
+ }
+ currTask.setTaskTag(Task.COMMON_JOIN);
+
+ MapredWork currWork = currTask.getWork();
+
+ // create conditional work list and task list
+ List<Serializable> listWorks = new ArrayList<Serializable>();
+ List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
+
+ // create alias to task mapping and alias to input file mapping for resolver
+ HashMap<String, Task<? extends Serializable>> aliasToTask =
+ new HashMap<String, Task<? extends Serializable>>();
+ HashMap<String, ArrayList<String>> pathToAliases = currWork.getPathToAliases();
+ Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();
+
+ // get parseCtx for this Join Operator
+ ParseContext parseCtx = physicalContext.getParseContext();
+ QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp);
+
+ // start to generate multiple map join tasks
+ JoinDesc joinDesc = joinOp.getConf();
+ Byte[] order = joinDesc.getTagOrder();
+ int numAliases = order.length;
+
+ HashMap<String, Long> aliasToSize;
+
+ try {
+ ObjectPair<HashMap<String, Long>, Long> aliasToSizePair =
+ getAliasToSize(context, currWork, pathToAliases);
+ long aliasTotalKnownInputSize = aliasToSizePair.getSecond();
+ aliasToSize = aliasToSizePair.getFirst();
+
+ HashSet<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc
+ .getConds());
+
+ // no table could be the big table; there is no need to convert
+ if (bigTableCandidates == null) {
+ return null;
+ }
+
+ Configuration conf = context.getConf();
+
+ // If the sizes of at least n-1 tables in an n-way join are known, and their sum is smaller than
+ // the threshold size, convert the join into a map-join and don't create a conditional task
+ boolean convertJoinMapJoin = HiveConf.getBoolVar(conf,
+ HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
+ int bigTablePosition = -1;
+ if (convertJoinMapJoin) {
+ // This is the threshold that the user has specified to fit in mapjoin
+ long mapJoinSize = HiveConf.getLongVar(conf,
+ HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
+
+ boolean bigTableFound = false;
+ long largestTableSize = 0;
+ long sumTableSizes = 0;
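+ // Greedily pick the big table: track the largest known input and the running sum of the
+ // remaining inputs. An alias with an unknown size (or a size above the threshold) can only
+ // be the big table, and at most one such alias may exist; if the sum of the small tables
+ // ever exceeds the threshold, give up on the unconditional conversion.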
+ for (String alias : aliasToWork.keySet()) {
+ Long size = aliasToSize.get(alias);
+ // The size is not available at compile time if the input is a sub-query.
+ // If the sizes of at least n-1 inputs for an n-way join are available at compile time,
+ // and the sum of them is less than the specified threshold, then convert the join
+ // into a map-join without the conditional task.
+ if ((size == null) || (size > mapJoinSize)) {
+ sumTableSizes += largestTableSize;
+ if (bigTableFound || (sumTableSizes > mapJoinSize)) {
+ convertJoinMapJoin = false;
+ break;
+ }
+ bigTableFound = true;
+ bigTablePosition = getPosition(currWork, joinOp, alias);
+ largestTableSize = mapJoinSize + 1;
+ } else {
+ if (size > largestTableSize) {
+ sumTableSizes += largestTableSize;
+ largestTableSize = size;
+ bigTablePosition = getPosition(currWork, joinOp, alias);
+ } else {
+ sumTableSizes += size;
+ }
+ if (sumTableSizes > mapJoinSize) {
+ convertJoinMapJoin = false;
+ break;
+ }
+ }
+ }
+ }
+
+ String bigTableAlias = null;
+ currWork.setOpParseCtxMap(parseCtx.getOpParseCtx());
+ currWork.setJoinTree(joinTree);
+ String xml = currWork.toXML();
+
+ if (convertJoinMapJoin) {
+ // create map join task and set big table as bigTablePosition
+ MapRedTask newTask = convertTaskToMapJoinTask(xml, bigTablePosition).getFirst();
+
+ newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
+ replaceTask(currTask, newTask, physicalContext);
+
+ // Can this task be merged with the child task. This can happen if a big table is being
+ // joined with multiple small tables on different keys
+ // Further optimizations are possible here, a join which has been converted to a mapjoin
+ // followed by a mapjoin can be performed in a single MR job.
+ if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)
+ && (newTask.getChildTasks().get(0).getTaskTag() == Task.MAPJOIN_ONLY_NOBACKUP)) {
+ mergeMapJoinTaskWithChildMapJoinTask(newTask);
+ }
+
+ return newTask;
+ }
+
+ long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf,
+ HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
+ for (int i = 0; i < numAliases; i++) {
+ // this table cannot be big table
+ if (!bigTableCandidates.contains(i)) {
+ continue;
+ }
+
+ // create map join task and set big table as i
+ ObjectPair<MapRedTask, String> newTaskAlias = convertTaskToMapJoinTask(xml, i);
+ MapRedTask newTask = newTaskAlias.getFirst();
+ bigTableAlias = newTaskAlias.getSecond();
+
+ Long aliasKnownSize = aliasToSize.get(bigTableAlias);
+ if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) {
+ long smallTblTotalKnownSize = aliasTotalKnownInputSize
+ - aliasKnownSize.longValue();
+ if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
+ // this table is not good to be a big table.
+ continue;
+ }
+ }
+
+ // add into conditional task
+ listWorks.add(newTask.getWork());
+ listTasks.add(newTask);
+ newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
+
+ // set up backup task
+ newTask.setBackupTask(currTask);
+ newTask.setBackupChildrenTasks(currTask.getChildTasks());
+
+ // put the mapping alias to task
+ aliasToTask.put(bigTableAlias, newTask);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
+ }
+
+ // insert current common join task to conditional task
+ listWorks.add(currTask.getWork());
+ listTasks.add(currTask);
+ // clear JoinTree and OP Parse Context
+ currWork.setOpParseCtxMap(null);
+ currWork.setJoinTree(null);
+
+ // create conditional task and insert conditional task into task tree
+ ConditionalWork cndWork = new ConditionalWork(listWorks);
+ ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
+ cndTsk.setListTasks(listTasks);
+
+ // set resolver and resolver context
+ cndTsk.setResolver(new ConditionalResolverCommonJoin());
+ ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
+ resolverCtx.setPathToAliases(pathToAliases);
+ resolverCtx.setAliasToKnownSize(aliasToSize);
+ resolverCtx.setAliasToTask(aliasToTask);
+ resolverCtx.setCommonJoinTask(currTask);
+ resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
+ resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
+ cndTsk.setResolverCtx(resolverCtx);
+
+ // replace the current task with the new generated conditional task
+ replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext);
+ return cndTsk;
+ }
+
+ /*
+ * If any operator which does not allow map-side conversion is present in the mapper, don't
+ * convert it into a conditional task.
+ */
+ private boolean checkOperatorOKMapJoinConversion(Operator<? extends OperatorDesc> op) {
+ if (!op.opAllowedConvertMapJoin()) {
+ return false;
+ }
+
+ if (op.getChildOperators() == null) {
+ return true;
+ }
+
+ for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) {
+ if (!checkOperatorOKMapJoinConversion(childOp)) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ private JoinOperator getJoinOp(MapRedTask task) throws SemanticException {
+ MapredWork work = task.getWork();
+ if (work == null) {
+ return null;
+ }
+ Operator<? extends OperatorDesc> reducerOp = work.getReducer();
+ if (reducerOp instanceof JoinOperator) {
+ /* Is any operator present, which prevents the conversion */
+ Map<String, Operator<? extends OperatorDesc>> aliasToWork = work.getAliasToWork();
+ for (Operator<? extends OperatorDesc> op : aliasToWork.values()) {
+ if (!checkOperatorOKMapJoinConversion(op)) {
+ return null;
+ }
+ }
+ return (JoinOperator) reducerOp;
+ } else {
+ return null;
+ }
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (working copy)
@@ -51,7 +51,15 @@
}
if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) {
resolvers.add(new CommonJoinResolver());
+
+ // The joins have been automatically converted to map-joins.
+ // However, if the joins were converted to sort-merge joins automatically,
+ // they should also be tried as map-joins.
+ if (!hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_NOCONDITIONALTASK)) {
+ resolvers.add(new SortMergeJoinResolver());
+ }
}
+
if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER)) {
resolvers.add(new IndexWhereResolver());
}
@@ -61,7 +69,7 @@
}
// Physical optimizers which follow this need to be careful not to invalidate the inferences
- // made by this optimizer. Only optimizers which depend on the results of this one should
+ // made by this optimizer. Only optimizers which depend on the results of this one should
// follow it.
if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_INFER_BUCKET_SORT)) {
resolvers.add(new BucketingSortingInferenceOptimizer());
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java (working copy)
@@ -0,0 +1,417 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.physical;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.exec.ConditionalTask;
+import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor;
+import org.apache.hadoop.hive.ql.parse.OpParseContext;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx;
+import org.apache.hadoop.hive.ql.plan.ConditionalWork;
+import org.apache.hadoop.hive.ql.plan.FetchWork;
+import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
+import org.apache.hadoop.hive.ql.plan.MapredWork;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
+
+/**
+ * Iterate over each task. If the task has an SMB join, convert the task to a conditional task.
+ * The conditional task will first try all map-join possibilities, and fall back to the SMB join
+ * if the map-join fails. The SMB join is the backup task for all the map-join tasks.
+ */
+public class SortMergeJoinTaskDispatcher extends AbstractJoinTaskDispatcher implements Dispatcher {
+
+ public SortMergeJoinTaskDispatcher(PhysicalContext context) {
+ super(context);
+ }
+
+ // Convert the work in the SMB plan to a regular join
+ // Note that the operator tree is not modified, only the path/alias mappings in the
+ // plan are updated. The operator tree will still contain the SMBJoinOperator
+ private void genSMBJoinWork(MapredWork currWork, SMBMapJoinOperator smbJoinOp) {
+ // Remove the paths which are not part of aliasToPartitionInfo
+ Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
+ List<String> removePaths = new ArrayList<String>();
+
+ for (Map.Entry<String, ArrayList<String>> entry : currWork.getPathToAliases().entrySet()) {
+ boolean keepPath = false;
+ for (String alias : entry.getValue()) {
+ if (aliasToPartitionInfo.containsKey(alias)) {
+ keepPath = true;
+ break;
+ }
+ }
+
+ // Remove if the path is not present
+ if (!keepPath) {
+ removePaths.add(entry.getKey());
+ }
+ }
+
+ List<String> removeAliases = new ArrayList<String>();
+ for (String removePath : removePaths) {
+ removeAliases.addAll(currWork.getPathToAliases().get(removePath));
+ currWork.getPathToAliases().remove(removePath);
+ currWork.getPathToPartitionInfo().remove(removePath);
+ }
+
+ for (String alias : removeAliases) {
+ currWork.getAliasToPartnInfo().remove(alias);
+ currWork.getAliasToWork().remove(alias);
+ }
+
+ // Get the MapredLocalWork
+ MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
+
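+ // The small-table sides of the SMB join live in the local (fetch) work. Add their aliases,
+ // paths and partition descriptors back into the map-red work so the plan covers all inputs
+ // as a regular join would.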
+ for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : localWork.getAliasToWork()
+ .entrySet()) {
+ String alias = entry.getKey();
+ Operator<? extends OperatorDesc> op = entry.getValue();
+ FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
+
+ // Add the entry in mapredwork
+ currWork.getAliasToWork().put(alias, op);
+
+ PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
+ if (fetchWork.getTblDir() != null) {
+ ArrayList<String> aliases = new ArrayList<String>();
+ aliases.add(alias);
+ currWork.getPathToAliases().put(fetchWork.getTblDir(), aliases);
+ currWork.getPathToPartitionInfo().put(fetchWork.getTblDir(), partitionInfo);
+ }
+ else {
+ for (String pathDir : fetchWork.getPartDir()) {
+ ArrayList<String> aliases = new ArrayList<String>();
+ aliases.add(alias);
+ currWork.getPathToAliases().put(pathDir, aliases);
+ currWork.getPathToPartitionInfo().put(pathDir, partitionInfo);
+ }
+ }
+ }
+
+ // Remove the dummy store operator from the tree
+ for (Operator<? extends OperatorDesc> parentOp : smbJoinOp.getParentOperators()) {
+ if (parentOp instanceof DummyStoreOperator) {
+ Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
+ smbJoinOp.replaceParent(parentOp, grandParentOp);
+ grandParentOp.setChildOperators(parentOp.getChildOperators());
+ parentOp.setParentOperators(null);
+ parentOp.setParentOperators(null);
+ }
+ }
+ }
+
+ /*
+ * Convert the work containing the sort-merge join into work as if it had a regular join.
+ * Note that the operator tree is not changed - it still contains the SMB join, but the
+ * plan is changed (aliasToWork etc.) to contain all the paths as if it was a regular join.
+ */
+ private MapredWork convertSMBWorkToJoinWork(MapredWork currWork, SMBMapJoinOperator oldSMBJoinOp)
+ throws SemanticException {
+ try {
+ String xml = currWork.toXML();
+
+ // deep copy a new mapred work
+ InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8"));
+ MapredWork currJoinWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf());
+ SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
+
+ // Add the row resolver for the new operator
+ Map<Operator<? extends OperatorDesc>, OpParseContext> opParseContextMap =
+ physicalContext.getParseContext().getOpParseCtx();
+ opParseContextMap.put(newSMBJoinOp, opParseContextMap.get(oldSMBJoinOp));
+ // change the newly created map-red plan as if it was a join operator
+ genSMBJoinWork(currJoinWork, newSMBJoinOp);
+ return currJoinWork;
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
+ }
+ }
+
+ // create map join task and set big table as bigTablePosition
+ private ObjectPair<MapRedTask, String> convertSMBTaskToMapJoinTask(String xml,
+ int bigTablePosition,
+ SMBMapJoinOperator smbJoinOp,
+ QBJoinTree joinTree)
+ throws UnsupportedEncodingException, SemanticException {
+ // deep copy a new mapred work from xml
+ InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8"));
+ MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf());
+ // create a mapred task for this work
+ MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
+ .getParseContext().getConf());
+ // generate the map join operator; already checked the map join
+ MapJoinOperator newMapJoinOp =
+ getMapJoinOperator(newTask, newWork, smbJoinOp, joinTree, bigTablePosition);
+ // The reducer needs to be restored - Consider a query like:
+ // select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+ // The reducer contains a groupby, which needs to be restored.
+ Operator<? extends OperatorDesc> reducer = newWork.getReducer();
+
+ // create the local work for this plan
+ String bigTableAlias =
+ MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newMapJoinOp, bigTablePosition);
+
+ // restore the reducer
+ newWork.setReducer(reducer);
+ return new ObjectPair<MapRedTask, String>(newTask, bigTableAlias);
+ }
+
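+ /**
+ * Convert the current auto-converted sort-merge join task into a conditional task: one map-join
+ * task is generated per big-table candidate, and the original sort-merge join task is kept as
+ * the last alternative and as the backup task for each map-join.
+ */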
+ @Override
+ public Task<? extends Serializable> processCurrentTask(MapRedTask currTask,
+ ConditionalTask conditionalTask, Context context)
+ throws SemanticException {
+
+ // whether it contains a sort merge join operator
+ MapredWork currWork = currTask.getWork();
+ SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork);
+ if (originalSMBJoinOp == null) {
+ return null;
+ }
+
+ // Only create a map-join if the user explicitly gave a join (without a mapjoin hint)
+ if (!originalSMBJoinOp.isConvertedAutomaticallySMBJoin()) {
+ return null;
+ }
+ currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN);
+
+ // get parseCtx for this Join Operator
+ ParseContext parseCtx = physicalContext.getParseContext();
+ QBJoinTree joinTree = parseCtx.getSmbMapJoinContext().get(originalSMBJoinOp);
+
+ // Convert the work containing the sort-merge join into work as if it had a regular join.
+ // Note that the operator tree is not changed - it still contains the SMB join, but the
+ // plan is changed (aliasToWork etc.) to contain all the paths as if it was a regular join.
+ // This is used to convert the plan to a map-join, and then the original SMB join plan is used
+ // as a backup task.
+ MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp);
+ SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
+
+ currWork.setOpParseCtxMap(parseCtx.getOpParseCtx());
+ currWork.setJoinTree(joinTree);
+ currJoinWork.setOpParseCtxMap(parseCtx.getOpParseCtx());
+ currJoinWork.setJoinTree(joinTree);
+
+ // create conditional work list and task list
+ List<Serializable> listWorks = new ArrayList<Serializable>();
+ List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
+
+ // create alias to task mapping and alias to input file mapping for resolver
+ HashMap<String, Task<? extends Serializable>> aliasToTask =
+ new HashMap<String, Task<? extends Serializable>>();
+ // Note that pathToAlias will behave as if the original plan was a join plan
+ HashMap<String, ArrayList<String>> pathToAliases = currJoinWork.getPathToAliases();
+
+ // generate a map join task for the big table
+ SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
+ Byte[] order = originalSMBJoinDesc.getTagOrder();
+ int numAliases = order.length;
+ Set<Integer> bigTableCandidates =
+ MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
+
+ // no table could be the big table; there is no need to convert
+ if (bigTableCandidates == null) {
+ return null;
+ }
+
+ HashMap<String, Long> aliasToSize;
+ Configuration conf = context.getConf();
+ try {
+ ObjectPair<HashMap<String, Long>, Long> aliasToSizePair =
+ getAliasToSize(context, currJoinWork, pathToAliases);
+ long aliasTotalKnownInputSize = aliasToSizePair.getSecond();
+ aliasToSize = aliasToSizePair.getFirst();
+
+ String xml = currJoinWork.toXML();
+ long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf,
+ HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
+
+ for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) {
+ // this table cannot be big table
+ if (!bigTableCandidates.contains(bigTablePosition)) {
+ continue;
+ }
+
+ // create map join task for the given big table position
+ ObjectPair<MapRedTask, String> newTaskAlias =
+ convertSMBTaskToMapJoinTask(xml, bigTablePosition, newSMBJoinOp, joinTree);
+ MapRedTask newTask = newTaskAlias.getFirst();
+ String bigTableAlias = newTaskAlias.getSecond();
+
+ Long aliasKnownSize = aliasToSize.get(bigTableAlias);
+ if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) {
+ long smallTblTotalKnownSize = aliasTotalKnownInputSize
+ - aliasKnownSize.longValue();
+ if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
+ // this table is not good to be a big table.
+ continue;
+ }
+ }
+
+ // add into conditional task
+ listWorks.add(newTask.getWork());
+ listTasks.add(newTask);
+ newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
+
+ // set up backup task
+ newTask.setBackupTask(currTask);
+ newTask.setBackupChildrenTasks(currTask.getChildTasks());
+
+ // put the mapping alias to task
+ aliasToTask.put(bigTableAlias, newTask);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
+ }
+
+ // insert current common join task to conditional task
+ listWorks.add(currTask.getWork());
+ listTasks.add(currTask);
+ // clear JoinTree and OP Parse Context
+ currWork.setOpParseCtxMap(null);
+ currWork.setJoinTree(null);
+
+ // create conditional task and insert conditional task into task tree
+ ConditionalWork cndWork = new ConditionalWork(listWorks);
+ ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
+ cndTsk.setListTasks(listTasks);
+
+ // set resolver and resolver context
+ cndTsk.setResolver(new ConditionalResolverCommonJoin());
+ ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
+ resolverCtx.setPathToAliases(pathToAliases);
+ resolverCtx.setAliasToKnownSize(aliasToSize);
+ resolverCtx.setAliasToTask(aliasToTask);
+ resolverCtx.setCommonJoinTask(currTask);
+ resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
+ resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
+ cndTsk.setResolverCtx(resolverCtx);
+
+ // replace the current task with the new generated conditional task
+ replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext);
+ return cndTsk;
+ }
+
+ private boolean reducerAllowedSMBJoinOp(Operator<? extends OperatorDesc> reducer) {
+ while (reducer != null) {
+ if (!reducer.opAllowedConvertSortMergeJoin()) {
+ return false;
+ }
+
+ List<Operator<? extends OperatorDesc>> childOps = reducer.getChildOperators();
+ if ((childOps == null) || (childOps.isEmpty())) {
+ return true;
+ }
+
+ // multi-table inserts not supported
+ if (childOps.size() > 1) {
+ return false;
+ }
+ reducer = childOps.get(0);
+ }
+
+ return true;
+ }
+
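+ // Walk down the single-child operator chain of an alias looking for exactly one
+ // SMBMapJoinOperator. Returns null if none (or more than one) is found, if any operator on
+ // the chain or in the reducer disallows the conversion, or if the chain branches
+ // (multi-table insert).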
+ private SMBMapJoinOperator getSMBMapJoinOp(Operator<? extends OperatorDesc> currOp,
+ Operator<? extends OperatorDesc> reducer) {
+ SMBMapJoinOperator ret = null;
+ while (true) {
+ if (currOp instanceof SMBMapJoinOperator) {
+ if (ret != null) {
+ return null;
+ }
+ ret = (SMBMapJoinOperator) currOp;
+ }
+
+ if (!currOp.opAllowedConvertSortMergeJoin()) {
+ return null;
+ }
+
+ List<Operator<? extends OperatorDesc>> childOps = currOp.getChildOperators();
+ if ((childOps == null) || (childOps.isEmpty())) {
+ return reducerAllowedSMBJoinOp(reducer) ? ret : null;
+ }
+
+ // multi-table inserts not supported
+ if (childOps.size() > 1) {
+ return null;
+ }
+ currOp = childOps.get(0);
+ }
+ }
+
+ private SMBMapJoinOperator getSMBMapJoinOp(MapredWork work) throws SemanticException {
+ if (work != null) {
+ Operator<? extends OperatorDesc> reducer = work.getReducer();
+ for (Operator<? extends OperatorDesc> op : work.getAliasToWork().values()) {
+ SMBMapJoinOperator smbMapJoinOp = getSMBMapJoinOp(op, reducer);
+ if (smbMapJoinOp != null) {
+ return smbMapJoinOp;
+ }
+ }
+ }
+ return null;
+ }
+
+ private MapJoinOperator getMapJoinOperator(MapRedTask task,
+ MapredWork work,
+ SMBMapJoinOperator oldSMBJoinOp,
+ QBJoinTree joinTree,
+ int mapJoinPos) throws SemanticException {
+ SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(task.getWork());
+
+ // Add the row resolver for the new operator
+ Map<Operator<? extends OperatorDesc>, OpParseContext> opParseContextMap =
+ physicalContext.getParseContext().getOpParseCtx();
+ opParseContextMap.put(newSMBJoinOp, opParseContextMap.get(oldSMBJoinOp));
+
+ // generate the map join operator
+ return MapJoinProcessor.convertSMBJoinToMapJoin(opParseContextMap, newSMBJoinOp,
+ joinTree, mapJoinPos, true);
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (working copy)
@@ -17,46 +17,13 @@
*/
package org.apache.hadoop.hive.ql.optimizer.physical;
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.io.Serializable;
-import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
import java.util.List;
-import java.util.Map;
-import java.util.Stack;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.ContentSummary;
-import org.apache.hadoop.hive.common.ObjectPair;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.Context;
-import org.apache.hadoop.hive.ql.exec.ConditionalTask;
-import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
-import org.apache.hadoop.hive.ql.exec.JoinOperator;
-import org.apache.hadoop.hive.ql.exec.MapRedTask;
-import org.apache.hadoop.hive.ql.exec.Operator;
-import org.apache.hadoop.hive.ql.exec.Task;
-import org.apache.hadoop.hive.ql.exec.TaskFactory;
-import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
-import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext;
-import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor;
-import org.apache.hadoop.hive.ql.parse.ParseContext;
-import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin;
-import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx;
-import org.apache.hadoop.hive.ql.plan.ConditionalWork;
-import org.apache.hadoop.hive.ql.plan.JoinDesc;
-import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
-import org.apache.hadoop.hive.ql.plan.MapredWork;
-import org.apache.hadoop.hive.ql.plan.OperatorDesc;
-import org.apache.hadoop.hive.ql.plan.PartitionDesc;
/*
* Convert tasks involving JOIN into MAPJOIN.
@@ -105,494 +72,11 @@
TaskGraphWalker ogw = new TaskGraphWalker(disp);
// get all the tasks nodes from root task
- ArrayList topNodes = new ArrayList();
+ List<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(pctx.rootTasks);
// begin to walk through the task tree.
ogw.startWalking(topNodes, null);
return pctx;
}
-
- /**
- * Iterator each tasks. If this task has a local work,create a new task for this local work, named
- * MapredLocalTask. then make this new generated task depends on current task's parent task, and
- * make current task depends on this new generated task
- */
- class CommonJoinTaskDispatcher implements Dispatcher {
-
- private final PhysicalContext physicalContext;
-
- public CommonJoinTaskDispatcher(PhysicalContext context) {
- super();
- physicalContext = context;
- }
-
- // Get the position of the big table for this join operator and the given alias
- private int getPosition(MapredWork work, Operator extends OperatorDesc> joinOp,
- String alias) {
- Operator extends OperatorDesc> parentOp = work.getAliasToWork().get(alias);
-
- // reduceSinkOperator's child is null, but joinOperator's parents is reduceSink
- while ((parentOp.getChildOperators() != null) &&
- (!parentOp.getChildOperators().isEmpty())) {
- parentOp = parentOp.getChildOperators().get(0);
- }
-
- return joinOp.getParentOperators().indexOf(parentOp);
- }
-
- /*
- * A task and its child task has been converted from join to mapjoin.
- * See if the two tasks can be merged.
- */
- private void mergeMapJoinTaskWithChildMapJoinTask(MapRedTask task) {
- MapRedTask childTask = (MapRedTask)task.getChildTasks().get(0);
- MapredWork work = task.getWork();
- MapredLocalWork localWork = work.getMapLocalWork();
- MapredWork childWork = childTask.getWork();
- MapredLocalWork childLocalWork = childWork.getMapLocalWork();
-
- // Can this be merged
- Map> aliasToWork = work.getAliasToWork();
- if (aliasToWork.size() > 1) {
- return;
- }
-
- Operator extends OperatorDesc> op = aliasToWork.values().iterator().next();
- while (op.getChildOperators() != null) {
- // Dont perform this optimization for multi-table inserts
- if (op.getChildOperators().size() > 1) {
- return;
- }
- op = op.getChildOperators().get(0);
- }
-
- if (!(op instanceof FileSinkOperator)) {
- return;
- }
-
- FileSinkOperator fop = (FileSinkOperator)op;
- String workDir = fop.getConf().getDirName();
-
- Map> childPathToAliases = childWork.getPathToAliases();
- if (childPathToAliases.size() > 1) {
- return;
- }
-
- // The filesink writes to a different directory
- if (!childPathToAliases.keySet().iterator().next().equals(workDir)) {
- return;
- }
-
- // Either of them should not be bucketed
- if ((localWork.getBucketMapjoinContext() != null) ||
- (childLocalWork.getBucketMapjoinContext() != null)) {
- return;
- }
-
- // Merge the trees
- if (childWork.getAliasToWork().size() > 1) {
- return;
- }
-
- Operator extends Serializable> childAliasOp =
- childWork.getAliasToWork().values().iterator().next();
- if (fop.getParentOperators().size() > 1) {
- return;
- }
-
- // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the
- // top of the second
- Operator extends Serializable> parentFOp = fop.getParentOperators().get(0);
- parentFOp.getChildOperators().remove(fop);
- parentFOp.getChildOperators().add(childAliasOp);
- List> parentOps =
- new ArrayList>();
- parentOps.add(parentFOp);
- childAliasOp.setParentOperators(parentOps);
-
- work.getAliasToPartnInfo().putAll(childWork.getAliasToPartnInfo());
- for (Map.Entry childWorkEntry :
- childWork.getPathToPartitionInfo().entrySet()) {
- if (childWork.getAliasToPartnInfo().containsValue(childWorkEntry.getKey())) {
- work.getPathToPartitionInfo().put(childWorkEntry.getKey(), childWorkEntry.getValue());
- }
- }
-
- localWork.getAliasToFetchWork().putAll(childLocalWork.getAliasToFetchWork());
- localWork.getAliasToWork().putAll(childLocalWork.getAliasToWork());
-
- // remove the child task
- List> oldChildTasks = childTask.getChildTasks();
- task.setChildTasks(oldChildTasks);
- if (oldChildTasks != null) {
- for (Task extends Serializable> oldChildTask : oldChildTasks) {
- oldChildTask.getParentTasks().remove(childTask);
- oldChildTask.getParentTasks().add(task);
- }
- }
- }
-
- // create map join task and set big table as bigTablePosition
- private ObjectPair convertTaskToMapJoinTask(String xml,
- int bigTablePosition) throws UnsupportedEncodingException, SemanticException {
- // deep copy a new mapred work from xml
- InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8"));
- MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf());
- // create a mapred task for this work
- MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
- .getParseContext().getConf());
- JoinOperator newJoinOp = getJoinOp(newTask);
-
- // optimize this newWork and assume big table position is i
- String bigTableAlias =
- MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newJoinOp, bigTablePosition);
- return new ObjectPair(newTask, bigTableAlias);
- }
-
- private Task extends Serializable> processCurrentTask(MapRedTask currTask,
- ConditionalTask conditionalTask, Context context)
- throws SemanticException {
-
- // whether it contains common join op; if contains, return this common join op
- JoinOperator joinOp = getJoinOp(currTask);
- if (joinOp == null) {
- return null;
- }
- currTask.setTaskTag(Task.COMMON_JOIN);
-
- MapredWork currWork = currTask.getWork();
-
- // create conditional work list and task list
- List listWorks = new ArrayList();
- List> listTasks = new ArrayList>();
-
- // create alias to task mapping and alias to input file mapping for resolver
- HashMap> aliasToTask = new HashMap>();
- HashMap> pathToAliases = currWork.getPathToAliases();
- Map> aliasToWork = currWork.getAliasToWork();
-
- // get parseCtx for this Join Operator
- ParseContext parseCtx = physicalContext.getParseContext();
- QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp);
-
- // start to generate multiple map join tasks
- JoinDesc joinDesc = joinOp.getConf();
- Byte[] order = joinDesc.getTagOrder();
- int numAliases = order.length;
-
- long aliasTotalKnownInputSize = 0;
- HashMap aliasToSize = new HashMap();
- try {
- // go over all the input paths, and calculate a known total size, known
- // size for each input alias.
- Utilities.getInputSummary(context, currWork, null).getLength();
-
- // set alias to size mapping, this can be used to determine if one table
- // is choosen as big table, what's the total size of left tables, which
- // are going to be small tables.
- for (Map.Entry> entry : pathToAliases.entrySet()) {
- String path = entry.getKey();
- List aliasList = entry.getValue();
- ContentSummary cs = context.getCS(path);
- if (cs != null) {
- long size = cs.getLength();
- for (String alias : aliasList) {
- aliasTotalKnownInputSize += size;
- Long es = aliasToSize.get(alias);
- if (es == null) {
- es = new Long(0);
- }
- es += size;
- aliasToSize.put(alias, es);
- }
- }
- }
-
- HashSet bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
-
- // no table could be the big table; there is no need to convert
- if (bigTableCandidates == null) {
- return null;
- }
-
- Configuration conf = context.getConf();
-
- // If sizes of atleast n-1 tables in a n-way join is known, and their sum is smaller than
- // the threshold size, convert the join into map-join and don't create a conditional task
- boolean convertJoinMapJoin = HiveConf.getBoolVar(conf,
- HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
- int bigTablePosition = -1;
- if (convertJoinMapJoin) {
- // This is the threshold that the user has specified to fit in mapjoin
- long mapJoinSize = HiveConf.getLongVar(conf,
- HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
-
- boolean bigTableFound = false;
- long largestTableSize = 0;
- long sumTableSizes = 0;
- for (String alias : aliasToWork.keySet()) {
- Long size = aliasToSize.get(alias);
- // The size is not available at compile time if the input is a sub-query.
- // If the size of atleast n-1 inputs for a n-way join are available at compile time,
- // and the sum of them is less than the specified threshold, then convert the join
- // into a map-join without the conditional task.
- if ((size == null) || (size > mapJoinSize)) {
- sumTableSizes += largestTableSize;
- if (bigTableFound || (sumTableSizes > mapJoinSize)) {
- convertJoinMapJoin = false;
- break;
- }
- bigTableFound = true;
- bigTablePosition = getPosition(currWork, joinOp, alias);
- largestTableSize = mapJoinSize + 1;
- } else {
- if (size > largestTableSize) {
- sumTableSizes += largestTableSize;
- largestTableSize = size;
- bigTablePosition = getPosition(currWork, joinOp, alias);
- } else {
- sumTableSizes += size;
- }
- if (sumTableSizes > mapJoinSize) {
- convertJoinMapJoin = false;
- break;
- }
- }
- }
- }
-
- String bigTableAlias = null;
- currWork.setOpParseCtxMap(parseCtx.getOpParseCtx());
- currWork.setJoinTree(joinTree);
- String xml = currWork.toXML();
-
- if (convertJoinMapJoin) {
- // create map join task and set big table as bigTablePosition
- MapRedTask newTask = convertTaskToMapJoinTask(xml, bigTablePosition).getFirst();
-
- newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
- replaceTask(currTask, newTask, physicalContext);
-
- // Can this task be merged with the child task. This can happen if a big table is being
- // joined with multiple small tables on different keys
- // Further optimizations are possible here, a join which has been converted to a mapjoin
- // followed by a mapjoin can be performed in a single MR job.
- if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)
- && (newTask.getChildTasks().get(0).getTaskTag() == Task.MAPJOIN_ONLY_NOBACKUP)) {
- mergeMapJoinTaskWithChildMapJoinTask(newTask);
- }
-
- return newTask;
- }
-
- long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf,
- HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
- for (int i = 0; i < numAliases; i++) {
- // this table cannot be big table
- if (!bigTableCandidates.contains(i)) {
- continue;
- }
-
- // create map join task and set big table as i
- ObjectPair newTaskAlias = convertTaskToMapJoinTask(xml, i);
- MapRedTask newTask = newTaskAlias.getFirst();
- bigTableAlias = newTaskAlias.getSecond();
-
- Long aliasKnownSize = aliasToSize.get(bigTableAlias);
- if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) {
- long smallTblTotalKnownSize = aliasTotalKnownInputSize
- - aliasKnownSize.longValue();
- if(smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
- //this table is not good to be a big table.
- continue;
- }
- }
-
- // add into conditional task
- listWorks.add(newTask.getWork());
- listTasks.add(newTask);
- newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
-
- //set up backup task
- newTask.setBackupTask(currTask);
- newTask.setBackupChildrenTasks(currTask.getChildTasks());
-
- // put the mapping alias to task
- aliasToTask.put(bigTableAlias, newTask);
- }
- } catch (Exception e) {
- e.printStackTrace();
- throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
- }
-
- // insert current common join task to conditional task
- listWorks.add(currTask.getWork());
- listTasks.add(currTask);
- // clear JoinTree and OP Parse Context
- currWork.setOpParseCtxMap(null);
- currWork.setJoinTree(null);
-
- // create conditional task and insert conditional task into task tree
- ConditionalWork cndWork = new ConditionalWork(listWorks);
- ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
- cndTsk.setListTasks(listTasks);
-
- // set resolver and resolver context
- cndTsk.setResolver(new ConditionalResolverCommonJoin());
- ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
- resolverCtx.setPathToAliases(pathToAliases);
- resolverCtx.setAliasToKnownSize(aliasToSize);
- resolverCtx.setAliasToTask(aliasToTask);
- resolverCtx.setCommonJoinTask(currTask);
- resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
- resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
- cndTsk.setResolverCtx(resolverCtx);
-
- //replace the current task with the new generated conditional task
- this.replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext);
- return cndTsk;
- }
-
- private void replaceTaskWithConditionalTask(
- Task extends Serializable> currTask, ConditionalTask cndTsk,
- PhysicalContext physicalContext) {
- // add this task into task tree
- // set all parent tasks
- List> parentTasks = currTask.getParentTasks();
- currTask.setParentTasks(null);
- if (parentTasks != null) {
- for (Task extends Serializable> tsk : parentTasks) {
- // make new generated task depends on all the parent tasks of current task.
- tsk.addDependentTask(cndTsk);
- // remove the current task from its original parent task's dependent task
- tsk.removeDependentTask(currTask);
- }
- } else {
- // remove from current root task and add conditional task to root tasks
- physicalContext.removeFromRootTask(currTask);
- physicalContext.addToRootTask(cndTsk);
- }
- // set all child tasks
- List> oldChildTasks = currTask.getChildTasks();
- if (oldChildTasks != null) {
- for (Task extends Serializable> tsk : cndTsk.getListTasks()) {
- if (tsk.equals(currTask)) {
- continue;
- }
- for (Task extends Serializable> oldChild : oldChildTasks) {
- tsk.addDependentTask(oldChild);
- }
- }
- }
- }
-
- // Replace the task with the new task. Copy the children and parents of the old
- // task to the new task.
- private void replaceTask(
- Task extends Serializable> currTask, Task extends Serializable> newTask,
- PhysicalContext physicalContext) {
- // add this task into task tree
- // set all parent tasks
- List> parentTasks = currTask.getParentTasks();
- currTask.setParentTasks(null);
- if (parentTasks != null) {
- for (Task extends Serializable> tsk : parentTasks) {
- // make new generated task depends on all the parent tasks of current task.
- tsk.addDependentTask(newTask);
- // remove the current task from its original parent task's dependent task
- tsk.removeDependentTask(currTask);
- }
- } else {
- // remove from current root task and add conditional task to root tasks
- physicalContext.removeFromRootTask(currTask);
- physicalContext.addToRootTask(newTask);
- }
-
- // set all child tasks
- List> oldChildTasks = currTask.getChildTasks();
- currTask.setChildTasks(null);
- if (oldChildTasks != null) {
- for (Task extends Serializable> tsk : oldChildTasks) {
- // make new generated task depends on all the parent tasks of current task.
- newTask.addDependentTask(tsk);
- // remove the current task from its original parent task's dependent task
- tsk.getParentTasks().remove(currTask);
- }
- }
- }
-
- @Override
- public Object dispatch(Node nd, Stack stack, Object... nodeOutputs)
- throws SemanticException {
- if (nodeOutputs == null || nodeOutputs.length == 0) {
- throw new SemanticException("No Dispatch Context");
- }
-
- TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0];
-
- Task extends Serializable> currTask = (Task extends Serializable>) nd;
- // not map reduce task or not conditional task, just skip
- if (currTask.isMapRedTask()) {
- if (currTask instanceof ConditionalTask) {
- // get the list of task
- List> taskList = ((ConditionalTask) currTask).getListTasks();
- for (Task extends Serializable> tsk : taskList) {
- if (tsk.isMapRedTask()) {
- Task extends Serializable> newTask = this.processCurrentTask((MapRedTask) tsk,
- ((ConditionalTask) currTask), physicalContext.getContext());
- walkerCtx.addToDispatchList(newTask);
- }
- }
- } else {
- Task extends Serializable> newTask =
- this.processCurrentTask((MapRedTask) currTask, null, physicalContext.getContext());
- walkerCtx.addToDispatchList(newTask);
- }
- }
- return null;
- }
-
- /*
- * If any operator which does not allow map-side conversion is present in the mapper, dont
- * convert it into a conditional task.
- */
- private boolean checkOperatorOKMapJoinConversion(Operator extends OperatorDesc> op) {
- if (!op.opAllowedConvertMapJoin()) {
- return false;
- }
-
- if (op.getChildOperators() == null) {
- return true;
- }
-
- for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) {
- if (!checkOperatorOKMapJoinConversion(childOp)) {
- return false;
- }
- }
-
- return true;
- }
-
- private JoinOperator getJoinOp(MapRedTask task) throws SemanticException {
- MapredWork work = task.getWork();
- if (work == null) {
- return null;
- }
- Operator<? extends OperatorDesc> reducerOp = work.getReducer();
- if (reducerOp instanceof JoinOperator) {
- /* Is any operator present, which prevents the conversion */
- Map<String, Operator<? extends OperatorDesc>> aliasToWork = work.getAliasToWork();
- for (Operator<? extends OperatorDesc> op : aliasToWork.values()) {
- if (!checkOperatorOKMapJoinConversion(op)) {
- return null;
- }
- }
- return (JoinOperator) reducerOp;
- } else {
- return null;
- }
- }
- }
}
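
The removed resolver code above rewires the task graph when a join task is replaced: every parent of the old task is re-pointed at the replacement, and every child is re-attached under it. Below is a minimal, self-contained sketch of that rewiring pattern; the Node class and replace method are simplified stand-ins for Hive's Task<? extends Serializable> API and are not part of this patch.

// Minimal sketch of the parent/child rewiring done by the removed replaceTask
// logic above. "Node" is a hypothetical stand-in for Hive's Task class.
import java.util.ArrayList;
import java.util.List;

public class TaskGraphSketch {

  static class Node {
    final String name;
    final List<Node> parents = new ArrayList<Node>();
    final List<Node> children = new ArrayList<Node>();
    Node(String name) { this.name = name; }
  }

  // Detach oldTask from the graph and attach newTask in its place,
  // preserving all parent and child edges.
  static void replace(Node oldTask, Node newTask) {
    for (Node parent : oldTask.parents) {
      parent.children.remove(oldTask);   // parent no longer feeds the old task
      parent.children.add(newTask);      // parent now feeds the new task
      newTask.parents.add(parent);
    }
    for (Node child : oldTask.children) {
      child.parents.remove(oldTask);     // child no longer depends on the old task
      child.parents.add(newTask);        // child now depends on the new task
      newTask.children.add(child);
    }
    oldTask.parents.clear();
    oldTask.children.clear();
  }

  public static void main(String[] args) {
    Node scan = new Node("scan");
    Node join = new Node("common-join");
    Node sink = new Node("sink");
    scan.children.add(join); join.parents.add(scan);
    join.children.add(sink); sink.parents.add(join);

    replace(join, new Node("map-join"));
    // prints: scan -> map-join -> sink
    System.out.println(scan.name + " -> " + scan.children.get(0).name
        + " -> " + scan.children.get(0).children.get(0).name);
  }
}
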
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java (working copy)
@@ -274,4 +274,9 @@
// optimizations for now.
return false;
}
+
+ @Override
+ public boolean opAllowedConvertSortMergeJoin() {
+ return false;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java (working copy)
@@ -166,5 +166,4 @@
protected boolean hasAnyNulls(AbstractMapJoinKey key) {
return key.hasAnyNulls(nullsafes);
}
-
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (working copy)
@@ -80,6 +80,7 @@
// hive.auto.convert.join.noconditionaltask is set to true. No conditional task was
// created in case the mapjoin failed.
public static final int MAPJOIN_ONLY_NOBACKUP = 6;
+ public static final int CONVERTED_SORTMERGEJOIN = 7;
// Descendants tasks who subscribe feeds from this task
protected transient List<Task<? extends Serializable>> feedSubscribers;
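
The new CONVERTED_SORTMERGEJOIN constant extends the existing integer task-tag convention visible in the context above. The sketch below only illustrates how such a tag would be set and queried; TaggedTask and its methods are hypothetical stand-ins, not Hive's Task API.

// Hypothetical sketch of the task-tag convention (not part of the patch):
// a task converted to a sort-merge join is marked with the new tag so that
// later passes can recognize it and skip further conversion attempts.
public class TaggedTask {
  public static final int MAPJOIN_ONLY_NOBACKUP = 6;    // existing tag (see context above)
  public static final int CONVERTED_SORTMERGEJOIN = 7;  // tag added by this patch

  private int taskTag = 0;

  public void markConvertedSortMergeJoin() {
    this.taskTag = CONVERTED_SORTMERGEJOIN;
  }

  public boolean isConvertedSortMergeJoin() {
    return taskTag == CONVERTED_SORTMERGEJOIN;
  }

  public static void main(String[] args) {
    TaggedTask t = new TaggedTask();
    t.markConvertedSortMergeJoin();
    System.out.println(t.isConvertedSortMergeJoin());  // prints: true
  }
}
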
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/UnionOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/UnionOperator.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/UnionOperator.java (working copy)
@@ -163,4 +163,9 @@
public boolean opAllowedAfterMapJoin() {
return false;
}
+
+ @Override
+ public boolean opAllowedConvertSortMergeJoin() {
+ return false;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy)
@@ -1514,4 +1514,13 @@
public boolean opAllowedConvertMapJoin() {
return true;
}
+
+ /*
+ * Whether a task that contains a sort-merge join can still be converted to a map-join task when
+ * this operator is present in the mapper. For example, if a sort-merge join operator is followed
+ * by a regular join, the task cannot be converted to an auto map-join.
+ */
+ public boolean opAllowedConvertSortMergeJoin() {
+ return true;
+ }
}
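
The default implementation above returns true, while JoinOperator and UnionOperator override it to return false, so the presence of either operator in the mapper blocks the conversion. A caller would be expected to walk the operator tree recursively, in the same style as checkOperatorOKMapJoinConversion in the code removed above. The helper below is a hypothetical sketch of that walk (it assumes Hive's Operator and OperatorDesc classes are on the classpath) and is not part of this patch.

// Hypothetical recursive check, mirroring checkOperatorOKMapJoinConversion above:
// a mapper containing a sort-merge join may be converted to a map-join only if
// every operator in the tree allows it.
private boolean checkOperatorOKSortMergeJoinConversion(Operator<? extends OperatorDesc> op) {
  if (!op.opAllowedConvertSortMergeJoin()) {
    return false;
  }
  if (op.getChildOperators() == null) {
    return true;
  }
  for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) {
    if (!checkOperatorOKSortMergeJoinConversion(childOp)) {
      return false;
    }
  }
  return true;
}
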
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverCommonJoin.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverCommonJoin.java (revision 1447595)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverCommonJoin.java (working copy)
@@ -71,7 +71,7 @@
}
public HashMap<String, Long> getAliasToKnownSize() {
- return aliasToKnownSize;
+ return aliasToKnownSize == null ? new HashMap<String, Long>() : aliasToKnownSize;
}
public void setAliasToKnownSize(HashMap<String, Long> aliasToKnownSize) {