diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java index 8139f4a..da9423e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java @@ -36,6 +36,7 @@ import org.apache.hadoop.hive.ql.exec.JoinOperator; import org.apache.hadoop.hive.ql.exec.MapRedTask; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -364,6 +365,17 @@ private void mergeMapJoinTaskWithMapReduceTask(MapRedTask mapJoinTask, Configura return; } + // remove the unnecessary TableScan + if (childAliasOp instanceof TableScanOperator) { + TableScanOperator tso = (TableScanOperator)childAliasOp; + if (tso.getNumChild() != 1) { + // shouldn't happen + return; + } + childAliasOp = tso.getChildOperators().get(0); + childAliasOp.getParentOperators().remove(tso); + } + // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the // top of the second Operator parentFOp = mapJoinTaskFileSinkOperator diff --git ql/src/test/queries/clientpositive/auto_join33.q ql/src/test/queries/clientpositive/auto_join33.q new file mode 100644 index 0000000..5c85842 --- /dev/null +++ ql/src/test/queries/clientpositive/auto_join33.q @@ -0,0 +1,16 @@ +set hive.auto.convert.join=true; +set hive.optimize.mapjoin.mapreduce=true; + +-- empty tables +create table studenttab10k (name string, age int, gpa double); +create table votertab10k (name string, age int, registration string, contributions float); + +explain select s.name, count(distinct registration) +from studenttab10k s join votertab10k v +on (s.name = v.name) +group by s.name; + +select s.name, count(distinct registration) +from studenttab10k s join votertab10k v +on (s.name = v.name) +group by s.name; diff --git ql/src/test/results/clientpositive/auto_join33.q.out ql/src/test/results/clientpositive/auto_join33.q.out new file mode 100644 index 0000000..8fc0e84 --- /dev/null +++ ql/src/test/results/clientpositive/auto_join33.q.out @@ -0,0 +1,148 @@ +PREHOOK: query: -- empty tables +create table studenttab10k (name string, age int, gpa double) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- empty tables +create table studenttab10k (name string, age int, gpa double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@studenttab10k +PREHOOK: query: create table votertab10k (name string, age int, registration string, contributions float) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table votertab10k (name string, age int, registration string, contributions float) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@votertab10k +PREHOOK: query: explain select s.name, count(distinct registration) +from studenttab10k s join votertab10k v +on (s.name = v.name) +group by s.name +PREHOOK: type: QUERY +POSTHOOK: query: explain select s.name, count(distinct registration) +from studenttab10k s join votertab10k v +on (s.name = v.name) +group by s.name +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME studenttab10k) s) (TOK_TABREF (TOK_TABNAME votertab10k) v) (= (. (TOK_TABLE_OR_COL s) name) (. (TOK_TABLE_OR_COL v) name)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL s) name)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL registration)))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL s) name)))) + +STAGE DEPENDENCIES: + Stage-5 is a root stage + Stage-4 depends on stages: Stage-5 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + s + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + s + TableScan + alias: s + HashTable Sink Operator + condition expressions: + 0 {name} + 1 {registration} + handleSkewJoin: false + keys: + 0 [Column[name]] + 1 [Column[name]] + Position of Big Table: 1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + v + TableScan + alias: v + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {name} + 1 {registration} + handleSkewJoin: false + keys: + 0 [Column[name]] + 1 [Column[name]] + outputColumnNames: _col0, _col7 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col7 + type: string + outputColumnNames: _col0, _col7 + Group By Operator + aggregations: + expr: count(DISTINCT _col7) + bucketGroup: false + keys: + expr: _col0 + type: string + expr: _col7 + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(DISTINCT KEY._col1:0._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select s.name, count(distinct registration) +from studenttab10k s join votertab10k v +on (s.name = v.name) +group by s.name +PREHOOK: type: QUERY +PREHOOK: Input: default@studenttab10k +PREHOOK: Input: default@votertab10k +#### A masked pattern was here #### +POSTHOOK: query: select s.name, count(distinct registration) +from studenttab10k s join votertab10k v +on (s.name = v.name) +group by s.name +POSTHOOK: type: QUERY +POSTHOOK: Input: default@studenttab10k +POSTHOOK: Input: default@votertab10k +#### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/multiMapJoin1.q.out ql/src/test/results/clientpositive/multiMapJoin1.q.out index 542f3b4..3b3eb3f 100644 --- ql/src/test/results/clientpositive/multiMapJoin1.q.out +++ ql/src/test/results/clientpositive/multiMapJoin1.q.out @@ -598,18 +598,18 @@ STAGE PLANS: type: string mode: hash outputColumnNames: _col0, _col1 - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: -1 - value expressions: - expr: _col1 - type: bigint + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint Local Work: Map Reduce Local Work Reduce Operator Tree: @@ -1836,12 +1836,12 @@ STAGE PLANS: bucketGroup: false mode: hash outputColumnNames: _col0 - Reduce Output Operator - sort order: - tag: -1 - value expressions: - expr: _col0 - type: bigint + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint Local Work: Map Reduce Local Work Reduce Operator Tree: