Index: ql/src/test/results/clientpositive/smb_mapjoin_14.q.out =================================================================== --- ql/src/test/results/clientpositive/smb_mapjoin_14.q.out (revision 0) +++ ql/src/test/results/clientpositive/smb_mapjoin_14.q.out (working copy) @@ -0,0 +1,855 @@ +PREHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tbl1 +PREHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tbl2 +PREHOOK: query: insert overwrite table tbl1 +select * from src where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tbl1 +POSTHOOK: query: insert overwrite table tbl1 +select * from src where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tbl1 +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table tbl2 +select * from src where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tbl2 +POSTHOOK: query: insert overwrite table tbl2 +select * from src where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tbl2 +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- The mapjoin is being performed as part of sub-query. It should be still converted to a sort-merge join +explain +select count(*) from ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- The mapjoin is being performed as part of sub-query. It should be still converted to a sort-merge join +explain +select count(*) from ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:b + TableScan + alias: b + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +22 +PREHOOK: query: -- The mapjoin is being performed as part of sub-query. It should be still converted to a sort-merge join +explain +select key, count(*) from +( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +PREHOOK: type: QUERY +POSTHOOK: query: -- The mapjoin is being performed as part of sub-query. 
It should be still converted to a sort-merge join +explain +select key, count(*) from +( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:b + TableScan + alias: b + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: int + 
sort order: + + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select key, count(*) from +( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select key, count(*) from +( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key +) subq1 +group by key +order by key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +0 9 +2 1 +4 1 +5 9 +8 1 +9 1 +PREHOOK: query: -- The mapjoin is being performed as part of more than one sub-query. It should be still converted to a sort-merge join +explain +select count(*) from +( + select key, count(*) from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2 +PREHOOK: type: QUERY +POSTHOOK: query: -- The mapjoin is being performed as part of more than one sub-query. It should be still converted to a sort-merge join +explain +select count(*) from +( + select key, count(*) from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq2:subq1:b + TableScan + alias: b + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) from +( + select key, count(*) from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 + group by key +) subq2 +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from +( + select key, count(*) from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key 
+ ) subq1 + group by key +) subq2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +6 +PREHOOK: query: -- A join is being performed across different sub-queries, where a mapjoin is being performed in each of them. +explain +select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +PREHOOK: type: QUERY +POSTHOOK: query: -- A join is being performed across different sub-queries, where a mapjoin is being performed in each of them. +explain +select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) val2)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) src1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) value) val1) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) value) val2)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) src2) (= (. (TOK_TABLE_OR_COL src1) key) (. (TOK_TABLE_OR_COL src2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) cnt1)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src2) cnt1))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2, Stage-6 + Stage-5 is a root stage + Stage-6 depends on stages: Stage-5 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src1:subq1:b + TableScan + alias: b + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: 0 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: 1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col3 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + expr: _col3 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + src2:subq2:b + TableScan + alias: b + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl1 +PREHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: query: select src1.key, src1.cnt1, src2.cnt1 from +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq1 group by key +) src1 +join +( + select key, count(*) as cnt1 from + ( + select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key + ) subq2 group by key +) src2 +on src1.key = src2.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl1 +POSTHOOK: Input: default@tbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), 
]
+0 9 9
+2 1 1
+4 1 1
+5 9 9
+8 1 1
+9 1 1
Index: ql/src/test/queries/clientpositive/smb_mapjoin_14.q
===================================================================
--- ql/src/test/queries/clientpositive/smb_mapjoin_14.q	(revision 0)
+++ ql/src/test/queries/clientpositive/smb_mapjoin_14.q	(working copy)
@@ -0,0 +1,191 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 1;
+
+CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+
+insert overwrite table tbl1
+select * from src where key < 10;
+
+insert overwrite table tbl2
+select * from src where key < 10;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a) subq2
+  on subq1.key = subq2.key;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a) subq2
+  on subq1.key = subq2.key;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+set hive.optimize.bucketmapjoin = true;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a) subq2
+  on subq1.key = subq2.key;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a) subq2
+  on subq1.key = subq2.key;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
+select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key;
+
+-- The mapjoin is being performed as part of sub-query. It should be still converted to a sort-merge join
+explain
+select count(*) from (
+  select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1;
+
+select count(*) from (
+  select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1;
+
+-- The mapjoin is being performed as part of sub-query. It should be still converted to a sort-merge join
+-- Add an order by at the end to make the results deterministic.
+explain
+select key, count(*) from
+(
+  select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key;
+
+select key, count(*) from
+(
+  select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+) subq1
+group by key
+order by key;
+
+-- The mapjoin is being performed as part of more than one sub-query. It should be still converted to a sort-merge join
+explain
+select count(*) from
+(
+  select key, count(*) from
+  (
+    select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+  ) subq1
+  group by key
+) subq2;
+
+select count(*) from
+(
+  select key, count(*) from
+  (
+    select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+  ) subq1
+  group by key
+) subq2;
+
+-- A join is being performed across different sub-queries, where a mapjoin is being performed in each of them.
+explain
+select src1.key, src1.cnt1, src2.cnt1 from
+(
+  select key, count(*) as cnt1 from
+  (
+    select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+  ) subq1 group by key
+) src1
+join
+(
+  select key, count(*) as cnt1 from
+  (
+    select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+  ) subq2 group by key
+) src2
+on src1.key = src2.key;
+
+select src1.key, src1.cnt1, src2.cnt1 from
+(
+  select key, count(*) as cnt1 from
+  (
+    select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+  ) subq1 group by key
+) src1
+join
+(
+  select key, count(*) as cnt1 from
+  (
+    select /*+mapjoin(a)*/ a.key as key, a.value as val1, b.value as val2 from tbl1 a join tbl2 b on a.key = b.key
+  ) subq2 group by key
+) src2
+on src1.key = src2.key;
+
+explain
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, a.value as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, a.value as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+explain
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key as key, concat(a.value, a.value) as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key as key, concat(a.value, a.value) as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+explain
+select /*+mapjoin(subq1)*/ * from
+  (select a.key +1 as key, concat(a.value, a.value) as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key +1 as key, concat(a.value, a.value) as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
+
+select /*+mapjoin(subq1)*/ * from
+  (select a.key +1 as key, concat(a.value, a.value) as value from tbl1 a where key < 10) subq1
+  join
+  (select a.key +1 as key, concat(a.value, a.value) as value from tbl2 a where key < 10) subq2
+  on subq1.key = subq2.key;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java	(revision 1411058)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java	(working copy)
@@ -30,7 +30,6 @@
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.Order;
 import org.apache.hadoop.hive.ql.ErrorMsg;
-import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
 import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
@@ -52,12 +51,11 @@
 import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
 import org.apache.hadoop.hive.ql.parse.QBJoinTree;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 
 //try to replace a bucket map join with a sorted merge map join
 public class SortedMergeBucketMapJoinOptimizer implements Transform {
@@ -134,6 +132,13 @@
       return false;
     }
     String[] srcs = joinCxt.getBaseSrc();
+    String[] newSrcs = new String[srcs.length];
+    for (int srcPos = 0; srcPos < srcs.length; srcPos++) {
+      newSrcs[srcPos] =
+        joinCxt.getId() == null ? srcs[srcPos] : joinCxt.getId() + ":" + srcs[srcPos];
+    }
+
+    srcs = newSrcs;
     int pos = 0;
 
     // All the tables/partitions columns should be sorted in the same order
@@ -147,9 +152,9 @@
           && isTableSorted(this.pGraphContext,
               mapJoinOp,
               joinCxt,
-              src,
               pos,
-              sortColumnsFirstTable);
+              sortColumnsFirstTable,
+              srcs);
       pos++;
     }
     if (!tableSorted) {
@@ -213,6 +218,17 @@
     return smbJop;
   }
 
+  public List<String> toColumns(List<ExprNodeDesc> keys) {
+    List<String> columns = new ArrayList<String>();
+    for (ExprNodeDesc key : keys) {
+      if (!(key instanceof ExprNodeColumnDesc)) {
+        return null;
+      }
+      columns.add(((ExprNodeColumnDesc) key).getColumn());
+    }
+    return columns;
+  }
+
   /**
    * Whether this table is eligible for a sort-merge join.
   *
@@ -229,40 +245,44 @@
   private boolean isTableSorted(ParseContext pctx,
       MapJoinOperator op,
       QBJoinTree joinTree,
-      String alias,
       int pos,
-      List<Order> sortColumnsFirstTable)
+      List<Order> sortColumnsFirstTable,
+      String[] aliases)
     throws SemanticException {
-
-    Map<String, Operator<? extends OperatorDesc>> topOps = this.pGraphContext
-        .getTopOps();
+    String alias = aliases[pos];
     Map<TableScanOperator, Table> topToTable = this.pGraphContext
        .getTopToTable();
-    TableScanOperator tso = (TableScanOperator) topOps.get(alias);
+
+    Operator<? extends OperatorDesc> topOp = joinTree.getAliasToOpInfo().get(alias);
+    if (topOp == null) {
+      return false;
+    }
+    List<String> joinCols = toColumns(op.getConf().getKeys().get((byte) pos));
+    if (joinCols == null || joinCols.isEmpty()) {
+      return false;
+    }
+    TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
     if (tso == null) {
      return false;
    }
-    List<ExprNodeDesc> keys = op.getConf().getKeys().get((byte) pos);
-    // get all join columns from join keys stored in MapJoinDesc
-    List<String> joinCols = new ArrayList<String>();
-    List<ExprNodeDesc> joinKeys = new ArrayList<ExprNodeDesc>();
-    joinKeys.addAll(keys);
-    while (joinKeys.size() > 0) {
-      ExprNodeDesc node = joinKeys.remove(0);
-      if (node instanceof ExprNodeColumnDesc) {
-        joinCols.addAll(node.getCols());
-      } else if (node instanceof ExprNodeGenericFuncDesc) {
-        ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
-        GenericUDF udf = udfNode.getGenericUDF();
-        if (!FunctionRegistry.isDeterministic(udf)) {
-          return false;
+    // For nested sub-queries, the alias mapping is not maintained in QB currently.
+    if (pGraphContext.getTopOps().containsValue(tso)) {
+      for (Map.Entry<String, Operator<? extends OperatorDesc>> topOpEntry :
+        this.pGraphContext.getTopOps().entrySet()) {
+        if (topOpEntry.getValue() == tso) {
+          alias = topOpEntry.getKey();
+          aliases[pos] = alias;
+          break;
        }
-        joinKeys.addAll(0, udfNode.getChildExprs());
      }
    }
+    else {
+      return false;
+    }
 
     Table tbl = topToTable.get(tso);
+
     if (tbl.isPartitioned()) {
       PrunedPartitionList prunedParts = null;
       try {
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java	(revision 1411058)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java	(working copy)
@@ -40,10 +40,10 @@
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.ErrorMsg;
 import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.UnionOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
-import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.UnionOperator;
 import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
 import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
 import org.apache.hadoop.hive.ql.lib.Dispatcher;
@@ -61,6 +61,7 @@
 import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
 import org.apache.hadoop.hive.ql.parse.QBJoinTree;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
@@ -170,19 +171,26 @@
     String[] left = joinCxt.getLeftAliases();
     List<String> mapAlias = joinCxt.getMapAliases();
     String baseBigAlias = null;
-    for(String s : left) {
-      if(s != null && !joinAliases.contains(s)) {
-        joinAliases.add(s);
-        if(!mapAlias.contains(s)) {
-          baseBigAlias = s;
+    for (String s : left) {
+      if (s != null) {
+        String subQueryAlias = joinCxt.getId() == null ? s : joinCxt.getId() + ":" + s;
+        if (!joinAliases.contains(subQueryAlias)) {
+          joinAliases.add(subQueryAlias);
+          if(!mapAlias.contains(s)) {
+            baseBigAlias = subQueryAlias;
+          }
         }
       }
     }
-    for(String s : srcs) {
-      if(s != null && !joinAliases.contains(s)) {
-        joinAliases.add(s);
-        if(!mapAlias.contains(s)) {
-          baseBigAlias = s;
+
+    for (String s : srcs) {
+      if (s != null) {
+        String subQueryAlias = joinCxt.getId() == null ? s : joinCxt.getId() + ":" + s;
+        if (!joinAliases.contains(subQueryAlias)) {
+          joinAliases.add(subQueryAlias);
+          if(!mapAlias.contains(s)) {
+            baseBigAlias = subQueryAlias;
+          }
         }
       }
     }
@@ -206,14 +214,43 @@
     boolean bigTablePartitioned = true;
     for (int index = 0; index < joinAliases.size(); index++) {
       String alias = joinAliases.get(index);
-      TableScanOperator tso = (TableScanOperator) topOps.get(alias);
-      if (tso == null) {
+      Operator<? extends OperatorDesc> topOp = joinCxt.getAliasToOpInfo().get(alias);
+      if (topOp == null) {
         return false;
       }
       List<String> keys = toColumns(mjDesc.getKeys().get((byte) index));
       if (keys == null || keys.isEmpty()) {
         return false;
       }
+      int keysSize = keys.size();
+      TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
+      if (tso == null) {
+        return false;
+      }
+
+      // For nested sub-queries, the alias mapping is not maintained in QB currently.
+      if (pGraphContext.getTopOps().containsValue(tso)) {
+        for (Map.Entry<String, Operator<? extends OperatorDesc>> topOpEntry : topOps.entrySet()) {
+          if (topOpEntry.getValue() == tso) {
+            String newAlias = topOpEntry.getKey();
+            joinAliases.set(index, newAlias);
+            if (baseBigAlias.equals(alias)) {
+              baseBigAlias = newAlias;
+            }
+            alias = newAlias;
+            break;
+          }
+        }
+      }
+      else {
+        return false;
+      }
+
+      // In case of a constant join key, return false
+      if (keys.size() != keysSize) {
+        return false;
+      }
+
       if (orders == null) {
         orders = new Integer[keys.size()];
       }
@@ -374,7 +411,7 @@
     return null;
   }
 
-  private List<String> toColumns(List<ExprNodeDesc> keys) {
+  public List<String> toColumns(List<ExprNodeDesc> keys) {
     List<String> columns = new ArrayList<String>();
     for (ExprNodeDesc key : keys) {
       if (!(key instanceof ExprNodeColumnDesc)) {
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java	(revision 1411058)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java	(working copy)
@@ -22,8 +22,12 @@
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Map.Entry;
 
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
 /**
  * Internal representation of the join tree.
  *
@@ -39,7 +43,12 @@
   private JoinCond[] joinCond;
   private boolean noOuterJoin;
   private boolean noSemiJoin;
+  private Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo;
+  // The subquery identifier from QB.
+  // It is of the form topSubQuery:innerSubQuery:....:innerMostSubQuery
+  private String id;
+
   // keeps track of the right-hand-side table name of the left-semi-join, and
   // its list of join keys
   private final HashMap<String, ArrayList<ASTNode>> rhsSemijoin;
@@ -74,6 +83,7 @@
     noOuterJoin = true;
     noSemiJoin = true;
     rhsSemijoin = new HashMap<String, ArrayList<ASTNode>>();
+    aliasToOpInfo = new HashMap<String, Operator<? extends OperatorDesc>>();
   }
 
   /**
@@ -320,4 +330,20 @@
   public void setFilterMap(int[][] filterMap) {
     this.filterMap = filterMap;
   }
+
+  public Map<String, Operator<? extends OperatorDesc>> getAliasToOpInfo() {
+    return aliasToOpInfo;
+  }
+
+  public void setAliasToOpInfo(Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo) {
+    this.aliasToOpInfo = aliasToOpInfo;
+  }
+
+  public String getId() {
+    return id;
+  }
+
+  public void setId(String id) {
+    this.id = id;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(revision 1411058)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(working copy)
@@ -5568,7 +5568,7 @@
   }
 
   private Operator genJoinOperator(QB qb, QBJoinTree joinTree,
-      HashMap<String, Operator> map) throws SemanticException {
+      Map<String, Operator> map) throws SemanticException {
     QBJoinTree leftChild = joinTree.getJoinSrc();
     Operator joinSrcOp = null;
     if (leftChild != null) {
@@ -5767,7 +5767,7 @@
     }
   }
 
-  private Operator genJoinPlan(QB qb, HashMap<String, Operator> map)
+  private Operator genJoinPlan(QB qb, Map<String, Operator> map)
       throws SemanticException {
     QBJoinTree joinTree = qb.getQbJoinTree();
     Operator joinOp = genJoinOperator(qb, joinTree, map);
@@ -5779,7 +5779,7 @@
    * source operators. This procedure traverses the query tree recursively,
    */
   private void pushJoinFilters(QB qb, QBJoinTree joinTree,
-      HashMap<String, Operator> map) throws SemanticException {
+      Map<String, Operator> map) throws SemanticException {
     if (joinTree.getJoinSrc() != null) {
       pushJoinFilters(qb, joinTree.getJoinSrc(), map);
     }
@@ -5819,7 +5819,8 @@
     return cols;
   }
 
-  private QBJoinTree genUniqueJoinTree(QB qb, ASTNode joinParseTree)
+  private QBJoinTree genUniqueJoinTree(QB qb, ASTNode joinParseTree,
+      Map<String, Operator> aliasToOpInfo)
       throws SemanticException {
     QBJoinTree joinTree = new QBJoinTree();
     joinTree.setNoOuterJoin(false);
@@ -5858,6 +5859,9 @@
       } else {
         rightAliases.add(alias);
       }
+      joinTree.getAliasToOpInfo().put(
+        qb.getId() == null ? alias : qb.getId() + ":" + alias, aliasToOpInfo.get(alias));
+      joinTree.setId(qb.getId());
       baseSrc.add(alias);
 
       preserved.add(lastPreserved);
@@ -5915,7 +5919,8 @@
     return joinTree;
   }
 
-  private QBJoinTree genJoinTree(QB qb, ASTNode joinParseTree)
+  private QBJoinTree genJoinTree(QB qb, ASTNode joinParseTree,
+      Map<String, Operator> aliasToOpInfo)
       throws SemanticException {
     QBJoinTree joinTree = new QBJoinTree();
     JoinCond[] condn = new JoinCond[1];
@@ -5962,8 +5967,11 @@
       String[] children = new String[2];
       children[0] = alias;
       joinTree.setBaseSrc(children);
+      joinTree.setId(qb.getId());
+      joinTree.getAliasToOpInfo().put(
+        qb.getId() == null ? alias : qb.getId() + ":" + alias, aliasToOpInfo.get(alias));
     } else if (isJoinToken(left)) {
-      QBJoinTree leftTree = genJoinTree(qb, left);
+      QBJoinTree leftTree = genJoinTree(qb, left, aliasToOpInfo);
       joinTree.setJoinSrc(leftTree);
       String[] leftChildAliases = leftTree.getLeftAliases();
       String leftAliases[] = new String[leftChildAliases.length + 1];
@@ -5992,6 +6000,10 @@
       }
       children[1] = alias;
       joinTree.setBaseSrc(children);
+      aliasToOpInfo.get(alias);
+      joinTree.setId(qb.getId());
+      joinTree.getAliasToOpInfo().put(
+        qb.getId() == null ? alias : qb.getId() + ":" + alias, aliasToOpInfo.get(alias));
       // remember rhs table for semijoin
       if (joinTree.getNoSemiJoin() == false) {
         joinTree.addRHSSemijoin(alias);
@@ -6096,6 +6108,7 @@
       rightAliases[i + trgtRightAliases.length] = nodeRightAliases[i];
     }
     target.setRightAliases(rightAliases);
+    target.getAliasToOpInfo().putAll(node.getAliasToOpInfo());
 
     String[] nodeBaseSrc = node.getBaseSrc();
     String[] trgtBaseSrc = target.getBaseSrc();
@@ -7407,7 +7420,7 @@
   public Operator genPlan(QB qb) throws SemanticException {
 
     // First generate all the opInfos for the elements in the from clause
-    HashMap<String, Operator> aliasToOpInfo = new HashMap<String, Operator>();
+    Map<String, Operator> aliasToOpInfo = new HashMap<String, Operator>();
 
     // Recurse over the subqueries to fill the subquery part of the plan
     for (String alias : qb.getSubqAliases()) {
@@ -7433,10 +7446,10 @@
 
       ASTNode joinExpr = qb.getParseInfo().getJoinExpr();
       if (joinExpr.getToken().getType() == HiveParser.TOK_UNIQUEJOIN) {
-        QBJoinTree joinTree = genUniqueJoinTree(qb, joinExpr);
+        QBJoinTree joinTree = genUniqueJoinTree(qb, joinExpr, aliasToOpInfo);
         qb.setQbJoinTree(joinTree);
       } else {
-        QBJoinTree joinTree = genJoinTree(qb, joinExpr);
+        QBJoinTree joinTree = genJoinTree(qb, joinExpr, aliasToOpInfo);
         qb.setQbJoinTree(joinTree);
         mergeJoinTree(qb);
       }
@@ -7472,7 +7485,7 @@
    *
    * @throws SemanticException
    */
-  void genLateralViewPlans(HashMap<String, Operator> aliasToOpInfo, QB qb)
+  void genLateralViewPlans(Map<String, Operator> aliasToOpInfo, QB qb)
       throws SemanticException {
     Map<String, ArrayList<ASTNode>> aliasToLateralViews = qb.getParseInfo()
         .getAliasToLateralViews();
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/TableAccessAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/TableAccessAnalyzer.java	(revision 1411058)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/TableAccessAnalyzer.java	(working copy)
@@ -226,10 +226,9 @@
    * names on that table that map to the keys used for the input
    * operator (which is currently only a join or group by).
    */
-  private static TableScanOperator genRootTableScan(
+  public static TableScanOperator genRootTableScan(
       Operator<? extends OperatorDesc> op, List<String> keyNames) {
 
-    boolean complexTree = false;
     Operator<? extends OperatorDesc> currOp = op;
     List<String> currColNames = keyNames;
     List<Operator<? extends OperatorDesc>> parentOps = null;
@@ -238,26 +237,24 @@
     // along the way that changes the rows from the table through
     // joins or aggregations. Only allowed operators are selects
     // and filters.
-    while (!complexTree) {
+    while (true) {
       parentOps = currOp.getParentOperators();
       if (parentOps == null) {
-        break;
+        return (TableScanOperator) currOp;
       }
 
       if (parentOps.size() > 1 ||
          !(currOp.columnNamesRowResolvedCanBeObtained())) {
-        complexTree = true;
+        return null;
       } else {
         // Generate the map of the input->output column name for the keys
        // we are about
        if (!TableAccessAnalyzer.genColNameMap(currOp, currColNames)) {
-          complexTree = true;
+          return null;
        }
         currOp = parentOps.get(0);
       }
     }
-
-    return complexTree? null: (TableScanOperator) currOp;
   }

 /*
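
A note on the core mechanism of this patch, for readers following the optimizer changes above. Both BucketMapJoinOptimizer and SortedMergeBucketMapJoinOptimizer now call TableAccessAnalyzer.genRootTableScan() to check whether a join input that comes from a sub-query still maps onto a single table scan: starting from the operator feeding the join, they walk up the operator DAG and give up as soon as they see anything other than a simple select/filter chain. Below is a minimal, self-contained sketch of that walk; Op, ScanOp, SelectOp and FilterOp are illustrative stand-ins rather than Hive's operator classes, and the real method also remaps the join-key column names via genColNameMap at each step, which the sketch omits.

import java.util.Collections;
import java.util.List;

abstract class Op {
  private final List<Op> parents;
  Op(List<Op> parents) { this.parents = parents; }
  List<Op> parents() { return parents; }
}

// Stand-in for TableScanOperator: the root of a simple scan pipeline.
final class ScanOp extends Op {
  final String table;
  ScanOp(String table) { super(null); this.table = table; }
}

// Row-preserving operators the walk is allowed to pass through.
final class SelectOp extends Op {
  SelectOp(Op parent) { super(Collections.singletonList(parent)); }
}

final class FilterOp extends Op {
  FilterOp(Op parent) { super(Collections.singletonList(parent)); }
}

public final class RootScanSketch {
  // Walk upward while the chain stays "simple": exactly one parent and
  // only selects/filters. Returns the scan if we reach it, or null if a
  // join, group-by or union sits between the join input and the table,
  // mirroring the early returns in the rewritten genRootTableScan.
  static ScanOp rootScan(Op op) {
    Op curr = op;
    while (true) {
      List<Op> parents = curr.parents();
      if (parents == null) {
        return (ScanOp) curr;          // reached the table scan
      }
      if (parents.size() > 1 || !(curr instanceof SelectOp || curr instanceof FilterOp)) {
        return null;                   // complex sub-tree: not convertible
      }
      curr = parents.get(0);           // keep climbing the select/filter chain
    }
  }

  public static void main(String[] args) {
    // A sub-query like subq1 in the tests above boils down to scan -> select -> filter.
    Op joinInput = new FilterOp(new SelectOp(new ScanOp("tbl1")));
    System.out.println(rootScan(joinInput).table);   // prints: tbl1
  }
}

Once the scan is found, the optimizers still need a stable name for it, which is why join sources are now recorded under a qualified alias (joinCxt.getId() + ":" + alias, e.g. "subq2:subq1:b" in the plans above) and then re-resolved against ParseContext.getTopOps() before the bucketing and sort metadata of the underlying table is checked.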