Index: ql/src/test/results/clientpositive/ppd_outer_join5.q.out =================================================================== --- ql/src/test/results/clientpositive/ppd_outer_join5.q.out (revision 0) +++ ql/src/test/results/clientpositive/ppd_outer_join5.q.out (revision 0) @@ -0,0 +1,412 @@ +PREHOOK: query: create table t1 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t1 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t1 +PREHOOK: query: create table t2 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t2 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t2 +PREHOOK: query: create table t3 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t3 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t3 +PREHOOK: query: create table t4 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t4 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t4 +PREHOOK: query: explain select * from t1 full outer join t2 on t1.id=t2.id join t3 on t2.id=t3.id where t3.id=20 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 full outer join t2 on t1.id=t2.id join t3 on t2.id=t3.id where t3.id=20 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_FULLOUTERJOIN (TOK_TABREF (TOK_TABNAME t1)) (TOK_TABREF (TOK_TABNAME t2)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t2) id))) (TOK_TABREF (TOK_TABNAME t3)) (= (. (TOK_TABLE_OR_COL t2) id) (. (TOK_TABLE_OR_COL t3) id)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL t3) id) 20)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 0 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t2 + TableScan + alias: t2 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 1 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t3 + TableScan + alias: t3 + Filter Operator + predicate: + expr: (id = 20) + type: boolean + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 2 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + Inner Join 1 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 2 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col10, _col11, _col12 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: string + expr: _col5 + type: int + expr: _col6 + type: string + expr: _col7 + type: string + expr: _col10 + type: int + expr: _col11 + type: string + expr: _col12 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t2.id=t3.id) where t2.id=20 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t2.id=t3.id) where t2.id=20 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME t1)) (TOK_TABREF (TOK_TABNAME t2)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t2) id))) (TOK_TABREF (TOK_TABNAME t3)) (= (. (TOK_TABLE_OR_COL t2) id) (. (TOK_TABLE_OR_COL t3) id)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL t2) id) 20)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 0 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t2 + TableScan + alias: t2 + Filter Operator + predicate: + expr: (id = 20) + type: boolean + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 1 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t3 + TableScan + alias: t3 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 2 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Left Outer Join1 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 2 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col10, _col11, _col12 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: string + expr: _col5 + type: int + expr: _col6 + type: string + expr: _col7 + type: string + expr: _col10 + type: int + expr: _col11 + type: string + expr: _col12 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t1.id=t3.id) where t2.id=20 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t1.id=t3.id) where t2.id=20 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME t1)) (TOK_TABREF (TOK_TABNAME t2)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t2) id))) (TOK_TABREF (TOK_TABNAME t3)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t3) id)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL t2) id) 20)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 0 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t2 + TableScan + alias: t2 + Filter Operator + predicate: + expr: (id = 20) + type: boolean + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 1 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t3 + TableScan + alias: t3 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 2 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Left Outer Join0 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 2 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col10, _col11, _col12 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: string + expr: _col5 + type: int + expr: _col6 + type: string + expr: _col7 + type: string + expr: _col10 + type: int + expr: _col11 + type: string + expr: _col12 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: drop table t1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: drop table t1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: drop table t2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t2 +PREHOOK: Output: default@t2 +POSTHOOK: query: drop table t2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@t2 +PREHOOK: query: drop table t3 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t3 +PREHOOK: Output: default@t3 +POSTHOOK: query: drop table t3 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@t3 +PREHOOK: query: drop table t4 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t4 +PREHOOK: Output: default@t4 +POSTHOOK: query: drop table t4 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t4 +POSTHOOK: Output: default@t4 Index: ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (revision 1153598) +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (working copy) @@ -23,9 +23,9 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.Stack; -import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -43,7 +43,6 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; -import org.apache.hadoop.hive.ql.metadata.HiveUtils; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.RowResolver; @@ -230,19 +229,22 @@ /** * Figures out the aliases for whom it is safe to push predicates based on - * ANSI SQL semantics For inner join, all predicates for all aliases can be - * pushed For full outer join, none of the predicates can be pushed as that - * would limit the number of rows for join For left outer join, all the - * predicates on the left side aliases can be pushed up For right outer - * join, all the predicates on the right side aliases can be pushed up Joins - * chain containing both left and right outer joins are treated as full - * outer join. TODO: further optimization opportunity for the case a.c1 = - * b.c1 and b.c2 = c.c2 a and b are first joined and then the result with c. - * But the second join op currently treats a and b as separate aliases and - * thus disallowing predicate expr containing both tables a and b (such as - * a.c3 + a.c4 > 20). Such predicates also can be pushed just above the - * second join and below the first join + * ANSI SQL semantics. The join conditions are left associative so "a + * RIGHT OUTER JOIN b LEFT OUTER JOIN cÊINNER JOIN d" is interpreted as + * "((a RIGHT OUTER JOIN b) LEFT OUTER JOIN c) INNER JOIN d". For inner + * joins, both the left and right join subexpressions are considered for + * pushing down aliases, for the right outer join, the right subexpression + * is considered and the left ignored and for the left outer join, the + * left subexpression is considered and the left ignored. Here, aliases b + * and d are eligible to be pushed up. * + * TODO: further optimization opportunity for the case a.c1 = b.c1 and b.c2 + * = c.c2 a and b are first joined and then the result with c. But the + * second join op currently treats a and b as separate aliases and thus + * disallowing predicate expr containing both tables a and b (such as a.c3 + * + a.c4 > 20). Such predicates also can be pushed just above the second + * join and below the first join + * * @param op * Join Operator * @param rr @@ -251,41 +253,24 @@ */ private Set getQualifiedAliases(JoinOperator op, RowResolver rr) { Set aliases = new HashSet(); - int loj = Integer.MAX_VALUE; - int roj = -1; - boolean oj = false; JoinCondDesc[] conds = op.getConf().getConds(); Map> posToAliasMap = op.getPosToAliasMap(); - for (JoinCondDesc jc : conds) { - if (jc.getType() == JoinDesc.FULL_OUTER_JOIN) { - oj = true; + int i; + for (i=conds.length-1; i>=0; i--){ + if (conds[i].getType() == JoinDesc.INNER_JOIN) { + aliases.addAll(posToAliasMap.get(i+1)); + } else if (conds[i].getType() == JoinDesc.FULL_OUTER_JOIN) { break; - } else if (jc.getType() == JoinDesc.LEFT_OUTER_JOIN) { - if (jc.getLeft() < loj) { - loj = jc.getLeft(); - } - } else if (jc.getType() == JoinDesc.RIGHT_OUTER_JOIN) { - if (jc.getRight() > roj) { - roj = jc.getRight(); - } + } else if (conds[i].getType() == JoinDesc.RIGHT_OUTER_JOIN) { + aliases.addAll(posToAliasMap.get(i+1)); + break; + } else if (conds[i].getType() == JoinDesc.LEFT_OUTER_JOIN) { + continue; } } - if (oj || (loj != Integer.MAX_VALUE && roj != -1)) { - return aliases; + if(i == -1){ + aliases.addAll(posToAliasMap.get(0)); } - for (Entry> pa : posToAliasMap.entrySet()) { - if (loj != Integer.MAX_VALUE) { - if (pa.getKey() <= loj) { - aliases.addAll(pa.getValue()); - } - } else if (roj != -1) { - if (pa.getKey() >= roj) { - aliases.addAll(pa.getValue()); - } - } else { - aliases.addAll(pa.getValue()); - } - } Set aliases2 = rr.getTableNames(); aliases.retainAll(aliases2); return aliases;