Index: ql/src/test/results/clientpositive/ppd_outer_join4.q.out =================================================================== --- ql/src/test/results/clientpositive/ppd_outer_join4.q.out (revision 1163875) +++ ql/src/test/results/clientpositive/ppd_outer_join4.q.out (working copy) @@ -70,18 +70,22 @@ c TableScan alias: c - Reduce Output Operator - key expressions: - expr: key - type: string - sort order: + - Map-reduce partition columns: - expr: key - type: string - tag: 2 - value expressions: - expr: key - type: string + Filter Operator + predicate: + expr: (sqrt(key) <> 13) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 2 + value expressions: + expr: key + type: string Reduce Operator Tree: Join Operator condition map: @@ -134,7 +138,7 @@ WHERE a.key > '10' AND a.key < '20' AND b.key > '15' AND b.key < '25' AND sqrt(c.key) <> 13 PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/uc/ucuNeMAVGQGzy3459D8z2+++Z0Q/-Tmp-/amarsri/hive_2011-03-22_02-38-11_041_8830294243573092446/-mr-10000 +PREHOOK: Output: file:/var/folders/nt/ng21tg0n1jl4547lw0k8lg6hq_nw87/T/charleschen/hive_2011-08-31_17-04-03_652_7389043450109466394/-mr-10000 POSTHOOK: query: FROM src a LEFT OUTER JOIN @@ -147,7 +151,7 @@ WHERE a.key > '10' AND a.key < '20' AND b.key > '15' AND b.key < '25' AND sqrt(c.key) <> 13 POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/uc/ucuNeMAVGQGzy3459D8z2+++Z0Q/-Tmp-/amarsri/hive_2011-03-22_02-38-11_041_8830294243573092446/-mr-10000 +POSTHOOK: Output: file:/var/folders/nt/ng21tg0n1jl4547lw0k8lg6hq_nw87/T/charleschen/hive_2011-08-31_17-04-03_652_7389043450109466394/-mr-10000 150 val_150 150 val_150 150 152 val_152 152 val_152 152 152 val_152 152 val_152 152 @@ -450,18 +454,22 @@ c TableScan alias: c - Reduce Output Operator - key expressions: - expr: key - type: string - sort order: + - Map-reduce partition columns: - expr: key - type: string - tag: 2 - value expressions: - expr: key - type: string + Filter Operator + predicate: + expr: (sqrt(key) <> 13) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 2 + value expressions: + expr: key + type: string Reduce Operator Tree: Join Operator condition map: @@ -475,7 +483,7 @@ outputColumnNames: _col0, _col1, _col4, _col5, _col8 Filter Operator predicate: - expr: ((((_col4 > '15') and (_col4 < '25')) and (sqrt(_col8) <> 13)) and ((_col0 > '10') and (_col0 < '20'))) + expr: (((_col4 > '15') and (_col4 < '25')) and ((_col0 > '10') and (_col0 < '20'))) type: boolean Select Operator expressions: @@ -514,7 +522,7 @@ WHERE a.key > '10' AND a.key < '20' AND b.key > '15' AND b.key < '25' AND sqrt(c.key) <> 13 PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/uc/ucuNeMAVGQGzy3459D8z2+++Z0Q/-Tmp-/amarsri/hive_2011-03-22_02-38-20_602_416360783321217123/-mr-10000 +PREHOOK: Output: file:/var/folders/nt/ng21tg0n1jl4547lw0k8lg6hq_nw87/T/charleschen/hive_2011-08-31_17-04-10_850_6246519718607931090/-mr-10000 POSTHOOK: query: FROM src a LEFT OUTER JOIN @@ -527,7 +535,7 @@ WHERE a.key > '10' AND a.key < '20' AND b.key > '15' AND b.key < '25' AND sqrt(c.key) <> 13 POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/uc/ucuNeMAVGQGzy3459D8z2+++Z0Q/-Tmp-/amarsri/hive_2011-03-22_02-38-20_602_416360783321217123/-mr-10000 +POSTHOOK: Output: file:/var/folders/nt/ng21tg0n1jl4547lw0k8lg6hq_nw87/T/charleschen/hive_2011-08-31_17-04-10_850_6246519718607931090/-mr-10000 150 val_150 150 val_150 150 152 val_152 152 val_152 152 152 val_152 152 val_152 152 Index: ql/src/test/results/clientpositive/ppd_outer_join5.q.out =================================================================== --- ql/src/test/results/clientpositive/ppd_outer_join5.q.out (revision 0) +++ ql/src/test/results/clientpositive/ppd_outer_join5.q.out (revision 0) @@ -0,0 +1,412 @@ +PREHOOK: query: create table t1 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t1 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t1 +PREHOOK: query: create table t2 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t2 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t2 +PREHOOK: query: create table t3 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t3 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t3 +PREHOOK: query: create table t4 (id int, key string, value string) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table t4 (id int, key string, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@t4 +PREHOOK: query: explain select * from t1 full outer join t2 on t1.id=t2.id join t3 on t2.id=t3.id where t3.id=20 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 full outer join t2 on t1.id=t2.id join t3 on t2.id=t3.id where t3.id=20 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_FULLOUTERJOIN (TOK_TABREF (TOK_TABNAME t1)) (TOK_TABREF (TOK_TABNAME t2)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t2) id))) (TOK_TABREF (TOK_TABNAME t3)) (= (. (TOK_TABLE_OR_COL t2) id) (. (TOK_TABLE_OR_COL t3) id)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL t3) id) 20)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 0 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t2 + TableScan + alias: t2 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 1 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t3 + TableScan + alias: t3 + Filter Operator + predicate: + expr: (id = 20) + type: boolean + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 2 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + Inner Join 1 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 2 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col10, _col11, _col12 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: string + expr: _col5 + type: int + expr: _col6 + type: string + expr: _col7 + type: string + expr: _col10 + type: int + expr: _col11 + type: string + expr: _col12 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t2.id=t3.id) where t2.id=20 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t2.id=t3.id) where t2.id=20 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME t1)) (TOK_TABREF (TOK_TABNAME t2)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t2) id))) (TOK_TABREF (TOK_TABNAME t3)) (= (. (TOK_TABLE_OR_COL t2) id) (. (TOK_TABLE_OR_COL t3) id)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL t2) id) 20)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 0 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t2 + TableScan + alias: t2 + Filter Operator + predicate: + expr: (id = 20) + type: boolean + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 1 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t3 + TableScan + alias: t3 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 2 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Left Outer Join1 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 2 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col10, _col11, _col12 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: string + expr: _col5 + type: int + expr: _col6 + type: string + expr: _col7 + type: string + expr: _col10 + type: int + expr: _col11 + type: string + expr: _col12 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t1.id=t3.id) where t2.id=20 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t1.id=t3.id) where t2.id=20 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME t1)) (TOK_TABREF (TOK_TABNAME t2)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t2) id))) (TOK_TABREF (TOK_TABNAME t3)) (= (. (TOK_TABLE_OR_COL t1) id) (. (TOK_TABLE_OR_COL t3) id)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL t2) id) 20)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t1 + TableScan + alias: t1 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 0 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t2 + TableScan + alias: t2 + Filter Operator + predicate: + expr: (id = 20) + type: boolean + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 1 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + t3 + TableScan + alias: t3 + Reduce Output Operator + key expressions: + expr: id + type: int + sort order: + + Map-reduce partition columns: + expr: id + type: int + tag: 2 + value expressions: + expr: id + type: int + expr: key + type: string + expr: value + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Left Outer Join0 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 1 {VALUE._col0} {VALUE._col1} {VALUE._col2} + 2 {VALUE._col0} {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col10, _col11, _col12 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: string + expr: _col5 + type: int + expr: _col6 + type: string + expr: _col7 + type: string + expr: _col10 + type: int + expr: _col11 + type: string + expr: _col12 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: drop table t1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: drop table t1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: drop table t2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t2 +PREHOOK: Output: default@t2 +POSTHOOK: query: drop table t2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@t2 +PREHOOK: query: drop table t3 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t3 +PREHOOK: Output: default@t3 +POSTHOOK: query: drop table t3 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t3 +POSTHOOK: Output: default@t3 +PREHOOK: query: drop table t4 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t4 +PREHOOK: Output: default@t4 +POSTHOOK: query: drop table t4 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t4 +POSTHOOK: Output: default@t4 Index: ql/src/test/queries/clientpositive/ppd_outer_join5.q =================================================================== --- ql/src/test/queries/clientpositive/ppd_outer_join5.q (revision 0) +++ ql/src/test/queries/clientpositive/ppd_outer_join5.q (revision 0) @@ -0,0 +1,16 @@ +set hive.optimize.ppd=true; +set hive.ppd.remove.duplicatefilters=true; + +create table t1 (id int, key string, value string); +create table t2 (id int, key string, value string); +create table t3 (id int, key string, value string); +create table t4 (id int, key string, value string); + +explain select * from t1 full outer join t2 on t1.id=t2.id join t3 on t2.id=t3.id where t3.id=20; +explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t2.id=t3.id) where t2.id=20; +explain select * from t1 join t2 on (t1.id=t2.id) left outer join t3 on (t1.id=t3.id) where t2.id=20; + +drop table t1; +drop table t2; +drop table t3; +drop table t4; \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (revision 1163875) +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (working copy) @@ -270,19 +270,22 @@ /** * Figures out the aliases for whom it is safe to push predicates based on - * ANSI SQL semantics For inner join, all predicates for all aliases can be - * pushed For full outer join, none of the predicates can be pushed as that - * would limit the number of rows for join For left outer join, all the - * predicates on the left side aliases can be pushed up For right outer - * join, all the predicates on the right side aliases can be pushed up Joins - * chain containing both left and right outer joins are treated as full - * outer join. TODO: further optimization opportunity for the case a.c1 = - * b.c1 and b.c2 = c.c2 a and b are first joined and then the result with c. - * But the second join op currently treats a and b as separate aliases and - * thus disallowing predicate expr containing both tables a and b (such as - * a.c3 + a.c4 > 20). Such predicates also can be pushed just above the - * second join and below the first join + * ANSI SQL semantics. The join conditions are left associative so "a + * RIGHT OUTER JOIN b LEFT OUTER JOIN c INNER JOIN d" is interpreted as + * "((a RIGHT OUTER JOIN b) LEFT OUTER JOIN c) INNER JOIN d". For inner + * joins, both the left and right join subexpressions are considered for + * pushing down aliases, for the right outer join, the right subexpression + * is considered and the left ignored and for the left outer join, the + * left subexpression is considered and the left ignored. Here, aliases b + * and d are eligible to be pushed up. * + * TODO: further optimization opportunity for the case a.c1 = b.c1 and b.c2 + * = c.c2 a and b are first joined and then the result with c. But the + * second join op currently treats a and b as separate aliases and thus + * disallowing predicate expr containing both tables a and b (such as a.c3 + * + a.c4 > 20). Such predicates also can be pushed just above the second + * join and below the first join + * * @param op * Join Operator * @param rr @@ -291,41 +294,24 @@ */ private Set getQualifiedAliases(JoinOperator op, RowResolver rr) { Set aliases = new HashSet(); - int loj = Integer.MAX_VALUE; - int roj = -1; - boolean oj = false; JoinCondDesc[] conds = op.getConf().getConds(); Map> posToAliasMap = op.getPosToAliasMap(); - for (JoinCondDesc jc : conds) { - if (jc.getType() == JoinDesc.FULL_OUTER_JOIN) { - oj = true; + int i; + for (i=conds.length-1; i>=0; i--){ + if (conds[i].getType() == JoinDesc.INNER_JOIN) { + aliases.addAll(posToAliasMap.get(i+1)); + } else if (conds[i].getType() == JoinDesc.FULL_OUTER_JOIN) { break; - } else if (jc.getType() == JoinDesc.LEFT_OUTER_JOIN) { - if (jc.getLeft() < loj) { - loj = jc.getLeft(); - } - } else if (jc.getType() == JoinDesc.RIGHT_OUTER_JOIN) { - if (jc.getRight() > roj) { - roj = jc.getRight(); - } + } else if (conds[i].getType() == JoinDesc.RIGHT_OUTER_JOIN) { + aliases.addAll(posToAliasMap.get(i+1)); + break; + } else if (conds[i].getType() == JoinDesc.LEFT_OUTER_JOIN) { + continue; } } - if (oj || (loj != Integer.MAX_VALUE && roj != -1)) { - return aliases; + if(i == -1){ + aliases.addAll(posToAliasMap.get(0)); } - for (Entry> pa : posToAliasMap.entrySet()) { - if (loj != Integer.MAX_VALUE) { - if (pa.getKey() <= loj) { - aliases.addAll(pa.getValue()); - } - } else if (roj != -1) { - if (pa.getKey() >= roj) { - aliases.addAll(pa.getValue()); - } - } else { - aliases.addAll(pa.getValue()); - } - } Set aliases2 = rr.getTableNames(); aliases.retainAll(aliases2); return aliases;