diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
index cd7fb92..00c837d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
@@ -136,7 +136,7 @@ public boolean acceptLimitPushdown() {
    *
    * @return if it is an identity select operator or not
    */
-  public boolean isIdentitySelect() {
+  public boolean isIdentitySelect(boolean checkIdentical) {
     // Safety check
     if(this.getNumParent() != 1) {
       return false;
     }
@@ -175,8 +175,14 @@ public boolean isIdentitySelect() {
       return false;
     }

-    if(!origColumn.internalEquals(destColumn)) {
-      return false;
+    if (checkIdentical) {
+      if(!origColumn.equals(destColumn)) {
+        return false;
+      }
+    } else {
+      if(!origColumn.internalEquals(destColumn)) {
+        return false;
+      }
     }

     // Now we check if though the schemas are the same,
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java
index e3d3ce6..ef45c75 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java
@@ -24,9 +24,6 @@
 import java.util.Map;
 import java.util.Stack;

-import com.google.common.base.Predicates;
-import com.google.common.collect.Iterators;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.conf.HiveConf;
@@ -46,6 +43,9 @@
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;

+import com.google.common.base.Predicates;
+import com.google.common.collect.Iterators;
+
 /** This optimization tries to remove {@link SelectOperator} from tree which don't do any
  * processing except forwarding columns from its parent to its children.
  * e.g., select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
@@ -107,13 +107,25 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
       }
       Operator parent = parents.get(0);
       if (parent instanceof ReduceSinkOperator && Iterators.any(sel.getChildOperators().iterator(),
-        Predicates.instanceOf(ReduceSinkOperator.class))) {
-        // For RS-SEL-RS case. reducer operator in reducer task cannot be null in task compiler
+          Predicates.instanceOf(ReduceSinkOperator.class))) {
+        // For RS-SEL-RS case. reducer operator in reducer task cannot be null
+        // in task compiler
         return null;
       }
-      if(sel.isIdentitySelect()) {
-        parent.removeChildAndAdoptItsChildren(sel);
+      if (Iterators.any(sel.getChildOperators().iterator(),
+          Predicates.instanceOf(ReduceSinkOperator.class))) {
+        // RS data structures rely on table aliases. If a child is a RS, we need
+        // to check whether the schema of the operators is exactly the same in
+        // order to be able to remove the SEL operator
+        if (sel.isIdentitySelect(true)) {
+          LOG.debug("Identity project remover optimization removed : " + sel);
+          parent.removeChildAndAdoptItsChildren(sel);
+        }
+        return null;
+      }
+      if(sel.isIdentitySelect(false)) {
         LOG.debug("Identity project remover optimization removed : " + sel);
+        parent.removeChildAndAdoptItsChildren(sel);
       }
       return null;
     }
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index bc8d8f7..e54b3f9 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -237,7 +237,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
       Operator last = (Operator) stack.get(5);
       if (last instanceof SelectOperator) {
         SelectOperator cselOp = (SelectOperator) last;
-        if (!cselOp.isIdentitySelect()) {
+        if (!cselOp.isIdentitySelect(false)) {
           return null;  // todo we can do further by providing operator to fetch task
         }
         last = (Operator) stack.get(6);
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 58ee605..f661abb 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -9080,7 +9080,7 @@ private Operator genUnionPlan(String unionalias, String leftalias,
         leftOp.getParentOperators() != null &&
         !leftOp.getParentOperators().isEmpty() &&
         leftOp.getParentOperators().get(0) instanceof UnionOperator &&
-        ((SelectOperator)leftOp).isIdentitySelect()) ) {
+        ((SelectOperator)leftOp).isIdentitySelect(false)) ) {

       if(!(leftOp instanceof UnionOperator)) {
         Operator oldChild = leftOp;
@@ -9108,7 +9108,7 @@ private Operator genUnionPlan(String unionalias, String leftalias,
         rightOp.getParentOperators() != null &&
         !rightOp.getParentOperators().isEmpty() &&
         rightOp.getParentOperators().get(0) instanceof UnionOperator &&
-        ((SelectOperator)rightOp).isIdentitySelect()) ) {
+        ((SelectOperator)rightOp).isIdentitySelect(false)) ) {

       if(!(rightOp instanceof UnionOperator)) {
         Operator oldChild = rightOp;
diff --git ql/src/test/queries/clientpositive/join43.q ql/src/test/queries/clientpositive/join43.q
new file mode 100644
index 0000000..045882a
--- /dev/null
+++ ql/src/test/queries/clientpositive/join43.q
@@ -0,0 +1,84 @@
+create table purchase_history (s string, product string, price double, time int);
+insert into purchase_history values ('1', 'Belt', 20.00, 21);
+insert into purchase_history values ('1', 'Socks', 3.50, 31);
+insert into purchase_history values ('3', 'Belt', 20.00, 51);
+insert into purchase_history values ('4', 'Shirt', 15.50, 59);
+
+create table cart_history (s string, cart_id int, time int);
+insert into cart_history values ('1', 1, 10);
+insert into cart_history values ('1', 2, 20);
+insert into cart_history values ('1', 3, 30);
+insert into cart_history values ('1', 4, 40);
+insert into cart_history values ('3', 5, 50);
+insert into cart_history values ('4', 6, 60);
+
+create table events (s string, st2 string, n int, time int);
+insert into events values ('1', 'Bob', 1234, 20);
+insert into events values ('1', 'Bob', 1234, 30);
+insert into events values ('1', 'Bob', 1234, 25);
+insert into events values ('2', 'Sam', 1234, 30);
+insert into events values ('3', 'Jeff', 1234, 50);
+insert into events values ('4', 'Ted', 1234, 60);
+
+--[HIVE-10996] Aggregation / Projection over Multi-Join Inner Query producing incorrect results
+explain
+select s
+from (
+  select last.*, action.st2, action.n
+  from (
+    select purchase.s, purchase.time, max (mevt.time) as last_stage_time
+    from (select * from purchase_history) purchase
+    join (select * from cart_history) mevt
+    on purchase.s = mevt.s
+    where purchase.time > mevt.time
+    group by purchase.s, purchase.time
+  ) last
+  join (select * from events) action
+  on last.s = action.s and last.last_stage_time = action.time
+) list;
+
+select s
+from (
+  select last.*, action.st2, action.n
+  from (
+    select purchase.s, purchase.time, max (mevt.time) as last_stage_time
+    from (select * from purchase_history) purchase
+    join (select * from cart_history) mevt
+    on purchase.s = mevt.s
+    where purchase.time > mevt.time
+    group by purchase.s, purchase.time
+  ) last
+  join (select * from events) action
+  on last.s = action.s and last.last_stage_time = action.time
+) list;
+
+explain
+select *
+from (
+  select last.*, action.st2, action.n
+  from (
+    select purchase.s, purchase.time, max (mevt.time) as last_stage_time
+    from (select * from purchase_history) purchase
+    join (select * from cart_history) mevt
+    on purchase.s = mevt.s
+    where purchase.time > mevt.time
+    group by purchase.s, purchase.time
+  ) last
+  join (select * from events) action
+  on last.s = action.s and last.last_stage_time = action.time
+) list;
+
+select *
+from (
+  select last.*, action.st2, action.n
+  from (
+    select purchase.s, purchase.time, max (mevt.time) as last_stage_time
+    from (select * from purchase_history) purchase
+    join (select * from cart_history) mevt
+    on purchase.s = mevt.s
+    where purchase.time > mevt.time
+    group by purchase.s, purchase.time
+  ) last
+  join (select * from events) action
+  on last.s = action.s and last.last_stage_time = action.time
+) list;
diff --git ql/src/test/results/clientpositive/join43.q.out ql/src/test/results/clientpositive/join43.q.out
new file mode 100644
index 0000000..39904e0
--- /dev/null
+++ ql/src/test/results/clientpositive/join43.q.out
@@ -0,0 +1,654 @@
+PREHOOK: query: create table purchase_history (s string, product string, price double, time int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@purchase_history
+POSTHOOK: query: create table purchase_history (s string, product string, price double, time int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@purchase_history
+PREHOOK: query: insert into purchase_history values ('1', 'Belt', 20.00, 21)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@purchase_history
+POSTHOOK: query: insert into purchase_history values ('1', 'Belt', 20.00, 21)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@purchase_history
+POSTHOOK: Lineage: purchase_history.price EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ]
+POSTHOOK: Lineage: purchase_history.product SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+POSTHOOK: Lineage: purchase_history.s SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: purchase_history.time EXPRESSION
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into purchase_history values ('1', 'Socks', 3.50, 31) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@purchase_history +POSTHOOK: query: insert into purchase_history values ('1', 'Socks', 3.50, 31) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@purchase_history +POSTHOOK: Lineage: purchase_history.price EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.product SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.s SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.time EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into purchase_history values ('3', 'Belt', 20.00, 51) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__3 +PREHOOK: Output: default@purchase_history +POSTHOOK: query: insert into purchase_history values ('3', 'Belt', 20.00, 51) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__3 +POSTHOOK: Output: default@purchase_history +POSTHOOK: Lineage: purchase_history.price EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.product SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.s SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.time EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into purchase_history values ('4', 'Shirt', 15.50, 59) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__4 +PREHOOK: Output: default@purchase_history +POSTHOOK: query: insert into purchase_history values ('4', 'Shirt', 15.50, 59) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__4 +POSTHOOK: Output: default@purchase_history +POSTHOOK: Lineage: purchase_history.price EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.product SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.s SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: purchase_history.time EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: create table cart_history (s string, cart_id int, time int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@cart_history +POSTHOOK: query: create table cart_history (s string, cart_id int, time int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@cart_history +PREHOOK: query: insert into cart_history 
values ('1', 1, 10) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__5 +PREHOOK: Output: default@cart_history +POSTHOOK: query: insert into cart_history values ('1', 1, 10) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__5 +POSTHOOK: Output: default@cart_history +POSTHOOK: Lineage: cart_history.cart_id EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.s SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.time EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +PREHOOK: query: insert into cart_history values ('1', 2, 20) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__6 +PREHOOK: Output: default@cart_history +POSTHOOK: query: insert into cart_history values ('1', 2, 20) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__6 +POSTHOOK: Output: default@cart_history +POSTHOOK: Lineage: cart_history.cart_id EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.s SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.time EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +PREHOOK: query: insert into cart_history values ('1', 3, 30) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__7 +PREHOOK: Output: default@cart_history +POSTHOOK: query: insert into cart_history values ('1', 3, 30) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__7 +POSTHOOK: Output: default@cart_history +POSTHOOK: Lineage: cart_history.cart_id EXPRESSION [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.s SIMPLE [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.time EXPRESSION [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +PREHOOK: query: insert into cart_history values ('1', 4, 40) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__8 +PREHOOK: Output: default@cart_history +POSTHOOK: query: insert into cart_history values ('1', 4, 40) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__8 +POSTHOOK: Output: default@cart_history +POSTHOOK: Lineage: cart_history.cart_id EXPRESSION [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.s SIMPLE [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.time EXPRESSION [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +PREHOOK: query: insert into cart_history values ('3', 5, 50) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__9 +PREHOOK: Output: default@cart_history +POSTHOOK: query: insert into cart_history values ('3', 5, 50) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__9 +POSTHOOK: Output: default@cart_history +POSTHOOK: Lineage: 
cart_history.cart_id EXPRESSION [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.s SIMPLE [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.time EXPRESSION [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +PREHOOK: query: insert into cart_history values ('4', 6, 60) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__10 +PREHOOK: Output: default@cart_history +POSTHOOK: query: insert into cart_history values ('4', 6, 60) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__10 +POSTHOOK: Output: default@cart_history +POSTHOOK: Lineage: cart_history.cart_id EXPRESSION [(values__tmp__table__10)values__tmp__table__10.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.s SIMPLE [(values__tmp__table__10)values__tmp__table__10.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cart_history.time EXPRESSION [(values__tmp__table__10)values__tmp__table__10.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +PREHOOK: query: create table events (s string, st2 string, n int, time int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@events +POSTHOOK: query: create table events (s string, st2 string, n int, time int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@events +PREHOOK: query: insert into events values ('1', 'Bob', 1234, 20) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__11 +PREHOOK: Output: default@events +POSTHOOK: query: insert into events values ('1', 'Bob', 1234, 20) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__11 +POSTHOOK: Output: default@events +POSTHOOK: Lineage: events.n EXPRESSION [(values__tmp__table__11)values__tmp__table__11.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: events.s SIMPLE [(values__tmp__table__11)values__tmp__table__11.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: events.st2 SIMPLE [(values__tmp__table__11)values__tmp__table__11.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: events.time EXPRESSION [(values__tmp__table__11)values__tmp__table__11.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into events values ('1', 'Bob', 1234, 30) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__12 +PREHOOK: Output: default@events +POSTHOOK: query: insert into events values ('1', 'Bob', 1234, 30) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__12 +POSTHOOK: Output: default@events +POSTHOOK: Lineage: events.n EXPRESSION [(values__tmp__table__12)values__tmp__table__12.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: events.s SIMPLE [(values__tmp__table__12)values__tmp__table__12.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: events.st2 SIMPLE [(values__tmp__table__12)values__tmp__table__12.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: events.time EXPRESSION [(values__tmp__table__12)values__tmp__table__12.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into events values ('1', 'Bob', 1234, 25) +PREHOOK: type: QUERY 
+PREHOOK: Input: default@values__tmp__table__13 +PREHOOK: Output: default@events +POSTHOOK: query: insert into events values ('1', 'Bob', 1234, 25) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__13 +POSTHOOK: Output: default@events +POSTHOOK: Lineage: events.n EXPRESSION [(values__tmp__table__13)values__tmp__table__13.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: events.s SIMPLE [(values__tmp__table__13)values__tmp__table__13.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: events.st2 SIMPLE [(values__tmp__table__13)values__tmp__table__13.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: events.time EXPRESSION [(values__tmp__table__13)values__tmp__table__13.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into events values ('2', 'Sam', 1234, 30) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__14 +PREHOOK: Output: default@events +POSTHOOK: query: insert into events values ('2', 'Sam', 1234, 30) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__14 +POSTHOOK: Output: default@events +POSTHOOK: Lineage: events.n EXPRESSION [(values__tmp__table__14)values__tmp__table__14.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: events.s SIMPLE [(values__tmp__table__14)values__tmp__table__14.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: events.st2 SIMPLE [(values__tmp__table__14)values__tmp__table__14.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: events.time EXPRESSION [(values__tmp__table__14)values__tmp__table__14.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into events values ('3', 'Jeff', 1234, 50) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__15 +PREHOOK: Output: default@events +POSTHOOK: query: insert into events values ('3', 'Jeff', 1234, 50) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__15 +POSTHOOK: Output: default@events +POSTHOOK: Lineage: events.n EXPRESSION [(values__tmp__table__15)values__tmp__table__15.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: events.s SIMPLE [(values__tmp__table__15)values__tmp__table__15.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: events.st2 SIMPLE [(values__tmp__table__15)values__tmp__table__15.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: events.time EXPRESSION [(values__tmp__table__15)values__tmp__table__15.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: insert into events values ('4', 'Ted', 1234, 60) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__16 +PREHOOK: Output: default@events +POSTHOOK: query: insert into events values ('4', 'Ted', 1234, 60) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__16 +POSTHOOK: Output: default@events +POSTHOOK: Lineage: events.n EXPRESSION [(values__tmp__table__16)values__tmp__table__16.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: events.s SIMPLE [(values__tmp__table__16)values__tmp__table__16.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: events.st2 SIMPLE [(values__tmp__table__16)values__tmp__table__16.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: events.time EXPRESSION 
[(values__tmp__table__16)values__tmp__table__16.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: --[HIVE-10996] Aggregation / Projection over Multi-Join Inner Query producing incorrect results +explain +select s +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action + on last.s = action.s and last.last_stage_time = action.time +) list +PREHOOK: type: QUERY +POSTHOOK: query: --[HIVE-10996] Aggregation / Projection over Multi-Join Inner Query producing incorrect results +explain +select s +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action + on last.s = action.s and last.last_stage_time = action.time +) list +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: purchase_history + Statistics: Num rows: 4 Data size: 57 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: s is not null (type: boolean) + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s (type: string), time (type: int) + outputColumnNames: _col0, _col3 + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + value expressions: _col3 (type: int) + TableScan + alias: cart_history + Statistics: Num rows: 6 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: s is not null (type: boolean) + Statistics: Num rows: 3 Data size: 18 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s (type: string), time (type: int) + outputColumnNames: _col0, _col2 + Statistics: Num rows: 3 Data size: 18 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 18 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col3, _col6 + Statistics: Num rows: 3 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (_col3 > _col6) (type: boolean) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col6) + keys: _col0 (type: string), _col3 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + File Output Operator + 
compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: int) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: string), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: _col2 is not null (type: boolean) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col2 (type: int) + outputColumnNames: _col0, _col2 + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string), _col2 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col2 (type: int) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + TableScan + alias: events + Statistics: Num rows: 6 Data size: 79 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (s is not null and time is not null) (type: boolean) + Statistics: Num rows: 2 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s (type: string), time (type: int) + outputColumnNames: _col0, _col3 + Statistics: Num rows: 2 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col3 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col3 (type: int) + Statistics: Num rows: 2 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col2 (type: int) + 1 _col0 (type: string), _col3 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select s +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action 
+ on last.s = action.s and last.last_stage_time = action.time +) list +PREHOOK: type: QUERY +PREHOOK: Input: default@cart_history +PREHOOK: Input: default@events +PREHOOK: Input: default@purchase_history +#### A masked pattern was here #### +POSTHOOK: query: select s +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action + on last.s = action.s and last.last_stage_time = action.time +) list +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cart_history +POSTHOOK: Input: default@events +POSTHOOK: Input: default@purchase_history +#### A masked pattern was here #### +1 +1 +3 +PREHOOK: query: explain +select * +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action + on last.s = action.s and last.last_stage_time = action.time +) list +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action + on last.s = action.s and last.last_stage_time = action.time +) list +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: purchase_history + Statistics: Num rows: 4 Data size: 57 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: s is not null (type: boolean) + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s (type: string), time (type: int) + outputColumnNames: _col0, _col3 + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + value expressions: _col3 (type: int) + TableScan + alias: cart_history + Statistics: Num rows: 6 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: s is not null (type: boolean) + Statistics: Num rows: 3 Data size: 18 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s (type: string), time (type: int) + outputColumnNames: _col0, _col2 + Statistics: Num rows: 3 Data size: 18 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 18 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: 
+ 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col3, _col6 + Statistics: Num rows: 3 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (_col3 > _col6) (type: boolean) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col6) + keys: _col0 (type: string), _col3 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: int) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: string), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: _col2 is not null (type: boolean) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string), _col2 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col2 (type: int) + Statistics: Num rows: 1 Data size: 6 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: events + Statistics: Num rows: 6 Data size: 79 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (s is not null and time is not null) (type: boolean) + Statistics: Num rows: 2 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: s (type: string), st2 (type: string), n (type: int), time (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col3 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col3 (type: int) + Statistics: Num rows: 2 Data size: 26 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col2 (type: int) + 1 _col0 (type: string), _col3 (type: int) + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + Select Operator 
+ expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col4 (type: string), _col5 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 28 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action + on last.s = action.s and last.last_stage_time = action.time +) list +PREHOOK: type: QUERY +PREHOOK: Input: default@cart_history +PREHOOK: Input: default@events +PREHOOK: Input: default@purchase_history +#### A masked pattern was here #### +POSTHOOK: query: select * +from ( + select last.*, action.st2, action.n + from ( + select purchase.s, purchase.time, max (mevt.time) as last_stage_time + from (select * from purchase_history) purchase + join (select * from cart_history) mevt + on purchase.s = mevt.s + where purchase.time > mevt.time + group by purchase.s, purchase.time + ) last + join (select * from events) action + on last.s = action.s and last.last_stage_time = action.time +) list +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cart_history +POSTHOOK: Input: default@events +POSTHOOK: Input: default@purchase_history +#### A masked pattern was here #### +1 21 20 Bob 1234 +1 31 30 Bob 1234 +3 51 50 Jeff 1234
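Note (not part of the patch): the effect of the new checkIdentical flag is easiest to see in isolation. Below is a minimal, self-contained Java sketch of the two comparison modes the patch switches between: a relaxed per-column check when the SELECT feeds ordinary operators, and a strict check that also matches the table alias when a child is a ReduceSinkOperator, since RS data structures rely on table aliases. The Col class and both helper methods are hypothetical stand-ins for illustration only; they are not Hive's ColumnInfo API, and the exact set of fields that ColumnInfo.equals() and internalEquals() compare is not reproduced here.

// Illustrative only -- not part of the patch. "Col", "sameInternally" and
// "sameExactly" are hypothetical stand-ins, NOT Hive's ColumnInfo API.
import java.util.List;
import java.util.Objects;

public class IdentityCheckSketch {

  // Simplified stand-in for a column descriptor: internal name, type, table alias.
  static final class Col {
    final String internalName;
    final String type;
    final String tabAlias;

    Col(String internalName, String type, String tabAlias) {
      this.internalName = internalName;
      this.type = type;
      this.tabAlias = tabAlias;
    }
  }

  // Relaxed check (analogous in spirit to the internalEquals() path,
  // taken when checkIdentical == false): the table alias is ignored.
  static boolean sameInternally(Col a, Col b) {
    return Objects.equals(a.internalName, b.internalName)
        && Objects.equals(a.type, b.type);
  }

  // Strict check (analogous in spirit to the equals() path, taken when
  // checkIdentical == true, i.e. when a child is a ReduceSinkOperator whose
  // data structures rely on table aliases): the alias must match as well.
  static boolean sameExactly(Col a, Col b) {
    return sameInternally(a, b) && Objects.equals(a.tabAlias, b.tabAlias);
  }

  // Column-by-column schema comparison, mirroring the shape of the loop in
  // SelectOperator.isIdentitySelect(boolean checkIdentical).
  static boolean isIdentityProjection(List<Col> parentSchema, List<Col> selSchema,
      boolean checkIdentical) {
    if (parentSchema.size() != selSchema.size()) {
      return false;
    }
    for (int i = 0; i < parentSchema.size(); i++) {
      Col orig = parentSchema.get(i);
      Col dest = selSchema.get(i);
      boolean same = checkIdentical ? sameExactly(orig, dest) : sameInternally(orig, dest);
      if (!same) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    // Same internal name and type, different table alias ("last" vs "list").
    List<Col> parent = List.of(new Col("_col0", "string", "last"));
    List<Col> select = List.of(new Col("_col0", "string", "list"));

    System.out.println(isIdentityProjection(parent, select, false)); // true  -> SEL could be dropped
    System.out.println(isIdentityProjection(parent, select, true));  // false -> SEL is kept
  }
}

Running the sketch prints true and then false for two schemas that differ only in table alias, which is the situation in which IdentityProjectRemover now keeps the SELECT operator in front of a ReduceSinkOperator instead of removing it.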