diff --git ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java index e7db370..7c8f8d7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java @@ -77,9 +77,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFRank.GenericUDAFRankEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; -import org.apache.hadoop.hive.ql.udf.ptf.WindowingTableFunction; import org.apache.hadoop.hive.serde2.Deserializer; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.mapred.JobConf; @@ -196,7 +194,7 @@ private void pushRankLimit(PTFOperator ptfOp, OpWalkerInfo owi) throws SemanticE return; } - ExprWalkerInfo childInfo = getChildWalkerInfo((Operator) ptfOp, owi); + ExprWalkerInfo childInfo = getChildWalkerInfo(ptfOp, owi); if (childInfo == null) { return; @@ -411,16 +409,18 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { LOG.info("Processing for " + nd.getName() + "(" + ((Operator) nd).getIdentifier() + ")"); + OpWalkerInfo owi = (OpWalkerInfo) procCtx; - Operator op = - (Operator) nd; - ExprNodeDesc predicate = (((FilterOperator) nd).getConf()).getPredicate(); - ExprWalkerInfo ewi = new ExprWalkerInfo(); + Operator op = (Operator) nd; + + // if this filter is generated one, predicates need not to be extracted + ExprWalkerInfo ewi = owi.getPrunedPreds(op); // Don't push a sampling predicate since createFilter() always creates filter // with isSamplePred = false. Also, the filterop with sampling pred is always // a child of TableScan, so there is no need to push this predicate. - if (!((FilterOperator)op).getConf().getIsSamplingPred()) { + if (ewi == null && !((FilterOperator)op).getConf().getIsSamplingPred()) { // get pushdown predicates for this operator's predicate + ExprNodeDesc predicate = (((FilterOperator) nd).getConf()).getPredicate(); ewi = ExprWalkerProcFactory.extractPushdownPreds(owi, op, predicate); if (!ewi.isDeterministic()) { /* predicate is not deterministic */ @@ -964,6 +964,12 @@ protected static Object createFilter(Operator op, } owi.getCandidateFilterOps().clear(); } + // push down current ppd context to newly added filter + ExprWalkerInfo walkerInfo = owi.getPrunedPreds(op); + if (walkerInfo != null) { + walkerInfo.getNonFinalCandidates().clear(); + owi.putPrunedPreds(output, walkerInfo); + } return output; } @@ -1048,7 +1054,7 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( tableScanDesc.setFilterExpr(decomposed.pushedPredicate); tableScanDesc.setFilterObject(decomposed.pushedPredicateObject); - return (ExprNodeGenericFuncDesc)decomposed.residualPredicate; + return decomposed.residualPredicate; } public static NodeProcessor getFilterProc() { diff --git ql/src/test/queries/clientpositive/ppd_join5.q ql/src/test/queries/clientpositive/ppd_join5.q new file mode 100644 index 0000000..e1675d3 --- /dev/null +++ ql/src/test/queries/clientpositive/ppd_join5.q @@ -0,0 +1,24 @@ +create table t1 (id1 string, id2 string); +create table t2 (id string, d int); + +from src tablesample (1 rows) + insert into table t1 select 'a','a' + insert into table t2 select 'a',2; + +explain +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1; + +explain +select * from ( +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +) z where d1 > 1 or d2 > 1; + +select * from ( +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +) z where d1 > 1 or d2 > 1; diff --git ql/src/test/results/clientpositive/ppd_join5.q.out ql/src/test/results/clientpositive/ppd_join5.q.out new file mode 100644 index 0000000..1559ad8 --- /dev/null +++ ql/src/test/results/clientpositive/ppd_join5.q.out @@ -0,0 +1,266 @@ +PREHOOK: query: create table t1 (id1 string, id2 string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table t1 (id1 string, id2 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1 +PREHOOK: query: create table t2 (id string, d int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table t2 (id string, d int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2 +PREHOOK: query: from src tablesample (1 rows) + insert into table t1 select 'a','a' + insert into table t2 select 'a',2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@t1 +PREHOOK: Output: default@t2 +POSTHOOK: query: from src tablesample (1 rows) + insert into table t1 select 'a','a' + insert into table t2 select 'a',2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@t1 +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t1.id1 SIMPLE [] +POSTHOOK: Lineage: t1.id2 SIMPLE [] +POSTHOOK: Lineage: t2.d SIMPLE [] +POSTHOOK: Lineage: t2.id SIMPLE [] +Warning: Shuffle Join JOIN[10][tables = [a, b, c]] in Stage 'Stage-2:MAPRED' is a cross product +PREHOOK: query: explain +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id is not null and (d <= 1)) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: id (type: string), id (type: string) + sort order: ++ + Map-reduce partition columns: id (type: string), id (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: d (type: int) + TableScan + alias: a + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id1 is not null and id2 is not null) (type: boolean) + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: id1 (type: string), id2 (type: string) + sort order: ++ + Map-reduce partition columns: id1 (type: string), id2 (type: string) + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} + 1 {VALUE._col0} + outputColumnNames: _col0, _col1, _col5 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col5 (type: int) + TableScan + alias: c + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (d <= 1) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: d (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col5} + 1 {VALUE._col1} + outputColumnNames: _col0, _col1, _col5, _col9 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: int), _col9 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[10][tables = [a, b, c]] in Stage 'Stage-2:MAPRED' is a cross product +PREHOOK: query: explain +select * from ( +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +) z where d1 > 1 or d2 > 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from ( +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +) z where d1 > 1 or d2 > 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id1 is not null and id2 is not null) (type: boolean) + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: id1 (type: string), id2 (type: string) + sort order: ++ + Map-reduce partition columns: id1 (type: string), id2 (type: string) + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + TableScan + alias: b + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id is not null and (d <= 1)) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: id (type: string), id (type: string) + sort order: ++ + Map-reduce partition columns: id (type: string), id (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: d (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} + 1 {VALUE._col0} + outputColumnNames: _col0, _col1, _col5 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col5 (type: int) + TableScan + alias: c + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (d <= 1) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: d (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col5} + 1 {VALUE._col1} + outputColumnNames: _col0, _col1, _col5, _col9 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((_col5 > 1) or (_col9 > 1)) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: int), _col9 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[10][tables = [a, b, c]] in Stage 'Stage-2:MAPRED' is a cross product +PREHOOK: query: select * from ( +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +) z where d1 > 1 or d2 > 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from ( +select a.*,b.d d1,c.d d2 from + t1 a join t2 b on (a.id1 = b.id) + join t2 c on (a.id2 = b.id) where b.d <= 1 and c.d <= 1 +) z where d1 > 1 or d2 > 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here ####