Index: ql/src/java/org/apache/hadoop/hive/ql/ppd/ExprWalkerProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/ppd/ExprWalkerProcFactory.java (revision 1153812) +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/ExprWalkerProcFactory.java (working copy) @@ -51,7 +51,7 @@ * pushdown optimization for the given operator */ public final class ExprWalkerProcFactory { - + private static final Log LOG = LogFactory .getLog(ExprWalkerProcFactory.class.getName()); @@ -73,6 +73,7 @@ Operator op = ctx.getOp(); String[] colAlias = toRR.reverseLookup(colref.getColumn()); + boolean isCandidate = true; if (op.getColumnExprMap() != null) { // replace the output expression with the input expression so that // parent op can understand this expression @@ -82,9 +83,19 @@ // group by ctx.setIsCandidate(colref, false); return false; + } else { + if (exp instanceof ExprNodeGenericFuncDesc) { + GraphWalker egw = createExprWalker(ctx); + List preds = new ArrayList(); + preds.add(exp); + List clonedPreds = new ArrayList(); + ExprNodeDesc clonedExpr = exp.clone(); + clonedPreds.add(clonedExpr); + walkExprWalker(egw, preds, clonedPreds); + isCandidate = ctx.isCandidate(clonedExpr); + } } ctx.addConvertedNode(colref, exp); - ctx.setIsCandidate(exp, true); ctx.addAlias(exp, colAlias[0]); } else { if (colAlias == null) { @@ -92,8 +103,8 @@ } ctx.addAlias(colref, colAlias[0]); } - ctx.setIsCandidate(colref, true); - return true; + ctx.setIsCandidate(colref, isCandidate); + return isCandidate; } } @@ -234,7 +245,7 @@ /** * Extracts pushdown predicates from the given list of predicate expression. - * + * * @param opContext * operator context used for resolving column references * @param op @@ -250,6 +261,23 @@ ExprWalkerInfo exprContext = new ExprWalkerInfo(op, opContext .getRowResolver(op)); + GraphWalker egw =createExprWalker(exprContext); + List clonedPreds = new ArrayList(); + for (ExprNodeDesc node : preds) { + clonedPreds.add(node.clone()); + } + + walkExprWalker(egw, preds, clonedPreds); + + HiveConf conf = opContext.getParseContext().getConf(); + // check the root expression for final candidates + for (ExprNodeDesc pred : clonedPreds) { + extractFinalCandidates(pred, exprContext, conf); + } + return exprContext; + } + + private static GraphWalker createExprWalker(ExprWalkerInfo exprContext) { // create a walker which walks the tree in a DFS manner while maintaining // the operator stack. The dispatcher // generates the plan from the operator tree @@ -267,25 +295,16 @@ // rule and passes the context along Dispatcher disp = new DefaultRuleDispatcher(getDefaultExprProcessor(), exprRules, exprContext); - GraphWalker egw = new DefaultGraphWalker(disp); + return new DefaultGraphWalker(disp); + } + private static void walkExprWalker(GraphWalker egw, List preds, List clonedPreds) + throws SemanticException { List startNodes = new ArrayList(); - List clonedPreds = new ArrayList(); - for (ExprNodeDesc node : preds) { - clonedPreds.add(node.clone()); - } startNodes.addAll(clonedPreds); egw.startWalking(startNodes, null); - - HiveConf conf = opContext.getParseContext().getConf(); - // check the root expression for final candidates - for (ExprNodeDesc pred : clonedPreds) { - extractFinalCandidates(pred, exprContext, conf); - } - return exprContext; } - /** * Walks through the top AND nodes and determine which of them are final * candidates. Index: ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (revision 1153812) +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java (working copy) @@ -43,7 +43,6 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; -import org.apache.hadoop.hive.ql.metadata.HiveUtils; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.RowResolver; @@ -197,8 +196,14 @@ owi.putPrunedPreds((Operator) nd, ewi); } // merge it with children predicates - mergeWithChildrenPred(op, owi, ewi, null, false); - + boolean hasUnpushedPredicates = mergeWithChildrenPred(nd, owi, ewi, null, false); + if (HiveConf.getBoolVar(owi.getParseContext().getConf(), + HiveConf.ConfVars.HIVEPPDREMOVEDUPLICATEFILTERS)) { + if (hasUnpushedPredicates) { + ExprWalkerInfo unpushedPreds = mergeChildrenPred(nd, owi, null, false); + return createFilter((Operator)nd, unpushedPreds, owi); + } + } return null; } } Index: ql/src/test/queries/clientpositive/ppd_udf_col.q =================================================================== --- ql/src/test/queries/clientpositive/ppd_udf_col.q (revision 0) +++ ql/src/test/queries/clientpositive/ppd_udf_col.q (revision 0) @@ -0,0 +1,34 @@ +set hive.optimize.ppd=true; +set hive.ppd.remove.duplicatefilters=false; + +EXPLAIN +SELECT key, randum123 +FROM (SELECT *, cast(rand() as double) AS randum123 FROM src WHERE key = 100) a +WHERE randum123 <=0.1; + +EXPLAIN +SELECT key,randum123, h4 +FROM (SELECT *, cast(rand() as double) AS randum123, hex(4) AS h4 FROM src WHERE key = 100) a +WHERE a.h4 <= 3; + +EXPLAIN +SELECT key,randum123, v10 +FROM (SELECT *, cast(rand() as double) AS randum123, value*10 AS v10 FROM src WHERE key = 100) a +WHERE a.v10 <= 200; + +set hive.ppd.remove.duplicatefilters=true; + +EXPLAIN +SELECT key, randum123 +FROM (SELECT *, cast(rand() as double) AS randum123 FROM src WHERE key = 100) a +WHERE randum123 <=0.1; + +EXPLAIN +SELECT key,randum123, h4 +FROM (SELECT *, cast(rand() as double) AS randum123, hex(4) AS h4 FROM src WHERE key = 100) a +WHERE a.h4 <= 3; + +EXPLAIN +SELECT key,randum123, v10 +FROM (SELECT *, cast(rand() as double) AS randum123, value*10 AS v10 FROM src WHERE key = 100) a +WHERE a.v10 <= 200; Index: ql/src/test/results/clientpositive/ppd_udf_col.q.out =================================================================== --- ql/src/test/results/clientpositive/ppd_udf_col.q.out (revision 0) +++ ql/src/test/results/clientpositive/ppd_udf_col.q.out (revision 0) @@ -0,0 +1,376 @@ +PREHOOK: query: EXPLAIN +SELECT key, randum123 +FROM (SELECT *, cast(rand() as double) AS randum123 FROM src WHERE key = 100) a +WHERE randum123 <=0.1 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key, randum123 +FROM (SELECT *, cast(rand() as double) AS randum123 FROM src WHERE key = 100) a +WHERE randum123 <=0.1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF) (TOK_SELEXPR (TOK_FUNCTION TOK_DOUBLE (TOK_FUNCTION rand)) randum123)) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 100)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL randum123))) (TOK_WHERE (<= (TOK_TABLE_OR_COL randum123) 0.1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: rand() + type: double + outputColumnNames: _col0, _col2 + Filter Operator + predicate: + expr: (_col2 <= 0.1) + type: boolean + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: double + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN +SELECT key,randum123, h4 +FROM (SELECT *, cast(rand() as double) AS randum123, hex(4) AS h4 FROM src WHERE key = 100) a +WHERE a.h4 <= 3 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key,randum123, h4 +FROM (SELECT *, cast(rand() as double) AS randum123, hex(4) AS h4 FROM src WHERE key = 100) a +WHERE a.h4 <= 3 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF) (TOK_SELEXPR (TOK_FUNCTION TOK_DOUBLE (TOK_FUNCTION rand)) randum123) (TOK_SELEXPR (TOK_FUNCTION hex 4) h4)) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 100)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL randum123)) (TOK_SELEXPR (TOK_TABLE_OR_COL h4))) (TOK_WHERE (<= (. (TOK_TABLE_OR_COL a) h4) 3)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: rand() + type: double + expr: hex(4) + type: string + outputColumnNames: _col0, _col2, _col3 + Filter Operator + predicate: + expr: (_col3 <= 3) + type: boolean + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: double + expr: _col3 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN +SELECT key,randum123, v10 +FROM (SELECT *, cast(rand() as double) AS randum123, value*10 AS v10 FROM src WHERE key = 100) a +WHERE a.v10 <= 200 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key,randum123, v10 +FROM (SELECT *, cast(rand() as double) AS randum123, value*10 AS v10 FROM src WHERE key = 100) a +WHERE a.v10 <= 200 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF) (TOK_SELEXPR (TOK_FUNCTION TOK_DOUBLE (TOK_FUNCTION rand)) randum123) (TOK_SELEXPR (* (TOK_TABLE_OR_COL value) 10) v10)) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 100)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL randum123)) (TOK_SELEXPR (TOK_TABLE_OR_COL v10))) (TOK_WHERE (<= (. (TOK_TABLE_OR_COL a) v10) 200)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: rand() + type: double + expr: (value * 10) + type: double + outputColumnNames: _col0, _col2, _col3 + Filter Operator + predicate: + expr: (_col3 <= 200) + type: boolean + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: double + expr: _col3 + type: double + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN +SELECT key, randum123 +FROM (SELECT *, cast(rand() as double) AS randum123 FROM src WHERE key = 100) a +WHERE randum123 <=0.1 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key, randum123 +FROM (SELECT *, cast(rand() as double) AS randum123 FROM src WHERE key = 100) a +WHERE randum123 <=0.1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF) (TOK_SELEXPR (TOK_FUNCTION TOK_DOUBLE (TOK_FUNCTION rand)) randum123)) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 100)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL randum123))) (TOK_WHERE (<= (TOK_TABLE_OR_COL randum123) 0.1)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: rand() + type: double + outputColumnNames: _col0, _col2 + Filter Operator + predicate: + expr: (_col2 <= 0.1) + type: boolean + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: double + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN +SELECT key,randum123, h4 +FROM (SELECT *, cast(rand() as double) AS randum123, hex(4) AS h4 FROM src WHERE key = 100) a +WHERE a.h4 <= 3 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key,randum123, h4 +FROM (SELECT *, cast(rand() as double) AS randum123, hex(4) AS h4 FROM src WHERE key = 100) a +WHERE a.h4 <= 3 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF) (TOK_SELEXPR (TOK_FUNCTION TOK_DOUBLE (TOK_FUNCTION rand)) randum123) (TOK_SELEXPR (TOK_FUNCTION hex 4) h4)) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 100)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL randum123)) (TOK_SELEXPR (TOK_TABLE_OR_COL h4))) (TOK_WHERE (<= (. (TOK_TABLE_OR_COL a) h4) 3)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: rand() + type: double + expr: hex(4) + type: string + outputColumnNames: _col0, _col2, _col3 + Filter Operator + predicate: + expr: (_col3 <= 3) + type: boolean + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: double + expr: _col3 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN +SELECT key,randum123, v10 +FROM (SELECT *, cast(rand() as double) AS randum123, value*10 AS v10 FROM src WHERE key = 100) a +WHERE a.v10 <= 200 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key,randum123, v10 +FROM (SELECT *, cast(rand() as double) AS randum123, value*10 AS v10 FROM src WHERE key = 100) a +WHERE a.v10 <= 200 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF) (TOK_SELEXPR (TOK_FUNCTION TOK_DOUBLE (TOK_FUNCTION rand)) randum123) (TOK_SELEXPR (* (TOK_TABLE_OR_COL value) 10) v10)) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 100)))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL randum123)) (TOK_SELEXPR (TOK_TABLE_OR_COL v10))) (TOK_WHERE (<= (. (TOK_TABLE_OR_COL a) v10) 200)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 100) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: rand() + type: double + expr: (value * 10) + type: double + outputColumnNames: _col0, _col2, _col3 + Filter Operator + predicate: + expr: (_col3 <= 200) + type: boolean + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: double + expr: _col3 + type: double + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + +