diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index da171b1..66c5560 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1190,6 +1190,10 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) {
         "Whether to push predicates down into storage handlers. Ignored when hive.optimize.ppd is false."),
     HIVEPOINTLOOKUPOPTIMIZER("hive.optimize.point.lookup", true,
          "Whether to transform OR clauses in Filter operators into IN clauses"),
+    HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 31,
+             "Minimum number of OR clauses needed to transform into IN clauses"),
+    HIVEPOINTLOOKUPOPTIMIZEREXTRACT("hive.optimize.point.lookup.extract", true,
+             "Extract partial expressions when optimizing point lookup IN clauses"),
     // Constant propagation optimizer
     HIVEOPTCONSTANTPROPAGATION("hive.optimize.constant.propagation", true, "Whether to enable constant propagation optimizer"),
     HIVEIDENTITYPROJECTREMOVER("hive.optimize.remove.identity.project", true, "Removes identity project from operator tree"),
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
index 14f362f..0b79f3a 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
@@ -68,6 +68,16 @@ public void initialize(HiveConf hiveConf) {
 
     // Add the transformation that computes the lineage information.
     transformations.add(new Generator());
+
+    // Try to transform OR predicates in Filter into simpler IN clauses first
+    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZER)) {
+      final int min = HiveConf.getIntVar(hiveConf,
+          HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZERMIN);
+      final boolean extract = HiveConf.getBoolVar(hiveConf,
+          HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZEREXTRACT);
+      transformations.add(new PointLookupOptimizer(min, extract));
+    }
+
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTPPD)) {
       transformations.add(new PredicateTransitivePropagate());
       if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTCONSTANTPROPAGATION)) {
@@ -82,11 +92,6 @@ public void initialize(HiveConf hiveConf) {
       transformations.add(new ConstantPropagate());
     }
 
-    // Try to transform OR predicates in Filter into IN clauses.
-    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZER)) {
-      transformations.add(new PointLookupOptimizer());
-    }
-
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTPPD)) {
       transformations.add(new PartitionPruner());
       transformations.add(new PartitionConditionRemover());
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/PointLookupOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/PointLookupOptimizer.java
index 6a8acec..ab5fd1c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/PointLookupOptimizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/PointLookupOptimizer.java
@@ -19,9 +19,11 @@
 
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.Stack;
 
 import org.apache.calcite.util.Pair;
@@ -46,9 +48,11 @@
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -71,7 +75,20 @@
       GenericUDFIn.class.getAnnotation(Description.class).name();
   private static final String STRUCT_UDF =
       GenericUDFStruct.class.getAnnotation(Description.class).name();
-
+  private static final String AND_UDF =
+      GenericUDFOPAnd.class.getAnnotation(Description.class).name();
+
+  // these are closure-bound for all the walkers in context
+  public final int minOrExpr;
+  public final boolean extract;
+
+  /*
+   * Pass in configs and pre-create a parse context
+   */
+  public PointLookupOptimizer(final int min, final boolean extract) {
+    this.minOrExpr = min;
+    this.extract = extract;
+  }
 
   @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
@@ -140,8 +157,11 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
         return null;
       }
 
-      // 2. It is an OR operator
+      // 2. It is an OR operator with enough children
       List<ExprNodeDesc> children = fd.getChildren();
+      if (children.size() < minOrExpr) {
+        return null;
+      }
       ListMultimap<String, Pair<ExprNodeColumnDesc, ExprNodeConstantDesc>> columnConstantsMap =
               ArrayListMultimap.create();
       boolean modeAnd = false;
@@ -272,6 +292,50 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       newPredicate = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
               FunctionRegistry.getFunctionInfo(IN_UDF).getGenericUDF(), newChildren);
 
+      if (extract && columns.size() > 1) {
+        final List<ExprNodeDesc> subExpr = new ArrayList<ExprNodeDesc>(columns.size()+1);
+
+        // extract pre-conditions for the tuple expressions
+        // (a,b) IN ((1,2),(2,3)) ->
+        //          ((a) IN (1,2) and b in (2,3)) and (a,b) IN ((1,2),(2,3))
+
+        for (String keyString : columnConstantsMap.keySet()) {
+          final Set<ExprNodeDescEqualityWrapper> valuesExpr =
+              new HashSet<ExprNodeDescEqualityWrapper>(children.size());
+          final List<Pair<ExprNodeColumnDesc, ExprNodeConstantDesc>> partial =
+              columnConstantsMap.get(keyString);
+          for (int i = 0; i < children.size(); i++) {
+            Pair<ExprNodeColumnDesc, ExprNodeConstantDesc> columnConstant = partial
+                .get(i);
+            valuesExpr
+                .add(new ExprNodeDescEqualityWrapper(columnConstant.right));
+          }
+          ExprNodeColumnDesc lookupCol = partial.get(0).left;
+          // generate a partial IN clause, if the column is a partition column
+          if (lookupCol.getIsPartitionColOrVirtualCol()
+              || valuesExpr.size() < children.size()) {
+            // optimize only nDV reductions
+            final List<ExprNodeDesc> inExpr = new ArrayList<ExprNodeDesc>();
+            inExpr.add(lookupCol);
+            for (ExprNodeDescEqualityWrapper value : valuesExpr) {
+              inExpr.add(value.getExprNodeDesc());
+            }
+            subExpr.add(new ExprNodeGenericFuncDesc(
+                TypeInfoFactory.booleanTypeInfo, FunctionRegistry
+                    .getFunctionInfo(IN_UDF).getGenericUDF(), inExpr));
+          }
+        }
+        // loop complete, inspect the sub expressions generated
+        if (subExpr.size() > 0) {
+          // add the newPredicate to the end & produce an AND clause
+          subExpr.add(newPredicate);
+          newPredicate = new ExprNodeGenericFuncDesc(
+              TypeInfoFactory.booleanTypeInfo, FunctionRegistry
+                  .getFunctionInfo(AND_UDF).getGenericUDF(), subExpr);
+        }
+        // else, newPredicate is unmodified
+      }
+
       return newPredicate;
     }
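A minimal usage sketch of the two knobs this patch introduces, assuming a plain HiveConf instance and its standard setIntVar/setBoolVar accessors; the class and method names here (PointLookupConfSketch, tuned) are hypothetical and only for illustration, and the values shown are just the defaults added above. The same settings can of course be made per session with "set hive.optimize.point.lookup.min=...;" etc.

    import org.apache.hadoop.hive.conf.HiveConf;

    // Hypothetical helper, not part of the patch: build a HiveConf with the
    // point-lookup rewrite tuned via the new configuration entries.
    public class PointLookupConfSketch {
      public static HiveConf tuned() {
        HiveConf conf = new HiveConf();
        // enable the OR -> IN rewrite itself
        conf.setBoolVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZER, true);
        // only rewrite OR chains with at least this many disjuncts
        conf.setIntVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZERMIN, 31);
        // also emit the per-column partial IN pre-conditions, e.g.
        // (a,b) IN ((1,2),(2,3)) -> (a) IN (1,2) AND (b) IN (2,3) AND (a,b) IN ((1,2),(2,3))
        conf.setBoolVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZEREXTRACT, true);
        return conf;
      }
    }

Note that, per the diff, a partial IN clause is only generated when the column is a partition or virtual column, or when its set of distinct constants is smaller than the number of OR branches (the "optimize only nDV reductions" check), so the extract flag avoids adding pre-conditions that would not reduce cardinality.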