diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 9fd7dcab4c..69408f6de9 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2270,6 +2270,8 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "Whether to transform OR clauses in Filter operators into IN clauses"), HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 2, "Minimum number of OR clauses needed to transform into IN clauses"), + HIVEOPT_TRANSFORM_IN_MAXNODES("hive.optimize.transform.in.maxnodes", 16, + "Maximum number of IN expressions beyond which IN will not be transformed into OR clause"), HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true, "Whether to transform count distinct into two stages"), HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java index 100ee0b2d2..c4e942222c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java @@ -59,6 +59,7 @@ import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.common.type.TimestampTZ; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; @@ -142,6 +143,7 @@ private InputCtx(RelDataType calciteInpDataType, ImmutableMap h private final RowResolver outerRR; private final ImmutableMap outerNameToPosMap; private int correlatedId; + private final 
HiveConf conf; //Constructor used by HiveRexExecutorImpl public RexNodeConverter(RelOptCluster cluster) { @@ -151,13 +153,15 @@ public RexNodeConverter(RelOptCluster cluster) { //subqueries will need outer query's row resolver public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType, ImmutableMap outerNameToPosMap, - ImmutableMap nameToPosMap, RowResolver hiveRR, RowResolver outerRR, int offset, boolean flattenExpr, int correlatedId) { + ImmutableMap nameToPosMap, RowResolver hiveRR, RowResolver outerRR, + HiveConf conf, int offset, boolean flattenExpr, int correlatedId) { this.cluster = cluster; this.inputCtxs = ImmutableList.of(new InputCtx(inpDataType, nameToPosMap, hiveRR, offset)); this.flattenExpr = flattenExpr; this.outerRR = outerRR; this.outerNameToPosMap = outerNameToPosMap; this.correlatedId = correlatedId; + this.conf = conf; } public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType, @@ -167,6 +171,7 @@ public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType, this.flattenExpr = flattenExpr; this.outerRR = null; this.outerNameToPosMap = null; + this.conf = null; } public RexNodeConverter(RelOptCluster cluster, List inpCtxLst, boolean flattenExpr) { @@ -175,6 +180,7 @@ public RexNodeConverter(RelOptCluster cluster, List inpCtxLst, boolean this.flattenExpr = flattenExpr; this.outerRR = null; this.outerNameToPosMap = null; + this.conf = null; } public RexNode convert(ExprNodeDesc expr) throws SemanticException { @@ -423,12 +429,24 @@ private RexNode convert(ExprNodeGenericFuncDesc func) throws SemanticException { // from IN [A,B] => EQUALS [A,B] // except complex types calciteOp = SqlStdOperatorTable.EQUALS; - } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)) { + } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)){ // if it is more than an single item in an IN clause, // transform from IN [A,B,C] => OR [EQUALS [A,B], EQUALS [A,C]] // except complex types - childRexNodeLst = 
rewriteInClauseChildren(calciteOp, childRexNodeLst); - calciteOp = SqlStdOperatorTable.OR; + // Rewrite to OR is done only if the number of operands is less than + // the configured threshold + boolean rewriteToOr = true; + if (conf != null) { + final int maxNodes = HiveConf.getIntVar(conf, + HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES); + if (childRexNodeLst.size() > maxNodes) { + rewriteToOr = false; + } + } + if (rewriteToOr) { + childRexNodeLst = rewriteInClauseChildren(calciteOp, childRexNodeLst); + calciteOp = SqlStdOperatorTable.OR; + } } } else if (calciteOp.getKind() == SqlKind.COALESCE && childRexNodeLst.size() > 1) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 212d27a3bc..0e5d48e82b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -3195,7 +3195,7 @@ private RelNode genFilterRelNode(ASTNode filterExpr, RelNode srcRel, .get(srcRel); RexNode convertedFilterExpr = new RexNodeConverter(cluster, srcRel.getRowType(), outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), outerRR, - 0, true, subqueryId).convert(filterCondn); + conf, 0, true, subqueryId).convert(filterCondn); RexNode factoredFilterExpr = RexUtil .pullFactors(cluster.getRexBuilder(), convertedFilterExpr); RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), @@ -3429,7 +3429,7 @@ private RelNode genFilterRelNode(QB qb, ASTNode searchCond, RelNode srcRel, .get(srcRel); RexNode convertedFilterLHS = new RexNodeConverter(cluster, srcRel.getRowType(), outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), - outerRR, 0, true, subqueryId).convert(subQueryExpr); + outerRR, conf, 0, true, subqueryId).convert(subQueryExpr); RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), srcRel, convertedFilterLHS); @@ 
-4696,7 +4696,7 @@ private void setQueryHints(QB qb) throws SemanticException { RexNodeConverter rexNodeConv = new RexNodeConverter(cluster, srcRel.getRowType(), outerNameToPosMap, buildHiveColNameToInputPosMap(col_list, inputRR), relToHiveRR.get(srcRel), - outerRR, 0, false, subqueryId); + outerRR, conf, 0, false, subqueryId); for (ExprNodeDesc colExpr : col_list) { calciteColLst.add(rexNodeConv.convert(colExpr)); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java index 0c81986c84..a4c1b9ab38 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java @@ -1220,16 +1220,26 @@ protected ExprNodeDesc getXpathOrFuncExprNodeDesc(ASTNode expr, } outputOpList.add(nullConst); } + if (!ctx.isCBOExecuted()) { - ArrayList orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children); - if (orOperands != null) { - if (orOperands.size() == 1) { - orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false)); + + HiveConf conf; + try { + conf = Hive.get().getConf(); + } catch (HiveException e) { + throw new SemanticException(e); + } + if (children.size() <= HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES)) { + ArrayList orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children); + if (orOperands != null) { + if (orOperands.size() == 1) { + orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false)); + } + funcText = "or"; + genericUDF = new GenericUDFOPOr(); + children.clear(); + children.addAll(orOperands); } - funcText = "or"; - genericUDF = new GenericUDFOPOr(); - children.clear(); - children.addAll(orOperands); } } } diff --git a/ql/src/test/queries/clientpositive/in_typecheck_char.q b/ql/src/test/queries/clientpositive/in_typecheck_char.q index 3955c4be14..a144d5190a 100644 --- 
a/ql/src/test/queries/clientpositive/in_typecheck_char.q +++ b/ql/src/test/queries/clientpositive/in_typecheck_char.q @@ -22,3 +22,15 @@ select 'expected 2',count(*) from ax where (s,t) in (('a','a'),(null, 'bb')); -- this is right now broken; HIVE-20779 should fix it explain select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null; select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null; + +set hive.optimize.point.lookup=false; +explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); +explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); + +set hive.optimize.transform.in.maxnodes=20; +explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); +explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); diff --git a/ql/src/test/results/clientpositive/in_typecheck_char.q.out b/ql/src/test/results/clientpositive/in_typecheck_char.q.out index cb9e777ad0..4d59ddf8eb 100644 --- a/ql/src/test/results/clientpositive/in_typecheck_char.q.out +++ b/ql/src/test/results/clientpositive/in_typecheck_char.q.out @@ -262,3 +262,149 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@ax #### A masked pattern was here #### expected 1 1 +PREHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern 
was here #### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count()]) + HiveFilter(condition=[IN($1, _UTF-16LE'a ', _UTF-16LE'bb ', _UTF-16LE'aa ', _UTF-16LE'bbb ', _UTF-16LE'ab ', _UTF-16LE'ba ', _UTF-16LE'aaa ', _UTF-16LE'bbb ', _UTF-16LE'abc ', _UTF-16LE'bc ', _UTF-16LE'ac ', _UTF-16LE'bca ', _UTF-16LE'cab ', _UTF-16LE'cb ', _UTF-16LE'ca ', _UTF-16LE'cbc ', _UTF-16LE'z ')]) + HiveTableScan(table=[[default, ax]], table:alias=[ax]) + +PREHOOK: query: explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: ax + filterExpr: (t) IN ('a ', 'bb ', 'aa ', 'bbb ', 'ab ', 'ba ', 'aaa ', 'bbb ', 'abc ', 'bc ', 'ac ', 'bca ', 'cab ', 'cb ', 'ca ', 'cbc ', 'z ') (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (t) IN ('a ', 'bb ', 'aa ', 'bbb ', 'ab ', 'ba ', 'aaa ', 'bbb ', 'abc ', 'bc ', 'ac ', 'bca ', 'cab ', 'cb ', 'ca ', 'cbc ', 'z ') (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic 
stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern was here #### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count()]) + HiveFilter(condition=[OR(=($1, _UTF-16LE'a '), =($1, _UTF-16LE'bb '), =($1, _UTF-16LE'aa '), =($1, _UTF-16LE'bbb '), =($1, _UTF-16LE'ab '), =($1, _UTF-16LE'ba '), =($1, _UTF-16LE'aaa '), =($1, _UTF-16LE'abc '), =($1, _UTF-16LE'bc '), =($1, _UTF-16LE'ac '), =($1, _UTF-16LE'bca '), =($1, _UTF-16LE'cab '), =($1, _UTF-16LE'cb '), =($1, _UTF-16LE'ca '), =($1, _UTF-16LE'cbc '), =($1, _UTF-16LE'z '))]) + HiveTableScan(table=[[default, ax]], table:alias=[ax]) + +PREHOOK: query: explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: 
explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: ax + filterExpr: ((t = 'a ') or (t = 'bb ') or (t = 'aa ') or (t = 'bbb ') or (t = 'ab ') or (t = 'ba ') or (t = 'aaa ') or (t = 'abc ') or (t = 'bc ') or (t = 'ac ') or (t = 'bca ') or (t = 'cab ') or (t = 'cb ') or (t = 'ca ') or (t = 'cbc ') or (t = 'z ')) (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((t = 'a ') or (t = 'bb ') or (t = 'aa ') or (t = 'bbb ') or (t = 'ab ') or (t = 'ba ') or (t = 'aaa ') or (t = 'abc ') or (t = 'bc ') or (t = 'ac ') or (t = 'bca ') or (t = 'cab ') or (t = 'cb ') or (t = 'ca ') or (t = 'cbc ') or (t = 'z ')) (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +