From 0d8ca52ad04c848c545d123d50939583bb9425ae Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Sat, 2 May 2015 13:17:47 -0700 Subject: [PATCH] HIVE-9644 : CASE comparison operator rotation optimization --- .../ql/optimizer/ConstantPropagateProcFactory.java | 75 ++++- ql/src/test/queries/clientpositive/fold_case.q | 12 + ql/src/test/results/clientpositive/fold_case.q.out | 301 +++++++++++++++++++++ 3 files changed, 386 insertions(+), 2 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/fold_case.q create mode 100644 ql/src/test/results/clientpositive/fold_case.q.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java index e9436e5..1a7b279 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; @@ -65,12 +66,16 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFCase; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull; import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; @@ -79,11 +84,14 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantBooleanObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BooleanWritable; import com.google.common.collect.ImmutableSet; +import com.sun.tools.javap.ConstantWriter; /** * Factory for generating the different node processors used by ConstantPropagate. @@ -199,10 +207,11 @@ public static ExprNodeDesc foldExpr(ExprNodeGenericFuncDesc funcDesc) { * @param op processing operator * @param propagate if true, assignment expressions will be added to constants. 
* @return fold expression + * @throws UDFArgumentException */ private static ExprNodeDesc foldExpr(ExprNodeDesc desc, Map constants, ConstantPropagateProcCtx cppCtx, Operator op, int tag, - boolean propagate) { + boolean propagate) throws UDFArgumentException { if (desc instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) desc; @@ -356,7 +365,7 @@ private static ExprNodeColumnDesc getColumnExpr(ExprNodeDesc expr) { return (expr instanceof ExprNodeColumnDesc) ? (ExprNodeColumnDesc)expr : null; } - private static ExprNodeDesc shortcutFunction(GenericUDF udf, List newExprs) { + private static ExprNodeDesc shortcutFunction(GenericUDF udf, List newExprs) throws UDFArgumentException { if (udf instanceof GenericUDFOPAnd) { for (int i = 0; i < 2; i++) { ExprNodeDesc childExpr = newExprs.get(i); @@ -407,9 +416,71 @@ private static ExprNodeDesc shortcutFunction(GenericUDF udf, List } } + if (udf instanceof GenericUDFCase) { + // HIVE-9644 Attempt to fold an expression like: + // where (case ss_sold_date when '1998-01-01' then 1=1 else null=1 end); + // into: where ss_sold_date = '1998-01-01'; + if (!(newExprs.size() == 3 || newExprs.size() == 4)) { + // In general, CASE can have an unlimited number of branches; + // we currently handle only one WHEN branch, with or without an ELSE. + return null; + } + Boolean[] values = new Boolean[2]; // holds the constant boolean values of the THEN and ELSE exprs, if they exist. + ExprNodeDesc n = newExprs.get(2); + if (n instanceof ExprNodeConstantDesc) { + values[0] = getBoolValOf((ExprNodeConstantDesc)n); + if (null == values[0]) { + // we failed to determine the boolean value of this constant as evaluated by Hive. + return null; + } + } else if (n instanceof ExprNodeNullDesc) { + // for folding purposes, null is as good as false. + values[0] = Boolean.FALSE; + } else { + // non-constant expression. + return null; + } + + if (newExprs.size() == 3) { + // if the else branch is missing, it is treated as false. 
+ values[1] = Boolean.FALSE; + } else if (newExprs.get(3) instanceof ExprNodeConstantDesc) { + values[1] = getBoolValOf((ExprNodeConstantDesc)newExprs.get(3)); + if (null == values[1]) { + return null; + } + } else if (newExprs.get(3) instanceof ExprNodeNullDesc) { + values[1] = Boolean.FALSE; + } else { + return null; + } + + if ((Boolean.TRUE.equals(values[0]) && Boolean.TRUE.equals(values[1])) || + (Boolean.FALSE.equals(values[0]) && Boolean.FALSE.equals(values[1]))) { + return newExprs.get(2); + } else if (Boolean.TRUE.equals(values[0])) { + return ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPEqual(), newExprs.subList(0, 2)); + } else { + return ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPNotEqual(), newExprs.subList(0, 2)); + } + } + return null; } + private static Boolean getBoolValOf(ExprNodeConstantDesc n) throws UDFArgumentException { + List constExprs = new ArrayList(2); + constExprs.add(n); + constExprs.add(new ExprNodeConstantDesc(Boolean.TRUE)); + ObjectInspector oi = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPEqual(), constExprs).getWritableObjectInspector(); + if (oi instanceof WritableConstantBooleanObjectInspector) { + BooleanWritable bw = ((WritableConstantBooleanObjectInspector)oi).getWritableConstantValue(); + // for folding purposes, null is as good as false. + return null == bw ? 
Boolean.FALSE : bw.get(); + } else { + return null; + } + } /** * Evaluate column, replace the deterministic columns with constants if possible * diff --git a/ql/src/test/queries/clientpositive/fold_case.q b/ql/src/test/queries/clientpositive/fold_case.q new file mode 100644 index 0000000..3f9e3a3 --- /dev/null +++ b/ql/src/test/queries/clientpositive/fold_case.q @@ -0,0 +1,12 @@ +explain +select count(1) from src where (case key when '238' then true else false end); +explain +select count(1) from src where (case key when '238' then 1=2 else 1=1 end); +explain +select count(1) from src where (case key when '238' then 1=2 else 1=31 end); +explain +select count(1) from src where (case key when '238' then true else 1=1 end); +explain +select count(1) from src where (case key when '238' then 1=1 else 1=null end); +explain +select count(1) from src where (case key when '238' then null else 1=1 end); diff --git a/ql/src/test/results/clientpositive/fold_case.q.out b/ql/src/test/results/clientpositive/fold_case.q.out new file mode 100644 index 0000000..25cc764 --- /dev/null +++ b/ql/src/test/results/clientpositive/fold_case.q.out @@ -0,0 +1,301 @@ +PREHOOK: query: explain +select count(1) from src where (case key when '238' then true else false end) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from src where (case key when '238' then true else false end) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = '238') (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + 
outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain +select count(1) from src where (case key when '238' then 1=2 else 1=1 end) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from src where (case key when '238' then 1=2 else 1=1 end) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key <> '238') (type: boolean) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By 
Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain +select count(1) from src where (case key when '238' then 1=2 else 1=31 end) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from src where (case key when '238' then 1=2 else 1=31 end) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: false (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain +select count(1) from src where (case key when '238' then true else 1=1 end) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from src where (case key when '238' then true else 1=1 end) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain +select count(1) from src where (case key when '238' then 1=1 else 1=null end) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from src where (case key when '238' then 1=1 else 1=null 
end) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = '238') (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain +select count(1) from src where (case key when '238' then null else 1=1 end) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(1) from src where (case key when '238' then null else 1=1 end) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter 
Operator + predicate: (key <> '238') (type: boolean) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + -- 1.7.12.4 (Apple Git-37)