From 9175a1b4042d6ba5b8063a9c2e33437b8583ad22 Mon Sep 17 00:00:00 2001 From: Gopal V Date: Wed, 6 Jul 2016 12:54:05 -0700 Subject: [PATCH] first cut + test-cases --- itests/custom-udfs/pom.xml | 1 + .../custom-udfs/udf-vectorized-badexample/pom.xml | 43 ++++++++++++ .../java/hive/it/custom/udfs/GenericUDFRot13.java | 32 +++++++++ .../it/custom/udfs/vector/VectorStringRot13.java | 46 +++++++++++++ .../hive/ql/exec/vector/VectorizationContext.java | 34 +++++----- ql/src/test/queries/clientpositive/vector_udf3.q | 13 ++++ .../test/results/clientpositive/vector_udf3.q.out | 76 ++++++++++++++++++++++ 7 files changed, 229 insertions(+), 16 deletions(-) create mode 100644 itests/custom-udfs/udf-vectorized-badexample/pom.xml create mode 100644 itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java create mode 100644 itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java create mode 100644 ql/src/test/queries/clientpositive/vector_udf3.q create mode 100644 ql/src/test/results/clientpositive/vector_udf3.q.out diff --git itests/custom-udfs/pom.xml itests/custom-udfs/pom.xml index 3e7443c..b230b41 100644 --- itests/custom-udfs/pom.xml +++ itests/custom-udfs/pom.xml @@ -42,6 +42,7 @@ limitations under the License. udf-classloader-util udf-classloader-udf1 udf-classloader-udf2 + udf-vectorized-badexample diff --git itests/custom-udfs/udf-vectorized-badexample/pom.xml itests/custom-udfs/udf-vectorized-badexample/pom.xml new file mode 100644 index 0000000..35c1a2f --- /dev/null +++ itests/custom-udfs/udf-vectorized-badexample/pom.xml @@ -0,0 +1,43 @@ + + + + 4.0.0 + + org.apache.hive + hive-it-custom-udfs + 2.2.0-SNAPSHOT + ../pom.xml + + + org.apache.hive.hive-it-custom-udfs + udf-vectorized-badexample + jar + Hive Integration - Custom UDFs - udf-vectorized-badexample + + + + org.apache.hive.hive-it-custom-udfs + udf-classloader-util + ${project.version} + + + + + ../../.. 
+ + + diff --git itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java new file mode 100644 index 0000000..8941175 --- /dev/null +++ itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java @@ -0,0 +1,32 @@ +package hive.it.custom.udfs; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import hive.it.custom.udfs.vector.VectorStringRot13; + +@VectorizedExpressions(value = { VectorStringRot13.class }) +public class GenericUDFRot13 extends GenericUDF { + + @Override + public Object evaluate(DeferredObject[] arg0) throws HiveException { + /* this is the bad part - the vectorized UDF returns the right result */ + return new Text("Unvectorized"); + } + + @Override + public String getDisplayString(String[] arg0) { + return String.format("Rot13(%s)", arg0[0]); + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arg0) + throws UDFArgumentException { + return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + } + +} diff --git itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java new file mode 100644 index 0000000..7fbfe32 --- /dev/null +++ itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java @@ -0,0 +1,46 @@ +package hive.it.custom.udfs.vector; 
+ +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUnaryUDF; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUnaryUDFDirect; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.orc.impl.TreeReaderFactory.BytesColumnVectorUtil; + +public class VectorStringRot13 extends StringUnaryUDFDirect { + + public VectorStringRot13(int inputColumn, int outputColumn) { + super(inputColumn, outputColumn); + } + + public VectorStringRot13() { + super(); + } + + @Override + protected void func(BytesColumnVector outV, byte[][] vector, int[] start, + int[] length, int i) { + int off = start[i]; + int len = length[i]; + byte[] src = vector[i]; + byte[] dst = new byte[len]; + for (int j = 0; j < len ; j++) { + dst[j] = rot13(src[off+j]); + } + outV.setVal(i, dst, 0, length[i]); + } + + private byte rot13(byte b) { + if (b >= 'a' && b <= 'm' || b >= 'A' && b <= 'M' ) { + return (byte) (b+13); + } + if (b >= 'n' && b <= 'z' || b >= 'N' && b <= 'Z') { + return (byte) (b-13); + } + return b; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 9de1833..adf447d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -479,24 +479,18 @@ public VectorExpression getVectorExpression(ExprNodeDesc exprDesc, VectorExpress ve = getColumnVectorExpression((ExprNodeColumnDesc) exprDesc, 
mode); } else if (exprDesc instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) exprDesc; - if (isCustomUDF(expr)) { - ve = getCustomUDFExpression(expr, mode); - } else { - // Add cast expression if needed. Child expressions of a udf may return different data types // and that would require converting their data types to evaluate the udf. // For example decimal column added to an integer column would require integer column to be // cast to decimal. - List childExpressions = getChildExpressionsWithImplicitCast(expr.getGenericUDF(), + // Note: this is a no-op for custom UDFs + List childExpressions = getChildExpressionsWithImplicitCast(expr.getGenericUDF(), exprDesc.getChildren(), exprDesc.getTypeInfo()); - ve = getGenericUdfVectorExpression(expr.getGenericUDF(), - childExpressions, mode, exprDesc.getTypeInfo()); - if (ve == null) { - /* - * Ok, no vectorized class available. No problem -- try to use the VectorUDFAdaptor. - */ - ve = getCustomUDFExpression(expr, mode); - } + ve = getGenericUdfVectorExpression(expr.getGenericUDF(), childExpressions, mode, + exprDesc.getTypeInfo()); + if (ve == null) { + /* Ok, no vectorized class available. No problem -- try to use the VectorUDFAdaptor. 
*/ + ve = getCustomUDFExpression(expr, mode); } } else if (exprDesc instanceof ExprNodeConstantDesc) { ve = getConstantVectorExpression(((ExprNodeConstantDesc) exprDesc).getValue(), exprDesc.getTypeInfo(), @@ -562,8 +556,13 @@ private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf, */ private List getChildExpressionsWithImplicitCast(GenericUDF genericUDF, List children, TypeInfo returnType) throws HiveException { - if (isExcludedFromCast(genericUDF)) { + if (isCustomUDF(genericUDF.getUdfName())) { + // no implicit casts possible + return children; + } + + if (isExcludedFromCast(genericUDF)) { // No implicit cast needed return children; } @@ -800,9 +799,12 @@ public static String arg0Type(ExprNodeGenericFuncDesc expr) { } // Return true if this is a custom UDF or custom GenericUDF. - // This is for use only in the planner. It will fail in a task. + // These two functions are for use only in the planner. They will fail in a task. public static boolean isCustomUDF(ExprNodeGenericFuncDesc expr) { - String udfName = expr.getFuncText(); + return isCustomUDF(expr.getFuncText()); + } + + private static boolean isCustomUDF(String udfName) { if (udfName == null) { return false; } diff --git ql/src/test/queries/clientpositive/vector_udf3.q ql/src/test/queries/clientpositive/vector_udf3.q new file mode 100644 index 0000000..8a4df79 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_udf3.q @@ -0,0 +1,13 @@ +ADD JAR ivy://org.apache.hive.hive-it-custom-udfs:udf-vectorized-badexample:+; + +CREATE TEMPORARY FUNCTION rot13 as 'hive.it.custom.udfs.GenericUDFRot13'; + +set hive.vectorized.execution.enabled=true; + +EXPLAIN SELECT rot13(cstring1) from alltypesorc; + +SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10; + +set hive.vectorized.execution.enabled=false; + +SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10; diff --git ql/src/test/results/clientpositive/vector_udf3.q.out 
ql/src/test/results/clientpositive/vector_udf3.q.out new file mode 100644 index 0000000..7c6a90a --- /dev/null +++ ql/src/test/results/clientpositive/vector_udf3.q.out @@ -0,0 +1,76 @@ +PREHOOK: query: CREATE TEMPORARY FUNCTION rot13 as 'hive.it.custom.udfs.GenericUDFRot13' +PREHOOK: type: CREATEFUNCTION +PREHOOK: Output: rot13 +POSTHOOK: query: CREATE TEMPORARY FUNCTION rot13 as 'hive.it.custom.udfs.GenericUDFRot13' +POSTHOOK: type: CREATEFUNCTION +POSTHOOK: Output: rot13 +PREHOOK: query: EXPLAIN SELECT rot13(cstring1) from alltypesorc +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT rot13(cstring1) from alltypesorc +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: Rot13(cstring1) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +yy2GiGM ll2TvTZ +yxN0212hM17E8J8bJj8D7b 
lkA0212uZ17R8W8oWw8Q7o +ywA68u76Jv06axCv451avL4 ljN68h76Wi06nkPi451niY4 +yvNv1q liAi1d +yv3gnG4a33hD7bIm7oxE5rw li3taT4n33uQ7oVz7bkR5ej +yv1js li1wf +yujO07KWj lhwB07XJw +ytpx1RL8F2I lgck1EY8S2V +ytj7g5W lgw7t5J +ytgaJW1Gvrkv5wFUJU2y1S lgtnWJ1Tiexi5jSHWH2l1F +PREHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +yy2GiGM Unvectorized +yxN0212hM17E8J8bJj8D7b Unvectorized +ywA68u76Jv06axCv451avL4 Unvectorized +yvNv1q Unvectorized +yv3gnG4a33hD7bIm7oxE5rw Unvectorized +yv1js Unvectorized +yujO07KWj Unvectorized +ytpx1RL8F2I Unvectorized +ytj7g5W Unvectorized +ytgaJW1Gvrkv5wFUJU2y1S Unvectorized -- 2.4.0