diff --git a/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java b/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java
index 5b59291..2ee0451 100644
--- a/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java
+++ b/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java
@@ -356,6 +356,26 @@
       {"ColumnUnaryFunc", "FuncSign", "double", "double", "MathExpr.sign", "", "", ""},
       {"ColumnUnaryFunc", "FuncSign", "double", "long", "MathExpr.sign", "(double)", "", ""},
 
+      // Casts
+      {"ColumnUnaryFunc", "Cast", "long", "double", "", "", "(long)", ""},
+      {"ColumnUnaryFunc", "Cast", "double", "long", "", "", "(double)", ""},
+      {"ColumnUnaryFunc", "CastTimestampToLongVia", "long", "long", "MathExpr.fromTimestamp", "",
+          "", ""},
+      {"ColumnUnaryFunc", "CastTimestampToDoubleVia", "double", "long",
+          "MathExpr.fromTimestampToDouble", "", "", ""},
+      {"ColumnUnaryFunc", "CastDoubleToBooleanVia", "long", "double", "MathExpr.toBool", "",
+          "", ""},
+      {"ColumnUnaryFunc", "CastLongToBooleanVia", "long", "long", "MathExpr.toBool", "",
+          "", ""},
+      {"ColumnUnaryFunc", "CastLongToTimestampVia", "long", "long", "MathExpr.longToTimestamp", "",
+          "", ""},
+      {"ColumnUnaryFunc", "CastDoubleToTimestampVia", "long", "double",
+          "MathExpr.doubleToTimestamp", "", "", ""},
+
+      // Boolean to long is done with an IdentityExpression.
+      // Boolean to double is done with the standard long-to-double cast.
+      // See org.apache.hadoop.hive.ql.exec.vector.expressions for the remaining cast
+      // VectorExpression classes.
 
       {"ColumnUnaryMinus", "long"},
       {"ColumnUnaryMinus", "double"},
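The table rows above drive the ColumnUnaryFunc template in this generator; each row supplies the class-name prefix, the output and input vector types, and the function applied per element. For orientation, the CastTimestampToLongVia row expands to gen/CastTimestampToLongViaLongToLong.java (the name is confirmed by the import and the tests later in this patch), whose core is a per-element loop. The following is a hand-written sketch of the shape of such a generated class, not actual template output; null, isRepeating, selected-vector, and child-expression handling are omitted here but present in the real generated code:

// Sketch only: the approximate shape of a class generated from a ColumnUnaryFunc row.
package org.apache.hadoop.hive.ql.exec.vector.expressions.gen;

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.MathExpr;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;

public class CastTimestampToLongViaLongToLong extends VectorExpression {
  private int colNum;
  private int outputColumn;

  public CastTimestampToLongViaLongToLong(int colNum, int outputColumn) {
    this.colNum = colNum;
    this.outputColumn = outputColumn;
  }

  @Override
  public void evaluate(VectorizedRowBatch batch) {
    long[] in = ((LongColumnVector) batch.cols[colNum]).vector;
    long[] out = ((LongColumnVector) batch.cols[outputColumn]).vector;
    for (int i = 0; i != batch.size; i++) {
      out[i] = MathExpr.fromTimestamp(in[i]);  // the function slot from the table row
    }
  }

  @Override
  public int getOutputColumn() {
    return outputColumn;
  }

  @Override
  public String getOutputType() {
    return "long";
  }
}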
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 8f10644..a3565c4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -39,11 +39,11 @@
 import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterExprAndExpr;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterExprOrExpr;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColRegExpStringScalar;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.FuncRand;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.ISetDoubleArg;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.ISetLongArg;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColRegExpStringScalar;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsNotNull;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsNull;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsTrue;
@@ -75,6 +75,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarPopLong;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarSampDouble;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarSampLong;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.CastLongToBooleanViaLongToLong;
 import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor;
 import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFArgDesc;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -84,11 +85,27 @@
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.ql.plan.api.OperatorType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
+import org.apache.hadoop.hive.ql.udf.UDFAcos;
+import org.apache.hadoop.hive.ql.udf.UDFAsin;
+import org.apache.hadoop.hive.ql.udf.UDFAtan;
+import org.apache.hadoop.hive.ql.udf.UDFBin;
+import org.apache.hadoop.hive.ql.udf.UDFCeil;
+import org.apache.hadoop.hive.ql.udf.UDFConv;
+import org.apache.hadoop.hive.ql.udf.UDFCos;
 import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth;
+import org.apache.hadoop.hive.ql.udf.UDFDegrees;
+import org.apache.hadoop.hive.ql.udf.UDFExp;
+import org.apache.hadoop.hive.ql.udf.UDFFloor;
+import org.apache.hadoop.hive.ql.udf.UDFHex;
 import org.apache.hadoop.hive.ql.udf.UDFHour;
 import org.apache.hadoop.hive.ql.udf.UDFLTrim;
 import org.apache.hadoop.hive.ql.udf.UDFLength;
 import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.hive.ql.udf.UDFLn;
+import org.apache.hadoop.hive.ql.udf.UDFLog;
+import org.apache.hadoop.hive.ql.udf.UDFLog10;
+import org.apache.hadoop.hive.ql.udf.UDFLog2;
 import org.apache.hadoop.hive.ql.udf.UDFMinute;
 import org.apache.hadoop.hive.ql.udf.UDFMonth;
 import org.apache.hadoop.hive.ql.udf.UDFOPDivide;
@@ -98,31 +115,32 @@
 import org.apache.hadoop.hive.ql.udf.UDFOPNegative;
 import org.apache.hadoop.hive.ql.udf.UDFOPPlus;
 import org.apache.hadoop.hive.ql.udf.UDFOPPositive;
+import org.apache.hadoop.hive.ql.udf.UDFPower;
 import org.apache.hadoop.hive.ql.udf.UDFRegExp;
 import org.apache.hadoop.hive.ql.udf.UDFRTrim;
+import org.apache.hadoop.hive.ql.udf.UDFRadians;
+import org.apache.hadoop.hive.ql.udf.UDFRand;
+import org.apache.hadoop.hive.ql.udf.UDFRound;
 import org.apache.hadoop.hive.ql.udf.UDFSecond;
+import org.apache.hadoop.hive.ql.udf.UDFSign;
+import org.apache.hadoop.hive.ql.udf.UDFSin;
+import org.apache.hadoop.hive.ql.udf.UDFSqrt;
 import org.apache.hadoop.hive.ql.udf.UDFSubstr;
+import org.apache.hadoop.hive.ql.udf.UDFTan;
+import org.apache.hadoop.hive.ql.udf.UDFToBoolean;
+import org.apache.hadoop.hive.ql.udf.UDFToByte;
+import org.apache.hadoop.hive.ql.udf.UDFToInteger;
+import org.apache.hadoop.hive.ql.udf.UDFToLong;
+import org.apache.hadoop.hive.ql.udf.UDFToShort;
+import org.apache.hadoop.hive.ql.udf.UDFToFloat;
+import org.apache.hadoop.hive.ql.udf.UDFToDouble;
+import org.apache.hadoop.hive.ql.udf.UDFToString;
 import org.apache.hadoop.hive.ql.udf.UDFTrim;
 import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
 import org.apache.hadoop.hive.ql.udf.UDFYear;
-import org.apache.hadoop.hive.ql.udf.UDFAcos;
-import org.apache.hadoop.hive.ql.udf.UDFAsin;
-import org.apache.hadoop.hive.ql.udf.UDFAtan;
-import org.apache.hadoop.hive.ql.udf.UDFBin;
-import org.apache.hadoop.hive.ql.udf.UDFCeil;
-import org.apache.hadoop.hive.ql.udf.UDFConv;
-import org.apache.hadoop.hive.ql.udf.UDFCos;
-import org.apache.hadoop.hive.ql.udf.UDFDegrees;
-import org.apache.hadoop.hive.ql.udf.UDFExp;
-import org.apache.hadoop.hive.ql.udf.UDFFloor;
-import org.apache.hadoop.hive.ql.udf.UDFHex;
-import org.apache.hadoop.hive.ql.udf.UDFLn;
-import org.apache.hadoop.hive.ql.udf.UDFLog;
-import org.apache.hadoop.hive.ql.udf.UDFLog10;
-import org.apache.hadoop.hive.ql.udf.UDFLog2;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
@@ -135,18 +153,9 @@
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTimestamp;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUpper;
-import org.apache.hadoop.hive.ql.udf.UDFPower;
-import org.apache.hadoop.hive.ql.udf.UDFRadians;
-import org.apache.hadoop.hive.ql.udf.UDFRand;
-import org.apache.hadoop.hive.ql.udf.UDFRound;
-import org.apache.hadoop.hive.ql.udf.UDFSign;
-import org.apache.hadoop.hive.ql.udf.UDFSin;
-import org.apache.hadoop.hive.ql.udf.UDFSqrt;
-import org.apache.hadoop.hive.ql.udf.UDFTan;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs;
-
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
@@ -321,13 +330,41 @@ public static boolean isLegacyPathUDF(ExprNodeGenericFuncDesc expr) {
       GenericUDFBridge bridge = (GenericUDFBridge) gudf;
       Class udfClass = bridge.getUdfClass();
       if (udfClass.equals(UDFHex.class)
-          || udfClass.equals(UDFConv.class)) {
+          || udfClass.equals(UDFConv.class)
+          || isCastToIntFamily(udfClass) && arg0Type(expr).equals("string")
+          || isCastToFloatFamily(udfClass) && arg0Type(expr).equals("string")
+          || udfClass.equals(UDFToString.class) &&
+               (arg0Type(expr).equals("timestamp")
+                   || arg0Type(expr).equals("double")
+                   || arg0Type(expr).equals("float"))) {
         return true;
       }
+    } else if (gudf instanceof GenericUDFTimestamp && arg0Type(expr).equals("string")) {
+      return true;
     }
     return false;
   }
 
+  public static boolean isCastToIntFamily(Class udfClass) {
+    return udfClass.equals(UDFToByte.class)
+        || udfClass.equals(UDFToShort.class)
+        || udfClass.equals(UDFToInteger.class)
+        || udfClass.equals(UDFToLong.class);
+
+    // Boolean is purposely excluded.
+  }
+
+  public static boolean isCastToFloatFamily(Class udfClass) {
+    return udfClass.equals(UDFToDouble.class)
+        || udfClass.equals(UDFToFloat.class);
+  }
+
+  // Return the type string of the first argument (argument 0).
+  public static String arg0Type(ExprNodeGenericFuncDesc expr) {
+    String type = expr.getChildExprs().get(0).getTypeString();
+    return type;
+  }
+
   // Return true if this is a custom UDF or custom GenericUDF.
   // This is for use only in the planner. It will fail in a task.
   public static boolean isCustomUDF(ExprNodeGenericFuncDesc expr) {
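Since && binds tighter than ||, the new condition in isLegacyPathUDF groups as follows (an equivalent, fully parenthesized restatement of the code above, not a change to it):

udfClass.equals(UDFHex.class)
    || udfClass.equals(UDFConv.class)
    || (isCastToIntFamily(udfClass) && arg0Type(expr).equals("string"))
    || (isCastToFloatFamily(udfClass) && arg0Type(expr).equals("string"))
    || (udfClass.equals(UDFToString.class)
        && (arg0Type(expr).equals("timestamp")
            || arg0Type(expr).equals("double")
            || arg0Type(expr).equals("float")))

In other words: string-to-integer-family casts, string-to-float-family casts, and timestamp/double/float-to-string casts stay on the row-mode (legacy) path, as does string-to-timestamp via the GenericUDFTimestamp branch; every other cast this patch touches is handled by the vectorized code below.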
@@ -490,7 +527,9 @@ private VectorExpression getUnaryFunctionExpression(
     return expr;
   }
 
-  private VectorExpression getUnaryPlusExpression(List<ExprNodeDesc> childExprList)
+  // Used as a fast path for operations that don't modify their input, like unary +
+  // and casting boolean to long.
+  private VectorExpression getIdentityExpression(List<ExprNodeDesc> childExprList)
       throws HiveException {
     ExprNodeDesc childExpr = childExprList.get(0);
     int inputCol;
@@ -550,6 +589,8 @@ private VectorExpression getVectorExpression(GenericUDF udf,
       return getConcatExpression(childExpr);
     } else if (udf instanceof GenericUDFAbs) {
       return getUnaryAbsExpression(childExpr);
+    } else if (udf instanceof GenericUDFTimestamp) {
+      return getCastToTimestamp(childExpr);
     }
 
     throw new HiveException("Udf: "+udf.getClass().getSimpleName()+", is not supported");
@@ -607,7 +648,7 @@ private VectorExpression getVectorExpression(GenericUDFBridge udf,
     } else if (cl.equals(UDFOPNegative.class)) {
       return getUnaryMinusExpression(childExpr);
     } else if (cl.equals(UDFOPPositive.class)) {
-      return getUnaryPlusExpression(childExpr);
+      return getIdentityExpression(childExpr);
     } else if (cl.equals(UDFYear.class) ||
         cl.equals(UDFMonth.class) ||
         cl.equals(UDFWeekOfYear.class) ||
@@ -688,11 +729,116 @@ private VectorExpression getVectorExpression(GenericUDFBridge udf,
       return getRandExpression(childExpr);
     } else if (cl.equals(UDFBin.class)) {
       return getUnaryStringExpression("FuncBin", "String", childExpr);
+    } else if (isCastToIntFamily(cl)) {
+      return getCastToLongExpression(childExpr);
+    } else if (cl.equals(UDFToBoolean.class)) {
+      return getCastToBoolean(childExpr);
+    } else if (isCastToFloatFamily(cl)) {
+      return getCastToDoubleExpression(childExpr);
+    } else if (cl.equals(UDFToString.class)) {
+      return getCastToString(childExpr);
     }
 
     throw new HiveException("Udf: "+udf.getClass().getSimpleName()+", is not supported");
   }
 
+  private VectorExpression getCastToTimestamp(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    String inputType = childExpr.get(0).getTypeString();
+    if (isIntFamily(inputType)) {
+      return getUnaryFunctionExpression("CastLongToTimestampVia", "Long", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    } else if (isFloatFamily(inputType)) {
+      return getUnaryFunctionExpression("CastDoubleToTimestampVia", "Long", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    }
+
+    // The string type is deliberately omitted -- it's handled elsewhere. See isLegacyPathUDF.
+
+    throw new HiveException("Unhandled cast input type: " + inputType);
+  }
+
+  private VectorExpression getCastToString(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    String inputType = childExpr.get(0).getTypeString();
+    if (inputType.equals("boolean")) {
+
+      // Boolean must come before the integer family. It's a special case.
+      return getUnaryFunctionExpression("CastBooleanToStringVia", "String", childExpr,
+          CUSTOM_EXPR_PACKAGE);
+    } else if (isIntFamily(inputType)) {
+      return getUnaryFunctionExpression("Cast", "String", childExpr,
+          CUSTOM_EXPR_PACKAGE);
+    }
+
+    /* The string type is deliberately omitted -- the planner removes string-to-string casts.
+     * Timestamp, float, and double types are handled by the legacy code path.
+     * See isLegacyPathUDF.
+     */
+
+    throw new HiveException("Unhandled cast input type: " + inputType);
+  }
+
+  private VectorExpression getCastToDoubleExpression(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    String inputType = childExpr.get(0).getTypeString();
+    if (isIntFamily(inputType)) {
+      return getUnaryFunctionExpression("Cast", "Double", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    } else if (inputType.equals("timestamp")) {
+      return getUnaryFunctionExpression("CastTimestampToDoubleVia", "Double", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    } else if (isFloatFamily(inputType)) {
+
+      // Float types require no conversion, so use a no-op.
+      return getIdentityExpression(childExpr);
+    }
+
+    // The string type is deliberately omitted -- it's handled elsewhere. See isLegacyPathUDF.
+
+    throw new HiveException("Unhandled cast input type: " + inputType);
+  }
+
+  private VectorExpression getCastToBoolean(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    String inputType = childExpr.get(0).getTypeString();
+    if (isFloatFamily(inputType)) {
+      return getUnaryFunctionExpression("CastDoubleToBooleanVia", "Long", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    } else if (isIntFamily(inputType) || inputType.equals("timestamp")) {
+      return getUnaryFunctionExpression("CastLongToBooleanVia", "Long", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    } else if (inputType.equals("string")) {
+
+      // A string casts to false if it is zero characters long, otherwise true.
+      VectorExpression lenExpr = getUnaryStringExpression("StringLength", "Long", childExpr);
+
+      int outputCol = ocm.allocateOutputColumn("integer");
+      VectorExpression lenToBoolExpr =
+          new CastLongToBooleanViaLongToLong(lenExpr.getOutputColumn(), outputCol);
+      lenToBoolExpr.setChildExpressions(new VectorExpression[] {lenExpr});
+      ocm.freeOutputColumn(lenExpr.getOutputColumn());
+      return lenToBoolExpr;
+    }
+
+    // The cast(booleanExpr as boolean) case is omitted because the planner removes it as a no-op.
+
+    throw new HiveException("Unhandled cast input type: " + inputType);
+  }
+
+  private VectorExpression getCastToLongExpression(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    String inputType = childExpr.get(0).getTypeString();
+    if (isFloatFamily(inputType)) {
+      return getUnaryFunctionExpression("Cast", "Long", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    } else if (inputType.equals("timestamp")) {
+      return getUnaryFunctionExpression("CastTimestampToLongVia", "Long", childExpr,
+          GENERATED_EXPR_PACKAGE);
+    } else if (isIntFamily(inputType)) {
+
+      // Integer and boolean types require no conversion, so use a no-op.
+      return getIdentityExpression(childExpr);
+    }
+
+    // The string type is deliberately omitted -- it's handled elsewhere. See isLegacyPathUDF.
+
+    throw new HiveException("Unhandled cast input type: " + inputType);
+  }
+
   private VectorExpression getRandExpression(List<ExprNodeDesc> childExpr)
       throws HiveException {
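The string branch of getCastToBoolean above is the one composite case: it chains two expressions and manages the scratch column itself. Assuming StringLength is the class the "StringLength" prefix resolves to (the exact generated class name is not shown in this patch), the tree built for cast(stringCol as boolean) looks roughly like this, with hypothetical column numbers:

// Sketch of the composition getCastToBoolean builds; columns 2/3/4 are hypothetical.
VectorExpression lenExpr = new StringLength(2, 3);                   // row lengths into column 3
VectorExpression toBool = new CastLongToBooleanViaLongToLong(3, 4);  // 0 -> false, nonzero -> true
toBool.setChildExpressions(new VectorExpression[] {lenExpr});
// Evaluating toBool first evaluates lenExpr (its child), then maps the lengths to 0/1.

Note the semantics this encodes: only the empty string casts to false; '0' and 'false', being non-empty, cast to true. Freeing the scratch column right away (ocm.freeOutputColumn) works because the column number is already baked into both expressions; the call simply returns the slot to the allocator for reuse by later expressions.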
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastBooleanToStringViaLongToString.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastBooleanToStringViaLongToString.java
new file mode 100644
index 0000000..d16bbb1
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastBooleanToStringViaLongToString.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+
+public class CastBooleanToStringViaLongToString extends LongToStringUnaryUDF {
+  private static final long serialVersionUID = 1L;
+  private transient byte[] temp; // scratch buffer; unused here, as the dictionary supplies the output bytes
+  private static final byte[][] dictionary = { {'F', 'A', 'L', 'S', 'E'}, {'T', 'R', 'U', 'E'} };
+
+  public CastBooleanToStringViaLongToString() {
+    super();
+    temp = new byte[8];
+  }
+
+  public CastBooleanToStringViaLongToString(int inputColumn, int outputColumn) {
+    super(inputColumn, outputColumn);
+    temp = new byte[8];
+  }
+
+  @Override
+  protected void func(BytesColumnVector outV, long[] vector, int i) {
+
+    /* 0 is false and 1 is true in the input vector, so a simple two-entry dictionary is
+     * used: index 0 references FALSE and index 1 references TRUE.
+     */
+    outV.setVal(i, dictionary[(int) vector[i]], 0, dictionary[(int) vector[i]].length);
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToString.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToString.java
new file mode 100644
index 0000000..43bdfc2
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToString.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+
+public class CastLongToString extends LongToStringUnaryUDF {
+  private static final long serialVersionUID = 1L;
+  protected transient byte[] temp; // temporary location for building the number string
+
+  public CastLongToString() {
+    super();
+    temp = new byte[20];
+  }
+
+  public CastLongToString(int inputColumn, int outputColumn) {
+    super(inputColumn, outputColumn);
+    temp = new byte[20];
+  }
+
+  @Override
+  protected void func(BytesColumnVector outV, long[] vector, int i) {
+    int len = MathExpr.writeLongToUTF8(temp, vector[i]);
+    outV.setVal(i, temp, 0, len);
+  }
+}
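The 20-byte temp buffer is sized for the worst case: a minus sign plus the 19 digits of Long.MIN_VALUE. Reusing it across rows is safe because BytesColumnVector.setVal copies the bytes into the vector's own buffer rather than retaining the array. A throwaway check of the helper this class delegates to (a sketch; it assumes only that MathExpr is on the classpath):

// Sketch: exercise MathExpr.writeLongToUTF8 outside a row batch.
byte[] buf = new byte[20];  // '-' plus 19 digits covers every long value
int len = MathExpr.writeLongToUTF8(buf, -255L);
System.out.println(new String(buf, 0, len, java.nio.charset.StandardCharsets.UTF_8)); // -255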
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/LongToStringUnaryUDF.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/LongToStringUnaryUDF.java
new file mode 100644
index 0000000..3f3da00
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/LongToStringUnaryUDF.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * This is a superclass for unary long functions returning strings that operate directly on
+ * the input and set the output.
+ */
+abstract public class LongToStringUnaryUDF extends VectorExpression {
+  private static final long serialVersionUID = 1L;
+  int inputColumn;
+  int outputColumn;
+
+  public LongToStringUnaryUDF(int inputColumn, int outputColumn) {
+    this.inputColumn = inputColumn;
+    this.outputColumn = outputColumn;
+  }
+
+  public LongToStringUnaryUDF() {
+    super();
+  }
+
+  abstract protected void func(BytesColumnVector outV, long[] vector, int i);
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    LongColumnVector inputColVector = (LongColumnVector) batch.cols[inputColumn];
+    int[] sel = batch.selected;
+    int n = batch.size;
+    long[] vector = inputColVector.vector;
+    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+    outV.initBuffer();
+
+    if (n == 0) {
+      // Nothing to do
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      outV.noNulls = true;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        func(outV, vector, 0);
+      } else if (batch.selectedInUse) {
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          func(outV, vector, i);
+        }
+        outV.isRepeating = false;
+      } else {
+        for (int i = 0; i != n; i++) {
+          func(outV, vector, i);
+        }
+        outV.isRepeating = false;
+      }
+    } else {
+
+      // Handle case with nulls. Don't apply the function if the value is null,
+      // because the data may be undefined for a null value.
+      outV.noNulls = false;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        outV.isNull[0] = inputColVector.isNull[0];
+        if (!inputColVector.isNull[0]) {
+          func(outV, vector, 0);
+        }
+      } else if (batch.selectedInUse) {
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          outV.isNull[i] = inputColVector.isNull[i];
+          if (!inputColVector.isNull[i]) {
+            func(outV, vector, i);
+          }
+        }
+        outV.isRepeating = false;
+      } else {
+        System.arraycopy(inputColVector.isNull, 0, outV.isNull, 0, n);
+        for (int i = 0; i != n; i++) {
+          if (!inputColVector.isNull[i]) {
+            func(outV, vector, i);
+          }
+        }
+        outV.isRepeating = false;
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return outputColumn;
+  }
+
+  public void setOutputColumn(int outputColumn) {
+    this.outputColumn = outputColumn;
+  }
+
+  public int getInputColumn() {
+    return inputColumn;
+  }
+
+  public void setInputColumn(int inputColumn) {
+    this.inputColumn = inputColumn;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "String";
+  }
+
+}
\ No newline at end of file
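evaluate() above centralizes all of the batch bookkeeping (noNulls, isRepeating, selectedInUse), so a subclass only supplies func(), which is called once per live, non-null row. A minimal hypothetical subclass to show the contract (illustration only, not part of this patch):

// Hypothetical example: format each long as hex, via the LongToStringUnaryUDF contract.
package org.apache.hadoop.hive.ql.exec.vector.expressions;

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

public class CastLongToHexString extends LongToStringUnaryUDF {
  private static final long serialVersionUID = 1L;

  public CastLongToHexString() {
    super();
  }

  public CastLongToHexString(int inputColumn, int outputColumn) {
    super(inputColumn, outputColumn);
  }

  @Override
  protected void func(BytesColumnVector outV, long[] vector, int i) {
    // i is already resolved through the selected vector, and nulls were filtered out.
    byte[] b = Long.toHexString(vector[i]).getBytes(StandardCharsets.UTF_8);
    outV.setVal(i, b, 0, b.length);  // setVal copies, so b need not outlive this call
  }
}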
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/MathExpr.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/MathExpr.java
index 7253b31..5945aab 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/MathExpr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/MathExpr.java
@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
+import java.io.IOException;
+import java.io.OutputStream;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 
 /**
@@ -51,6 +53,74 @@
   public static double sign(long v) {
     return v >= 0 ? 1.0 : -1.0;
   }
 
+  // For casting integral types to boolean.
+  public static long toBool(long v) {
+    return v == 0 ? 0 : 1;
+  }
+
+  // For casting floating point types to boolean.
+  public static long toBool(double v) {
+    return v == 0.0D ? 0L : 1L;
+  }
+
+  /* Convert an integer value in milliseconds since the epoch to a timestamp value
+   * for use in a long column vector, which is represented in nanoseconds since the epoch.
+   */
+  public static long longToTimestamp(long v) {
+    return v * 1000000;
+  }
+
+  // Convert seconds since the epoch (with fraction) to nanoseconds, as a long integer.
+  public static long doubleToTimestamp(double v) {
+    return (long) (v * 1000000000.0);
+  }
+
+  /* Convert an integer value representing a timestamp in nanoseconds to one
+   * that represents a timestamp in seconds (since the epoch).
+   */
+  public static long fromTimestamp(long v) {
+    return v / 1000000000;
+  }
+
+  /* Convert an integer value representing a timestamp in nanoseconds to one
+   * that represents a timestamp in seconds, with fraction, since the epoch.
+   */
+  public static double fromTimestampToDouble(long v) {
+    return ((double) v) / 1000000000.0;
+  }
+
+  /* Convert a long to a string. The string is output into the argument
+   * byte array, beginning at character 0. The length is returned.
+   */
+  public static int writeLongToUTF8(byte[] result, long i) {
+    if (i == 0) {
+      result[0] = '0';
+      return 1;
+    }
+
+    int current = 0;
+
+    if (i < 0) {
+      result[current++] = '-';
+    } else {
+      // The negative range is bigger than the positive range, so there is no risk
+      // of overflow from negating here (Long.MIN_VALUE has no positive counterpart).
+      i = -i;
+    }
+
+    long start = 1000000000000000000L;
+    while (i / start == 0) {
+      start /= 10;
+    }
+
+    while (start > 0) {
+      // i is negative here, so (i / start % 10) is in [-9, 0], and subtracting it
+      // from '0' yields the correct digit character.
+      result[current++] = (byte) ('0' - (i / start % 10));
+      start /= 10;
+    }
+
+    return current;
+  }
+
   // Convert all NaN values in vector v to NULL. Should only be used if n > 0.
   public static void NaNToNull(DoubleColumnVector v, int[] sel, boolean selectedInUse, int n) {
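Note the asymmetry between the two to-timestamp conversions above: longToTimestamp scales by 10^6 (per its comment, the long input is taken as milliseconds), while doubleToTimestamp scales by 10^9 (the double input is taken as seconds, so the fractional part survives). A quick worked check under those documented units:

// Worked example of the scaling factors (units as documented above).
long a = MathExpr.longToTimestamp(2000L);               // 2000 ms -> 2000000000 ns
long b = MathExpr.doubleToTimestamp(2.0);               // 2.0 s   -> 2000000000 ns
long c = MathExpr.fromTimestamp(b);                     // 2000000000 ns -> 2 s
double d = MathExpr.fromTimestampToDouble(1500000000L); // 1500000000 ns -> 1.5 s

fromTimestamp truncates toward zero for sub-second values (integer division), while fromTimestampToDouble preserves the fraction.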
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 54e9a03..96c51fb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -107,8 +107,15 @@
 import org.apache.hadoop.hive.ql.udf.UDFSqrt;
 import org.apache.hadoop.hive.ql.udf.UDFSubstr;
 import org.apache.hadoop.hive.ql.udf.UDFTan;
+import org.apache.hadoop.hive.ql.udf.UDFToBoolean;
+import org.apache.hadoop.hive.ql.udf.UDFToByte;
+import org.apache.hadoop.hive.ql.udf.UDFToDouble;
+import org.apache.hadoop.hive.ql.udf.UDFToFloat;
+import org.apache.hadoop.hive.ql.udf.UDFToInteger;
+import org.apache.hadoop.hive.ql.udf.UDFToLong;
+import org.apache.hadoop.hive.ql.udf.UDFToShort;
+import org.apache.hadoop.hive.ql.udf.UDFToString;
 import org.apache.hadoop.hive.ql.udf.UDFTrim;
-import org.apache.hadoop.hive.ql.udf.UDFUnhex;
 import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
 import org.apache.hadoop.hive.ql.udf.UDFYear;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
@@ -127,6 +134,7 @@
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTimestamp;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUpper;
 
@@ -222,6 +230,17 @@ public Vectorizer() {
     supportedGenericUDFs.add(GenericUDFConcat.class);
     supportedGenericUDFs.add(GenericUDFAbs.class);
 
+    // For type casts
+    supportedGenericUDFs.add(UDFToLong.class);
+    supportedGenericUDFs.add(UDFToInteger.class);
+    supportedGenericUDFs.add(UDFToShort.class);
+    supportedGenericUDFs.add(UDFToByte.class);
+    supportedGenericUDFs.add(UDFToBoolean.class);
+    supportedGenericUDFs.add(UDFToFloat.class);
+    supportedGenericUDFs.add(UDFToDouble.class);
+    supportedGenericUDFs.add(UDFToString.class);
+    supportedGenericUDFs.add(GenericUDFTimestamp.class);
+
     supportedAggregationUdfs.add("min");
     supportedAggregationUdfs.add("max");
     supportedAggregationUdfs.add("count");
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java
new file mode 100644
index 0000000..6315975
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java
@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import junit.framework.Assert;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.*;
+import org.junit.Test;
+
+/**
+ * Test VectorExpression classes for vectorized implementations of type casts.
+ */
+public class TestVectorTypeCasts {
+
+  // Number of nanoseconds in one second
+  private static final long NANOS_PER_SECOND = 1000000000;
+
+  // Number of nanoseconds in one millisecond (the factor applied by
+  // MathExpr.longToTimestamp, which converts milliseconds to nanoseconds)
+  private static final long NANOS_PER_MILLI = 1000000;
+
+  @Test
+  public void testVectorCastLongToDouble() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchLongInDoubleOut();
+    DoubleColumnVector resultV = (DoubleColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastLongToDouble(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(2.0, resultV.vector[4]);
+  }
+
+  @Test
+  public void testVectorCastDoubleToLong() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchDoubleInLongOut();
+    LongColumnVector resultV = (LongColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastDoubleToLong(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(1, resultV.vector[6]);
+  }
+
+  @Test
+  public void testCastDoubleToBoolean() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchDoubleInLongOut();
+    LongColumnVector resultV = (LongColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastDoubleToBooleanViaDoubleToLong(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(0, resultV.vector[3]);
+    Assert.assertEquals(1, resultV.vector[4]);
+  }
+
+  @Test
+  public void testCastDoubleToTimestamp() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchDoubleInLongOut();
+    LongColumnVector resultV = (LongColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastDoubleToTimestampViaDoubleToLong(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(0, resultV.vector[3]);
+    Assert.assertEquals((long) (0.5d * NANOS_PER_SECOND), resultV.vector[4]);
+  }
+
+  @Test
+  public void testCastLongToBoolean() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchLongInLongOut();
+    LongColumnVector inV = (LongColumnVector) b.cols[0];
+    inV.vector[0] = 0; // make one entry produce false in the result
+    LongColumnVector resultV = (LongColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastLongToBooleanViaLongToLong(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(0, resultV.vector[0]);
+    Assert.assertEquals(1, resultV.vector[1]);
+  }
+
+  @Test
+  public void testCastLongToTimestamp() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchLongInLongOut();
+    LongColumnVector resultV = (LongColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastLongToTimestampViaLongToLong(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(-2 * NANOS_PER_MILLI, resultV.vector[0]);
+    Assert.assertEquals(2 * NANOS_PER_MILLI, resultV.vector[1]);
+  }
+
+  @Test
+  public void testCastTimestampToLong() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchLongInLongOut();
+    LongColumnVector inV = (LongColumnVector) b.cols[0];
+    inV.vector[0] = NANOS_PER_SECOND; // make one entry produce an interesting result
+                                      // (1 second after the epoch)
+    LongColumnVector resultV = (LongColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastTimestampToLongViaLongToLong(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(1, resultV.vector[0]);
+  }
+
+  @Test
+  public void testCastTimestampToDouble() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchLongInDoubleOut();
+    DoubleColumnVector resultV = (DoubleColumnVector) b.cols[1];
+    b.cols[0].noNulls = true;
+    VectorExpression expr = new CastTimestampToDoubleViaLongToDouble(0, 1);
+    expr.evaluate(b);
+    Assert.assertEquals(-1E-9D, resultV.vector[1]);
+    Assert.assertEquals(1E-9D, resultV.vector[3]);
+  }
+
+  public byte[] toBytes(String s) {
+    byte[] b = null;
+    try {
+      b = s.getBytes("UTF-8");
+    } catch (Exception e) {
+      throw new RuntimeException("Could not convert string to UTF-8 byte array.");
+    }
+    return b;
+  }
+
+  @Test
+  public void testCastLongToString() {
+    VectorizedRowBatch b = TestVectorMathFunctions.getBatchForStringMath();
+    BytesColumnVector resultV = (BytesColumnVector) b.cols[2];
+    b.cols[1].noNulls = true;
+    VectorExpression expr = new CastLongToString(1, 2);
+    expr.evaluate(b);
+    byte[] num255 = toBytes("255");
+    Assert.assertEquals(0,
+        StringExpr.compare(num255, 0, num255.length,
+            resultV.vector[1], resultV.start[1], resultV.length[1]));
+  }
+
+  @Test
+  public void testCastBooleanToString() {
+    byte[] t = toBytes("TRUE");
+    byte[] f = toBytes("FALSE");
+    VectorizedRowBatch b = TestVectorMathFunctions.getBatchForStringMath();
+    LongColumnVector inV = (LongColumnVector) b.cols[1];
+    BytesColumnVector resultV = (BytesColumnVector) b.cols[2];
+    inV.vector[1] = 1;
+    VectorExpression expr = new CastBooleanToStringViaLongToString(1, 2);
+    expr.evaluate(b);
+    Assert.assertEquals(0,
+        StringExpr.compare(f, 0, f.length,
+            resultV.vector[0], resultV.start[0], resultV.length[0]));
+    Assert.assertEquals(0,
+        StringExpr.compare(t, 0, t.length,
+            resultV.vector[1], resultV.start[1], resultV.length[1]));
+  }
+}
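One boundary the suite does not cover is Long.MIN_VALUE, which is exactly the case writeLongToUTF8's negate-into-the-negative-domain trick exists for (plain negation or Math.abs overflows there). A candidate extra test method, offered as a suggestion rather than as part of the patch:

  @Test
  public void testWriteLongToUTF8Boundaries() {
    byte[] buf = new byte[20];  // sign plus 19 digits is the longest possible long
    int len = MathExpr.writeLongToUTF8(buf, Long.MIN_VALUE);
    Assert.assertEquals("-9223372036854775808", new String(buf, 0, len));
    len = MathExpr.writeLongToUTF8(buf, 0L);
    Assert.assertEquals("0", new String(buf, 0, len));
  }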