diff --git a/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java b/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java
index 2ee0451..a9abb2e 100644
--- a/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java
+++ b/ant/src/org/apache/hadoop/hive/ant/GenVectorCode.java
@@ -215,6 +215,9 @@
       {"FilterStringColumnCompareScalar", "Greater", ">"},
       {"FilterStringColumnCompareScalar", "GreaterEqual", ">="},
 
+      {"FilterStringColumnBetween", ""},
+      {"FilterStringColumnBetween", "!"},
+
       {"StringColumnCompareScalar", "Equal", "=="},
       {"StringColumnCompareScalar", "NotEqual", "!="},
       {"StringColumnCompareScalar", "Less", "<"},
@@ -276,6 +279,11 @@
       {"FilterColumnCompareColumn", "GreaterEqual", "long", "long", ">="},
       {"FilterColumnCompareColumn", "GreaterEqual", "double", "long", ">="},
 
+      {"FilterColumnBetween", "long", ""},
+      {"FilterColumnBetween", "double", ""},
+      {"FilterColumnBetween", "long", "!"},
+      {"FilterColumnBetween", "double", "!"},
+
       {"ColumnCompareColumn", "Equal", "long", "double", "=="},
       {"ColumnCompareColumn", "Equal", "double", "double", "=="},
       {"ColumnCompareColumn", "NotEqual", "long", "double", "!="},
@@ -511,6 +519,8 @@ private void generate() throws Exception {
         generateFilterColumnCompareScalar(tdesc);
       } else if (tdesc[0].equals("FilterScalarCompareColumn")) {
         generateFilterScalarCompareColumn(tdesc);
+      } else if (tdesc[0].equals("FilterColumnBetween")) {
+        generateFilterColumnBetween(tdesc);
       } else if (tdesc[0].equals("ScalarArithmeticColumn")) {
         generateScalarArithmeticColumn(tdesc);
       } else if (tdesc[0].equals("FilterColumnCompareColumn")) {
@@ -535,6 +545,8 @@ private void generate() throws Exception {
         generateVectorUDAFVar(tdesc);
       } else if (tdesc[0].equals("FilterStringColumnCompareScalar")) {
         generateFilterStringColumnCompareScalar(tdesc);
+      } else if (tdesc[0].equals("FilterStringColumnBetween")) {
+        generateFilterStringColumnBetween(tdesc);
       } else if (tdesc[0].equals("StringColumnCompareScalar")) {
         generateStringColumnCompareScalar(tdesc);
       } else if (tdesc[0].equals("FilterStringScalarCompareColumn")) {
@@ -553,6 +565,40 @@ private void generate() throws Exception {
     testCodeGen.generateTestSuites();
   }
 
+  private void generateFilterStringColumnBetween(String[] tdesc) throws IOException {
+    String optionalNot = tdesc[1];
+    String className = "FilterStringColumn" + (optionalNot.equals("!") ? "Not" : "")
+        + "Between";
+    String outputFile = joinPath(this.expressionOutputDirectory, className + ".java");
+
+    // Read the template into a string, expand it, and write it.
+    String templateFile = joinPath(this.expressionTemplateDirectory, tdesc[0] + ".txt");
+    String templateString = readFile(templateFile);
+    templateString = templateString.replaceAll("<ClassName>", className);
+    templateString = templateString.replaceAll("<OptionalNot>", optionalNot);
+    writeFile(outputFile, templateString);
+  }
+
+  private void generateFilterColumnBetween(String[] tdesc) throws IOException {
+    String operandType = tdesc[1];
+    String optionalNot = tdesc[2];
+
+    String className = "Filter" + getCamelCaseType(operandType) + "Column" +
+        (optionalNot.equals("!") ? "Not" : "") + "Between";
+    String inputColumnVectorType = getColumnVectorType(operandType);
+    String outputFile = joinPath(this.expressionOutputDirectory, className + ".java");
+
+    // Read the template into a string, expand it, and write it.
+    String templateFile = joinPath(this.expressionTemplateDirectory, tdesc[0] + ".txt");
+    String templateString = readFile(templateFile);
+    templateString = templateString.replaceAll("<ClassName>", className);
+    templateString = templateString.replaceAll("<InputColumnVectorType>", inputColumnVectorType);
+    templateString = templateString.replaceAll("<OperandType>", operandType);
+    templateString = templateString.replaceAll("<OptionalNot>", optionalNot);
+
+    writeFile(outputFile, templateString);
+  }
+
   private void generateColumnCompareColumn(String[] tdesc) throws IOException {
     //The variables are all same as ColumnCompareScalar except that
     //this template doesn't need a return type. Pass anything as return type.
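Note (editor's illustration, not part of the patch): the two generator methods above are plain string substitution over the template files that follow. A minimal, self-contained sketch of what the expansion does for the descriptor {"FilterColumnBetween", "double", "!"}, assuming the placeholder names used in the replaceAll calls; the real generator derives "Double" via getCamelCaseType().

  // Hypothetical stand-alone sketch of the substitution performed by generateFilterColumnBetween.
  public class ExpandBetweenTemplateSketch {
    public static void main(String[] args) {
      String operandType = "double";
      String optionalNot = "!";                  // "" for BETWEEN, "!" for NOT BETWEEN
      String className = "Filter" + "Double" + "Column"
          + (optionalNot.equals("!") ? "Not" : "") + "Between";
      String templateLine =
          "if (<OptionalNot>(leftValue <= vector[i] && vector[i] <= rightValue)) {";
      String expanded = templateLine
          .replaceAll("<ClassName>", className)
          .replaceAll("<OperandType>", operandType)
          .replaceAll("<OptionalNot>", optionalNot);
      // className -> FilterDoubleColumnNotBetween
      // expanded  -> if (!(leftValue <= vector[i] && vector[i] <= rightValue)) {
      System.out.println(className + "\n" + expanded);
    }
  }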
diff --git a/ql/src/gen/vectorization/ExpressionTemplates/FilterColumnBetween.txt b/ql/src/gen/vectorization/ExpressionTemplates/FilterColumnBetween.txt
new file mode 100644
index 0000000..e9aaaf2
--- /dev/null
+++ b/ql/src/gen/vectorization/ExpressionTemplates/FilterColumnBetween.txt
@@ -0,0 +1,192 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions.gen;
+
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.exec.vector.<InputColumnVectorType>;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
+
+/**
+ * Generated from template FilterColumnBetween.txt, which covers [NOT] BETWEEN filter
+ * expressions where a column is [NOT] between one scalar and another.
+ * Output is not produced in a separate column. The selected vector of the input
+ * {@link VectorizedRowBatch} is updated for in-place filtering.
+ */
+public class <ClassName> extends VectorExpression {
+
+  private static final long serialVersionUID = 1L;
+
+  private int colNum;
+
+  // The comparison is of the form "column BETWEEN leftValue AND rightValue"
+  private <OperandType> leftValue;
+  private <OperandType> rightValue;
+
+  public <ClassName>(int colNum, <OperandType> leftValue, <OperandType> rightValue) {
+    this.colNum = colNum;
+    this.leftValue = leftValue;
+    this.rightValue = rightValue;
+  }
+
+  public <ClassName>() {
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    <InputColumnVectorType> inputColVector = (<InputColumnVectorType>) batch.cols[colNum];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    <OperandType>[] vector = inputColVector.vector;
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero.
+        // Repeating property will not change.
+        if (<OptionalNot>(vector[0] < leftValue || vector[0] > rightValue)) {
+
+          // Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (<OptionalNot>(leftValue <= vector[i] && vector[i] <= rightValue)) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (<OptionalNot>(leftValue <= vector[i] && vector[i] <= rightValue)) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero.
+        // Repeating property will not change.
+        if (!nullPos[0]) {
+          if (<OptionalNot>(vector[0] < leftValue || vector[0] > rightValue)) {
+
+            // Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+            if (<OptionalNot>(leftValue <= vector[i] && vector[i] <= rightValue)) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+
+        //Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (<OptionalNot>(leftValue <= vector[i] && vector[i] <= rightValue)) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+
+  public int getColNum() {
+    return colNum;
+  }
+
+  public void setColNum(int colNum) {
+    this.colNum = colNum;
+  }
+
+  public <OperandType> getLeftValue() {
+    return leftValue;
+  }
+
+  public void setLeftValue(<OperandType> value) {
+    this.leftValue = value;
+  }
+
+  public <OperandType> getRightValue() {
+    return rightValue;
+  }
+
+  public void setRightValue(<OperandType> value) {
+    this.rightValue = value;
+  }
+
+  @Override
+  public VectorExpressionDescriptor.Descriptor getDescriptor() {
+    return (new VectorExpressionDescriptor.Builder())
+        .setMode(
+            VectorExpressionDescriptor.Mode.FILTER)
+        .setNumArguments(3)
+        .setArgumentTypes(
+            VectorExpressionDescriptor.ArgumentType.getType("<OperandType>"),
+            VectorExpressionDescriptor.ArgumentType.getType("<OperandType>"),
+            VectorExpressionDescriptor.ArgumentType.getType("<OperandType>"))
+        .setInputExpressionTypes(
+            VectorExpressionDescriptor.InputExpressionType.COLUMN,
+            VectorExpressionDescriptor.InputExpressionType.SCALAR,
+            VectorExpressionDescriptor.InputExpressionType.SCALAR).build();
+  }
+}
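Note (editor's illustration, not part of the patch): the template above never produces an output column; it narrows the batch in place by rewriting the selected array and batch.size. The same protocol, stripped down to plain arrays with made-up values and bounds:

  // Standalone sketch of the in-place filter protocol used by the generated classes.
  public class InPlaceFilterSketch {
    public static void main(String[] args) {
      long[] vector = {5, 20, 17, 15, 10};   // column values for one batch
      int[] sel = new int[vector.length];    // selected-row indexes, rewritten in place
      int size = vector.length;              // rows logically in the batch
      long leftValue = 15, rightValue = 17;

      // Same loop shape as the "noNulls && !selectedInUse" branch of the template.
      int newSize = 0;
      for (int i = 0; i != size; i++) {
        if (leftValue <= vector[i] && vector[i] <= rightValue) {
          sel[newSize++] = i;
        }
      }
      size = newSize;                        // batch.size = newSize; selectedInUse = true
      for (int j = 0; j < size; j++) {
        System.out.println("row " + sel[j] + " survives");  // rows 2 and 3
      }
    }
  }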
diff --git a/ql/src/gen/vectorization/ExpressionTemplates/FilterStringColumnBetween.txt b/ql/src/gen/vectorization/ExpressionTemplates/FilterStringColumnBetween.txt
new file mode 100644
index 0000000..8b6e4b7
--- /dev/null
+++ b/ql/src/gen/vectorization/ExpressionTemplates/FilterStringColumnBetween.txt
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions.gen;
+
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
+
+
+/**
+ * This is a generated class to evaluate a [NOT] BETWEEN comparison on a vector of strings.
+ */
+public class <ClassName> extends VectorExpression {
+
+  private static final long serialVersionUID = 1L;
+
+  private int colNum;
+  private byte[] left;
+  private byte[] right;
+
+  public <ClassName>(int colNum, byte[] left, byte[] right) {
+    this.colNum = colNum;
+    this.left = left;
+    this.right = right;
+  }
+
+  public <ClassName>() {
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] length = inputColVector.length;
+    int[] start = inputColVector.start;
+
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero. Repeating property will not change.
+        if (<OptionalNot>(StringExpr.compare(vector[0], start[0], length[0], left, 0, left.length) < 0
+            || StringExpr.compare(right, 0, right.length, vector[0], start[0], length[0]) < 0)) {
+
+          //Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (<OptionalNot>(StringExpr.compare(left, 0, left.length, vector[i], start[i], length[i]) <= 0
+              && StringExpr.compare(vector[i], start[i], length[i], right, 0, right.length) <= 0)) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (<OptionalNot>(StringExpr.compare(left, 0, left.length, vector[i], start[i], length[i]) <= 0
+              && StringExpr.compare(vector[i], start[i], length[i], right, 0, right.length) <= 0)) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero. Repeating property will not change.
+        if (!nullPos[0]) {
+          if (<OptionalNot>(StringExpr.compare(vector[0], start[0], length[0], left, 0, left.length) < 0
+              || StringExpr.compare(right, 0, right.length, vector[0], start[0], length[0]) < 0)) {
+
+            //Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j=0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+            if (<OptionalNot>(StringExpr.compare(left, 0, left.length, vector[i], start[i], length[i]) <= 0
+                && StringExpr.compare(vector[i], start[i], length[i], right, 0, right.length) <= 0)) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+
+        //Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (<OptionalNot>(StringExpr.compare(left, 0, left.length, vector[i], start[i], length[i]) <= 0
+                && StringExpr.compare(vector[i], start[i], length[i], right, 0, right.length) <= 0)) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+
+  public int getColNum() {
+    return colNum;
+  }
+
+  public void setColNum(int colNum) {
+    this.colNum = colNum;
+  }
+
+  public byte[] getLeft() {
+    return left;
+  }
+
+  public void setLeft(byte[] value) {
+    this.left = value;
+  }
+
+  public byte[] getRight() {
+    return right;
+  }
+
+  public void setRight(byte[] value) {
+    this.right = value;
+  }
+
+  @Override
+  public VectorExpressionDescriptor.Descriptor getDescriptor() {
+    return (new VectorExpressionDescriptor.Builder())
+        .setMode(
+            VectorExpressionDescriptor.Mode.FILTER)
+        .setNumArguments(3)
+        .setArgumentTypes(
+            VectorExpressionDescriptor.ArgumentType.getType("string"),
+            VectorExpressionDescriptor.ArgumentType.getType("string"),
+            VectorExpressionDescriptor.ArgumentType.getType("string"))
+        .setInputExpressionTypes(
+            VectorExpressionDescriptor.InputExpressionType.COLUMN,
+            VectorExpressionDescriptor.InputExpressionType.SCALAR,
+            VectorExpressionDescriptor.InputExpressionType.SCALAR).build();
+  }
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 4a9b870..c7a05ee 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.ql.exec.vector;
 
 import java.lang.reflect.Constructor;
+import java.sql.Timestamp;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -61,6 +62,12 @@
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.CastLongToBooleanViaLongToLong;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.CastLongToDouble;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.CastTimestampToDoubleViaLongToDouble;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnNotBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnNotBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnNotBetween;
 import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor;
 import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFArgDesc;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -242,7 +249,7 @@ public VectorExpression getVectorExpression(ExprNodeDesc exprDesc, Mode mode) th
       ve = getColumnVectorExpression((ExprNodeColumnDesc) exprDesc, mode);
     } else if (exprDesc instanceof ExprNodeGenericFuncDesc) {
       ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) exprDesc;
-      if (isCustomUDF(expr) || isLegacyPathUDF(expr)) {
+      if (isCustomUDF(expr) || isNonVectorizedPathUDF(expr)) {
         ve = getCustomUDFExpression(expr);
       } else {
         ve = getGenericUdfVectorExpression(expr.getGenericUDF(),
@@ -264,7 +271,7 @@ public VectorExpression getVectorExpression(ExprNodeDesc exprDesc, Mode mode) th
    * Depending on performance requirements and frequency of use, these
    * may be implemented in the future with an optimized VectorExpression.
    */
-  public static boolean isLegacyPathUDF(ExprNodeGenericFuncDesc expr) {
+  public static boolean isNonVectorizedPathUDF(ExprNodeGenericFuncDesc expr) {
     GenericUDF gudf = expr.getGenericUDF();
     if (gudf instanceof GenericUDFBridge) {
       GenericUDFBridge bridge = (GenericUDFBridge) gudf;
@@ -356,6 +363,21 @@ private ExprNodeDesc foldConstantsForUnaryExpression(ExprNodeDesc exprDesc) thro
     }
   }
 
+  /* Fold simple unary expressions in all members of the input list and return new list
+   * containing results.
+   */
+  private List<ExprNodeDesc> foldConstantsForUnaryExprs(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    List<ExprNodeDesc> constantFoldedChildren = new ArrayList<ExprNodeDesc>();
+    if (childExpr != null) {
+      for (ExprNodeDesc expr : childExpr) {
+        expr = this.foldConstantsForUnaryExpression(expr);
+        constantFoldedChildren.add(expr);
+      }
+    }
+    return constantFoldedChildren;
+  }
+
   private VectorExpression getConstantVectorExpression(ExprNodeConstantDesc exprDesc, Mode mode)
       throws HiveException {
     String type = exprDesc.getTypeString();
@@ -522,7 +544,9 @@ private VectorExpression instantiateExpression(Class<?> vclass, Object...args)
   private VectorExpression getGenericUdfVectorExpression(GenericUDF udf,
       List<ExprNodeDesc> childExpr, Mode mode) throws HiveException {
     //First handle special cases
-    if (udf instanceof GenericUDFBridge) {
+    if (udf instanceof GenericUDFBetween) {
+      return getBetweenFilterExpression(childExpr);
+    } else if (udf instanceof GenericUDFBridge) {
       VectorExpression v = getGenericUDFBridgeVectorExpression((GenericUDFBridge) udf, childExpr, mode);
       if (v != null) {
         return v;
@@ -535,13 +559,7 @@ private VectorExpression getGenericUdfVectorExpression(GenericUDF udf,
       udfClass = ((GenericUDFBridge) udf).getUdfClass();
     }
 
-    List<ExprNodeDesc> constantFoldedChildren = new ArrayList<ExprNodeDesc>();
-    if (childExpr != null) {
-      for (ExprNodeDesc expr : childExpr) {
-        expr = this.foldConstantsForUnaryExpression(expr);
-        constantFoldedChildren.add(expr);
-      }
-    }
+    List<ExprNodeDesc> constantFoldedChildren = foldConstantsForUnaryExprs(childExpr);
     VectorExpression ve = getVectorExpressionForUdf(udfClass, constantFoldedChildren, mode);
     if (ve == null) {
       throw new HiveException("Udf: "+udf.getClass().getSimpleName()+", is not supported");
@@ -638,6 +656,60 @@ private VectorExpression getCastToLongExpression(List<ExprNodeDesc> childExpr)
     return null;
   }
 
+  /* Get a [NOT] BETWEEN filter expression. This is treated as a special case
+   * because the NOT is actually specified in the expression tree as the first argument,
+   * and we don't want any runtime cost for that. So creating the VectorExpression
+   * needs to be done differently than the standard way where all arguments are
+   * passed to the VectorExpression constructor.
+   */
+  private VectorExpression getBetweenFilterExpression(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+
+    boolean notKeywordPresent = (Boolean) ((ExprNodeConstantDesc) childExpr.get(0)).getValue();
+    ExprNodeDesc colExpr = childExpr.get(1);
+
+    // To hold left and right boundaries as long value in nanos for timestamp type.
+    long left, right;
+    List<ExprNodeDesc> newChildren;
+
+    String colType = colExpr.getTypeString();
+
+    // prepare arguments for createVectorExpression
+    List<ExprNodeDesc> childrenAfterNot = foldConstantsForUnaryExprs(childExpr.subList(1, 4));
+
+    // determine class
+    Class<?> cl = null;
+    if (isIntFamily(colType) && !notKeywordPresent) {
+      cl = FilterLongColumnBetween.class;
+    } else if (isIntFamily(colType) && notKeywordPresent) {
+      cl = FilterLongColumnNotBetween.class;
+    } else if (isFloatFamily(colType) && !notKeywordPresent) {
+      cl = FilterDoubleColumnBetween.class;
+    } else if (isFloatFamily(colType) && notKeywordPresent) {
+      cl = FilterDoubleColumnNotBetween.class;
+    } else if (colType.equals("string") && !notKeywordPresent) {
+      cl = FilterStringColumnBetween.class;
+    } else if (colType.equals("string") && notKeywordPresent) {
+      cl = FilterStringColumnNotBetween.class;
+    } else if (colType.equals("timestamp")) {
+
+      // Get timestamp boundary values as longs instead of the expected strings
+      left = getTimestampScalar(childExpr.get(2));
+      right = getTimestampScalar(childExpr.get(3));
+      childrenAfterNot = new ArrayList<ExprNodeDesc>();
+      childrenAfterNot.add(colExpr);
+      childrenAfterNot.add(new ExprNodeConstantDesc(left));
+      childrenAfterNot.add(new ExprNodeConstantDesc(right));
+      if (notKeywordPresent) {
+        cl = FilterLongColumnNotBetween.class;
+      } else {
+        cl = FilterLongColumnBetween.class;
+      }
+    }
+
+    return createVectorExpression(cl, childrenAfterNot, Mode.PROJECTION);
+  }
+
   /*
    * Return vector expression for a custom (i.e. not built-in) UDF.
    */
@@ -767,6 +839,44 @@ private Object getScalarValue(ExprNodeConstantDesc constDesc)
     }
   }
 
+  // Get a timestamp as a long in number of nanos, from a string constant.
+  private long getTimestampScalar(ExprNodeDesc expr) throws HiveException {
+    if (!(expr instanceof ExprNodeConstantDesc)) {
+      throw new HiveException("Constant timestamp value expected for expression argument. " +
+          "Non-constant argument not supported for vectorization.");
+    }
+    ExprNodeConstantDesc constExpr = (ExprNodeConstantDesc) expr;
+    if (constExpr.getTypeString().equals("string")) {
+
+      // create expression tree with type cast from string to timestamp
+      ExprNodeGenericFuncDesc expr2 = new ExprNodeGenericFuncDesc();
+      GenericUDFTimestamp f = new GenericUDFTimestamp();
+      expr2.setGenericUDF(f);
+      ArrayList<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
+      children.add(expr);
+      expr2.setChildren(children);
+
+      // initialize and evaluate
+      ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(expr2);
+      ObjectInspector output = evaluator.initialize(null);
+      Object constant = evaluator.evaluate(null);
+      Object java = ObjectInspectorUtils.copyToStandardJavaObject(constant, output);
+
+      if (!(java instanceof Timestamp)) {
+        throw new HiveException("Udf: failed to convert from string to timestamp");
+      }
+      Timestamp ts = (Timestamp) java;
+      long result = ts.getTime();
+      result *= 1000000;  // shift left 6 digits to make room for nanos below ms precision
+      result += ts.getNanos() % 1000000;  // add in nanos, after removing the ms portion
+      return result;
+    }
+
+    throw new HiveException("Udf: unhandled constant type for scalar argument. "
" + + "Expecting string."); + } + + private Constructor getConstructor(Class cl) throws HiveException { try { Constructor [] ctors = cl.getDeclaredConstructors(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index e698870..a4adb96 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -125,6 +125,7 @@ import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower; @@ -235,6 +236,7 @@ public Vectorizer() { supportedGenericUDFs.add(GenericUDFUpper.class); supportedGenericUDFs.add(GenericUDFConcat.class); supportedGenericUDFs.add(GenericUDFAbs.class); + supportedGenericUDFs.add(GenericUDFBetween.class); // For type casts supportedGenericUDFs.add(UDFToLong.class); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java index 90ab983..1e49a2e 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java @@ -31,12 +31,18 @@ import org.apache.hadoop.hive.ql.exec.vector.expressions.*; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.DoubleColUnaryMinus; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColLessDoubleScalar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnBetween; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnNotBetween; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColEqualLongScalar; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColGreaterLongScalar; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColLessDoubleScalar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnBetween; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnNotBetween; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongScalarGreaterLongColumn; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColGreaterStringColumn; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColGreaterStringScalar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnBetween; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnNotBetween; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncLnDoubleToDouble; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncRoundDoubleToDouble; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncSinDoubleToDouble; @@ -66,6 +72,7 @@ import org.apache.hadoop.hive.ql.udf.UDFSin; import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import 
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index e698870..a4adb96 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -125,6 +125,7 @@
 import org.apache.hadoop.hive.ql.udf.UDFYear;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower;
@@ -235,6 +236,7 @@ public Vectorizer() {
     supportedGenericUDFs.add(GenericUDFUpper.class);
     supportedGenericUDFs.add(GenericUDFConcat.class);
     supportedGenericUDFs.add(GenericUDFAbs.class);
+    supportedGenericUDFs.add(GenericUDFBetween.class);
 
     // For type casts
     supportedGenericUDFs.add(UDFToLong.class);
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
index 90ab983..1e49a2e 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
@@ -31,12 +31,18 @@
 import org.apache.hadoop.hive.ql.exec.vector.expressions.*;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.DoubleColUnaryMinus;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColLessDoubleScalar;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnNotBetween;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColEqualLongScalar;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColGreaterLongScalar;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColLessDoubleScalar;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnNotBetween;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongScalarGreaterLongColumn;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColGreaterStringColumn;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColGreaterStringScalar;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnNotBetween;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncLnDoubleToDouble;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncRoundDoubleToDouble;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncSinDoubleToDouble;
@@ -66,6 +72,7 @@
 import org.apache.hadoop.hive.ql.udf.UDFSin;
 import org.apache.hadoop.hive.ql.udf.UDFYear;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
@@ -859,4 +866,60 @@ public void testTimeStampUdfs() throws HiveException {
     ve = vc.getVectorExpression(tsFuncExpr);
     Assert.assertEquals(VectorUDFUnixTimeStampLong.class, ve.getClass());
   }
+
+  @Test
+  public void testBetweenFilters() throws HiveException {
+    ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(String.class, "col1", "table", false);
+    ExprNodeConstantDesc constDesc = new ExprNodeConstantDesc("Alpha");
+    ExprNodeConstantDesc constDesc2 = new ExprNodeConstantDesc("Bravo");
+
+    // string BETWEEN
+    GenericUDFBetween udf = new GenericUDFBetween();
+    ExprNodeGenericFuncDesc exprDesc = new ExprNodeGenericFuncDesc();
+    exprDesc.setGenericUDF(udf);
+    List<ExprNodeDesc> children1 = new ArrayList<ExprNodeDesc>();
+    children1.add(new ExprNodeConstantDesc(new Boolean(false))); // no NOT keyword
+    children1.add(col1Expr);
+    children1.add(constDesc);
+    children1.add(constDesc2);
+    exprDesc.setChildren(children1);
+
+    Map<String, Integer> columnMap = new HashMap<String, Integer>();
+    columnMap.put("col1", 1);
+    columnMap.put("col2", 2);
+    VectorizationContext vc = new VectorizationContext(columnMap, 2);
+    VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterStringColumnBetween);
+
+    // string NOT BETWEEN
+    children1.set(0, new ExprNodeConstantDesc(new Boolean(true))); // has NOT keyword
+    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterStringColumnNotBetween);
+
+    // long BETWEEN
+    children1.set(0, new ExprNodeConstantDesc(new Boolean(false)));
+    children1.set(1, new ExprNodeColumnDesc(Long.class, "col1", "table", false));
+    children1.set(2, new ExprNodeConstantDesc(10));
+    children1.set(3, new ExprNodeConstantDesc(20));
+    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterLongColumnBetween);
+
+    // long NOT BETWEEN
+    children1.set(0, new ExprNodeConstantDesc(new Boolean(true)));
+    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterLongColumnNotBetween);
+
+    // double BETWEEN
+    children1.set(0, new ExprNodeConstantDesc(new Boolean(false)));
+    children1.set(1, new ExprNodeColumnDesc(Double.class, "col1", "table", false));
+    children1.set(2, new ExprNodeConstantDesc(10.0d));
+    children1.set(3, new ExprNodeConstantDesc(20.0d));
+    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterDoubleColumnBetween);
+
+    // double NOT BETWEEN
+    children1.set(0, new ExprNodeConstantDesc(new Boolean(true)));
+    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterDoubleColumnNotBetween);
+  }
 }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java
index 3687c46..24da692 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java
@@ -22,14 +22,22 @@
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterDoubleColumnNotBetween;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColEqualLongScalar;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColGreaterLongColumn;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColGreaterLongScalar;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColLessLongColumn;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongColumnNotBetween;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongScalarGreaterLongColumn;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterLongScalarLessLongColumn;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnBetween;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColumnNotBetween;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.LongColAddLongScalar;
 import org.apache.hadoop.hive.ql.exec.vector.util.VectorizedRowGroupGenUtil;
 import org.junit.Assert;
@@ -278,4 +286,261 @@ public void testFilterLongScalarLessLongColumn() {
     expr1.evaluate(vrb3);
     assertEquals(0, vrb3.size);
   }
+
+  @Test
+  public void testFilterLongBetween() {
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+    LongColumnVector lcv0 = (LongColumnVector) vrb.cols[0];
+    VectorExpression expr1 = new FilterLongColumnBetween(0, 15, 17);
+
+    //Basic case
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+
+    expr1.evaluate(vrb);
+
+    assertEquals(2, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(2, vrb.selected[0]);
+    assertEquals(3, vrb.selected[1]);
+
+    //With nulls
+    VectorizedRowBatch vrb1 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+
+    lcv0 = (LongColumnVector) vrb1.cols[0];
+
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+
+    lcv0.noNulls = false;
+    lcv0.isNull[0] = true;
+    lcv0.isNull[2] = true;
+
+    expr1.evaluate(vrb1);
+    assertEquals(1, vrb1.size);
+    assertTrue(vrb1.selectedInUse);
+    assertEquals(3, vrb1.selected[0]);
+
+    //With nulls and selected
+    VectorizedRowBatch vrb2 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        7, 2, seed);
+    vrb2.selectedInUse = true;
+    vrb2.selected[0] = 1;
+    vrb2.selected[1] = 2;
+    vrb2.selected[2] = 4;
+    vrb2.size = 3;
+
+    lcv0 = (LongColumnVector) vrb2.cols[0];
+
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+    lcv0.vector[5] = 19;
+    lcv0.vector[6] = 21;
+
+    lcv0.noNulls = false;
+    lcv0.isNull[0] = true;
+    lcv0.isNull[2] = true;
+    lcv0.isNull[5] = true;
+
+    expr1.evaluate(vrb2);
+    assertEquals(0, vrb2.size);
+
+    //Repeating non null
+    VectorizedRowBatch vrb3 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        7, 2, seed);
+    lcv0 = (LongColumnVector) vrb3.cols[0];
+
+    lcv0.isRepeating = true;
+    lcv0.vector[0] = 17;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+
+    expr1.evaluate(vrb3);
+    assertEquals(7, vrb3.size);
+    assertFalse(vrb3.selectedInUse);
+    assertTrue(lcv0.isRepeating);
+
+    //Repeating null
+    lcv0.noNulls = false;
+    lcv0.vector[0] = 17;
+    lcv0.isNull[0] = true;
+
+    expr1.evaluate(vrb3);
+    assertEquals(0, vrb3.size);
+  }
+
+  @Test
+  public void testFilterLongNotBetween() {
+
+    // Spot check only. Null & repeating behavior are checked elsewhere for the same template.
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+    LongColumnVector lcv0 = (LongColumnVector) vrb.cols[0];
+
+    //Basic case
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+
+    VectorExpression expr = new FilterLongColumnNotBetween(0, 10, 20);
+    expr.evaluate(vrb);
+    assertEquals(1, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(0, vrb.selected[0]);
+  }
+
+  @Test
+  public void testFilterDoubleBetween() {
+
+    // Spot check only. Null & repeating behavior are checked elsewhere for the same template.
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+    DoubleColumnVector dcv0 = new DoubleColumnVector();
+    vrb.cols[0] = dcv0;
+
+    //Basic case
+    dcv0.vector[0] = 5;
+    dcv0.vector[1] = 20;
+    dcv0.vector[2] = 17;
+    dcv0.vector[3] = 15;
+    dcv0.vector[4] = 10;
+
+    VectorExpression expr = new FilterDoubleColumnBetween(0, 20, 21);
+    expr.evaluate(vrb);
+    assertEquals(1, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(1, vrb.selected[0]);
+  }
+
+  @Test
+  public void testFilterDoubleNotBetween() {
+
+    // Spot check only. Null & repeating behavior are checked elsewhere for the same template.
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+    vrb.cols[0] = new DoubleColumnVector();
+    DoubleColumnVector dcv = (DoubleColumnVector) vrb.cols[0];
+
+    //Basic case
+    dcv.vector[0] = 5;
+    dcv.vector[1] = 20;
+    dcv.vector[2] = 17;
+    dcv.vector[3] = 15;
+    dcv.vector[4] = 10;
+
+    VectorExpression expr = new FilterDoubleColumnNotBetween(0, 10, 20);
+    expr.evaluate(vrb);
+    assertEquals(1, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(0, vrb.selected[0]);
+  }
+
+  static byte[] a = null;
+  static byte[] b = null;
+  static byte[] c = null;
+
+  static {
+    try {
+      a = "a".getBytes("UTF-8");
+      b = "b".getBytes("UTF-8");
+      c = "c".getBytes("UTF-8");
+    } catch (Exception e) {
+      ; // won't happen
+    }
+  }
+
+  @Test
+  public void testFilterStringBetween() {
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        3, 2, seed);
+    vrb.cols[0] = new BytesColumnVector();
+    BytesColumnVector bcv = (BytesColumnVector) vrb.cols[0];
+
+    bcv.initBuffer();
+    bcv.setVal(0, a, 0, 1);
+    bcv.setVal(1, b, 0, 1);
+    bcv.setVal(2, c, 0, 1);
+
+    VectorExpression expr = new FilterStringColumnBetween(0, b, c);
+
+    // basic test
+    expr.evaluate(vrb);
+
+    assertEquals(2, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(1, vrb.selected[0]);
+    assertEquals(2, vrb.selected[1]);
+
+    // nulls
+    vrb.selectedInUse = false;
+    vrb.size = 3;
+    bcv.noNulls = false;
+    bcv.isNull[2] = true;
+    expr.evaluate(vrb);
+    assertEquals(1, vrb.size);
+    assertEquals(1, vrb.selected[0]);
+    assertTrue(vrb.selectedInUse);
+
+    // repeating
+    vrb.selectedInUse = false;
+    vrb.size = 3;
+    bcv.noNulls = true;
+    bcv.isRepeating = true;
+    expr.evaluate(vrb);
+    assertEquals(0, vrb.size);
+
+    // nulls and repeating
+    vrb.selectedInUse = false;
+    vrb.size = 3;
+    bcv.noNulls = false;
+    bcv.isRepeating = true;
+    bcv.isNull[0] = true;
+    bcv.setVal(0, b, 0, 1);
+    expr.evaluate(vrb);
+    assertEquals(0, vrb.size);
+  }
+
+  @Test
+  public void testFilterStringNotBetween() {
+
+    // Spot check only. Non-standard cases are checked for the same template in another test.
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        3, 2, seed);
+    vrb.cols[0] = new BytesColumnVector();
+    BytesColumnVector bcv = (BytesColumnVector) vrb.cols[0];
+
+    bcv.initBuffer();
+    bcv.setVal(0, a, 0, 1);
+    bcv.setVal(1, b, 0, 1);
+    bcv.setVal(2, c, 0, 1);
+
+    VectorExpression expr = new FilterStringColumnNotBetween(0, b, c);
+    expr.evaluate(vrb);
+
+    assertEquals(1, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(0, vrb.selected[0]);
+  }
 }
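Note (editor's illustration, not part of the patch): the timestamp branch of getBetweenFilterExpression() is not exercised by the tests above. A sketch of what such a test could look like, assuming ExprNodeColumnDesc accepts java.sql.Timestamp.class the same way it accepts Long.class and Double.class in testBetweenFilters; if it does not, the column descriptor would have to be built from a timestamp TypeInfo instead, and the boundary strings here are made up.

  // Hypothetical JUnit test body; reuses the helpers already imported by TestVectorizationContext.
  @Test
  public void testTimestampBetweenFilter() throws HiveException {
    GenericUDFBetween udf = new GenericUDFBetween();
    ExprNodeGenericFuncDesc exprDesc = new ExprNodeGenericFuncDesc();
    exprDesc.setGenericUDF(udf);
    List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
    children.add(new ExprNodeConstantDesc(new Boolean(false)));              // no NOT keyword
    children.add(new ExprNodeColumnDesc(java.sql.Timestamp.class, "col1", "table", false));
    children.add(new ExprNodeConstantDesc("2013-11-05 00:00:00.000"));       // string boundaries are
    children.add(new ExprNodeConstantDesc("2013-11-06 00:00:00.000"));       // converted to long nanos
    exprDesc.setChildren(children);

    Map<String, Integer> columnMap = new HashMap<String, Integer>();
    columnMap.put("col1", 1);
    VectorizationContext vc = new VectorizationContext(columnMap, 1);
    VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);

    // The timestamp path maps onto the long filter with nano-precision boundaries.
    assertTrue(ve instanceof FilterLongColumnBetween);
  }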