diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index cbd0030..bdcb1cb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -76,6 +76,7 @@ import org.apache.hadoop.hive.ql.udf.UDFConcat; import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth; import org.apache.hadoop.hive.ql.udf.UDFHour; +import org.apache.hadoop.hive.ql.udf.UDFLTrim; import org.apache.hadoop.hive.ql.udf.UDFLength; import org.apache.hadoop.hive.ql.udf.UDFLike; import org.apache.hadoop.hive.ql.udf.UDFLower; @@ -88,7 +89,9 @@ import org.apache.hadoop.hive.ql.udf.UDFOPNegative; import org.apache.hadoop.hive.ql.udf.UDFOPPlus; import org.apache.hadoop.hive.ql.udf.UDFOPPositive; +import org.apache.hadoop.hive.ql.udf.UDFRTrim; import org.apache.hadoop.hive.ql.udf.UDFSecond; +import org.apache.hadoop.hive.ql.udf.UDFTrim; import org.apache.hadoop.hive.ql.udf.UDFUpper; import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear; import org.apache.hadoop.hive.ql.udf.UDFYear; @@ -466,6 +469,12 @@ private VectorExpression getVectorExpression(GenericUDFBridge udf, return getUnaryStringExpression("StringUpper", "String", childExpr); } else if (cl.equals(UDFLength.class)) { return getUnaryStringExpression("StringLength", "Long", childExpr); + } else if (cl.equals(UDFLTrim.class)) { + return getUnaryStringExpression("StringLTrim", "String", childExpr); + } else if (cl.equals(UDFRTrim.class)) { + return getUnaryStringExpression("StringRTrim", "String", childExpr); + } else if (cl.equals(UDFTrim.class)) { + return getUnaryStringExpression("StringTrim", "String", childExpr); } else if (cl.equals(UDFConcat.class)) { return getConcatExpression(childExpr); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLTrim.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLTrim.java new file mode 100644 index 0000000..30b6dbf --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLTrim.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; + +public class StringLTrim extends StringUnaryUDFDirect { + public StringLTrim(int inputColumn, int outputColumn) { + super(inputColumn, outputColumn); + } + + /** + * LTRIM element i of the vector, and place the result in outV. + * Operate on the data in place, and set the output by reference + * to improve performance. Ignore null handling. That will be handled separately. + */ + protected void func(BytesColumnVector outV, byte[][] vector, int[] start, int[] length, int i) { + int j = start[i]; + + // skip past blank characters + while(j < start[i] + vector[i].length && vector[i][j] == 0x20) { + j++; + } + + outV.setVal(i, vector[i], j, length[i] - (j - start[i])); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringRTrim.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringRTrim.java new file mode 100644 index 0000000..b806cab --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringRTrim.java @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; + +public class StringRTrim extends StringUnaryUDFDirect { + public StringRTrim(int inputColumn, int outputColumn) { + super(inputColumn, outputColumn); + } + + /** + * RTRIM element i of the vector, and place the result in outV. + * Operate on the data in place, and set the output by reference + * to improve performance. Ignore null handling. That will be handled separately. + */ + protected void func(BytesColumnVector outV, byte[][] vector, int[] start, int[] length, int i) { + int j = start[i] + length[i] - 1; + + // skip trailing blank characters + while(j >= start[i] && vector[i][j] == 0x20) { + j--; + } + + // set output vector + outV.setVal(i, vector[i], start[i], (j - start[i]) + 1); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringTrim.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringTrim.java new file mode 100644 index 0000000..4e6fc5e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringTrim.java @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; + +public class StringTrim extends StringUnaryUDFDirect { + public StringTrim(int inputColumn, int outputColumn) { + super(inputColumn, outputColumn); + } + + /** + * TRIM element i of the vector, eliminating blanks from the left + * and right sides of the string, and place the result in outV. + * Operate on the data in place, and set the output by reference + * to improve performance. Ignore null handling. That will be handled separately. + */ + protected void func(BytesColumnVector outV, byte[][] vector, int[] start, int[] length, int i) { + int l = start[i]; + int r = start[i] + length[i] - 1; + + // skip blank character on left + while(l <= r && vector[i][l] == 0x20) { + l++; + } + + // skip blank characters on right + while(l <= r && vector[i][r] == 0x20) { + r--; + } + + outV.setVal(i, vector[i], l, (r - l) + 1); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDFDirect.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDFDirect.java new file mode 100644 index 0000000..dfaaf0c --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDFDirect.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * This is a superclass for unary string functions that operate directly on the + * input and set the output. It is suitable for direct, in-place operations on + * strings, such as for fast implementations of TRIM(), LTRIM(), and RTRIM(). + */ +abstract public class StringUnaryUDFDirect extends VectorExpression { + int inputColumn; + int outputColumn; + + public StringUnaryUDFDirect(int inputColumn, int outputColumn) { + this.inputColumn = inputColumn; + this.outputColumn = outputColumn; + } + + abstract protected void func(BytesColumnVector outV, byte[][] vector, int[] start, int[] length, int i); + + @Override + public void evaluate(VectorizedRowBatch batch) { + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[inputColumn]; + int[] sel = batch.selected; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int start[] = inputColVector.start; + int length[] = inputColVector.length; + BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn]; + outV.initBuffer(); + + if (n == 0) { + //Nothing to do + return; + } + + if (inputColVector.noNulls) { + outV.noNulls = true; + if (inputColVector.isRepeating) { + outV.isRepeating = true; + func(outV, vector, start, length, 0); + } else if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + func(outV, vector, start, length, i); + } + outV.isRepeating = false; + } else { + for(int i = 0; i != n; i++) { + func(outV, vector, start, length, i); + } + outV.isRepeating = false; + } + } else { + + // Handle case with nulls. Don't do function if the value is null, + // because the data may be undefined for a null value. + outV.noNulls = false; + if (inputColVector.isRepeating) { + outV.isRepeating = true; + outV.isNull[0] = inputColVector.isNull[0]; + if (!inputColVector.isNull[0]) { + func(outV, vector, start, length, 0); + } + } else if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + outV.isNull[i] = inputColVector.isNull[i]; + if (!inputColVector.isNull[i]) { + func(outV, vector, start, length, i); + } + } + outV.isRepeating = false; + } else { + System.arraycopy(inputColVector.isNull, 0, outV.isNull, 0, n); + for(int i = 0; i != n; i++) { + if (!inputColVector.isNull[i]) { + func(outV, vector, start, length, i); + } + } + outV.isRepeating = false; + } + } + } + + + @Override + public int getOutputColumn() { + return outputColumn; + } + + @Override + public String getOutputType() { + return "String"; + } + +} \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java index 5728df0..2768887 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java @@ -34,7 +34,7 @@ * This is the primary method to implement expression logic. * @param vrg */ - public abstract void evaluate(VectorizedRowBatch vrg); + public abstract void evaluate(VectorizedRowBatch batch); /** * Returns the index of the output column in the array diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index e565433..0084558 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -55,6 +55,10 @@ private static byte[] mixedUpUpper; private static byte[] multiByte; private static byte[] mixPercentPattern; + private static byte[] blanksLeft; + private static byte[] blanksRight; + private static byte[] blanksBoth; + private static byte[] blankString; static { try { @@ -72,6 +76,10 @@ mixPercentPattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE multiByte = new byte[100]; addMultiByteChars(multiByte); + blanksLeft = " foo".getBytes("UTF-8"); + blanksRight = "foo ".getBytes("UTF-8"); + blanksBoth = " foo ".getBytes("UTF-8"); + blankString = " ".getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } @@ -1405,4 +1413,81 @@ public void testSubstrStartLen() throws UnsupportedEncodingException { ) ); } + + @Test + public void testVectorLTrim() { + VectorizedRowBatch b = makeTrimBatch(); + VectorExpression expr = new StringLTrim(0, 1); + expr.evaluate(b); + BytesColumnVector outV = (BytesColumnVector) b.cols[1]; + Assert.assertEquals(0, + StringExpr.compare(emptyString, 0, 0, outV.vector[0], 0, 0)); + Assert.assertEquals(0, + StringExpr.compare(blanksLeft, 2, 3, outV.vector[1], outV.start[1], outV.length[1])); + Assert.assertEquals(0, + StringExpr.compare(blanksRight, 0, 5, outV.vector[2], outV.start[2], outV.length[2])); + Assert.assertEquals(0, + StringExpr.compare(blanksBoth, 2, 5, outV.vector[3], outV.start[3], outV.length[3])); + Assert.assertEquals(0, + StringExpr.compare(red, 0, 3, outV.vector[4], outV.start[4], outV.length[4])); + Assert.assertEquals(0, + StringExpr.compare(blankString, 0, 0, outV.vector[5], outV.start[5], outV.length[5])); + } + + @Test + public void testVectorRTrim() { + VectorizedRowBatch b = makeTrimBatch(); + VectorExpression expr = new StringRTrim(0, 1); + expr.evaluate(b); + BytesColumnVector outV = (BytesColumnVector) b.cols[1]; + Assert.assertEquals(0, + StringExpr.compare(emptyString, 0, 0, outV.vector[0], 0, 0)); + Assert.assertEquals(0, + StringExpr.compare(blanksLeft, 0, 5, outV.vector[1], outV.start[1], outV.length[1])); + Assert.assertEquals(0, + StringExpr.compare(blanksRight, 0, 3, outV.vector[2], outV.start[2], outV.length[2])); + Assert.assertEquals(0, + StringExpr.compare(blanksBoth, 0, 5, outV.vector[3], outV.start[3], outV.length[3])); + Assert.assertEquals(0, + StringExpr.compare(red, 0, 3, outV.vector[4], outV.start[4], outV.length[4])); + Assert.assertEquals(0, + StringExpr.compare(blankString, 0, 0, outV.vector[5], outV.start[5], outV.length[5])); + } + + @Test + public void testVectorTrim() { + VectorizedRowBatch b = makeTrimBatch(); + VectorExpression expr = new StringTrim(0, 1); + expr.evaluate(b); + BytesColumnVector outV = (BytesColumnVector) b.cols[1]; + Assert.assertEquals(0, + StringExpr.compare(emptyString, 0, 0, outV.vector[0], 0, 0)); + Assert.assertEquals(0, + StringExpr.compare(blanksLeft, 2, 3, outV.vector[1], outV.start[1], outV.length[1])); + Assert.assertEquals(0, + StringExpr.compare(blanksRight, 0, 3, outV.vector[2], outV.start[2], outV.length[2])); + Assert.assertEquals(0, + StringExpr.compare(blanksBoth, 2, 3, outV.vector[3], outV.start[3], outV.length[3])); + Assert.assertEquals(0, + StringExpr.compare(red, 0, 3, outV.vector[4], outV.start[4], outV.length[4])); + Assert.assertEquals(0, + StringExpr.compare(blankString, 0, 0, outV.vector[5], outV.start[5], outV.length[5])); + } + + // Make a batch to test the trim functions. + private VectorizedRowBatch makeTrimBatch() { + VectorizedRowBatch b = new VectorizedRowBatch(2); + BytesColumnVector inV = new BytesColumnVector(); + BytesColumnVector outV = new BytesColumnVector(); + b.cols[0] = inV; + b.cols[1] = outV; + inV.setRef(0, emptyString, 0, 0); + inV.setRef(1, blanksLeft, 0, blanksLeft.length); + inV.setRef(2, blanksRight, 0, blanksRight.length); + inV.setRef(3, blanksBoth, 0, blanksBoth.length); + inV.setRef(4, red, 0, red.length); + inV.setRef(5, blankString, 0, blankString.length); + b.size = 5; + return b; + } }