diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
new file mode 100644
index 0000000..90dd7da
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+/**
+ * String expression evaluation helper functions.
+ */
+public class StringExpr {
+
+  /* Compare two strings from two byte arrays, each
+   * with its own start position and length.
+   * Use lexicographic unsigned byte value order.
+   * This is what's used for UTF-8 sort order.
+   * Return a negative value if arg1 < arg2, 0 if arg1 = arg2,
+   * positive if arg1 > arg2.
+   */
+  public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
+    for (int i = 0; i < len1 && i < len2; i++) {
+      int b1 = arg1[i + start1] & 0xff;
+      int b2 = arg2[i + start2] & 0xff;
+      if (b1 != b2) {
+        return b1 - b2;
+      }
+    }
+    return len1 - len2;
+  }
+}
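Reviewer note (illustrative, not part of the patch): a minimal sketch of how StringExpr.compare would be called against (byte[], start, length) triples, the same representation BytesColumnVector uses. The class name below is hypothetical; it only assumes StringExpr is on the classpath.

    import java.io.UnsupportedEncodingException;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;

    // Hypothetical driver class for illustration only.
    public class StringExprCompareExample {
      public static void main(String[] args) throws UnsupportedEncodingException {
        byte[] a = "apple".getBytes("UTF-8");
        byte[] b = "apricot".getBytes("UTF-8");
        // Negative result: "apple" sorts before "apricot" in unsigned byte (UTF-8) order.
        System.out.println(StringExpr.compare(a, 0, a.length, b, 0, b.length));
        // Equal prefixes ("app" vs "apple"): the shorter string sorts first, so again negative.
        System.out.println(StringExpr.compare(a, 0, 3, a, 0, a.length));
      }
    }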
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java
new file mode 100644
index 0000000..05e1f40
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+public class StringLength extends VectorExpression {
+  private int colNum;
+  private int outputColumn;
+
+  StringLength(int colNum, int outputColumn) {
+    this.colNum = colNum;
+    this.outputColumn = outputColumn;
+  }
+
+  // Calculate the length of the UTF-8 strings in the input vector and place the results in the output vector.
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn];
+    int[] sel = batch.selected;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] start = inputColVector.start;
+    int[] length = inputColVector.length;
+    long[] resultLen = outV.vector;
+
+    if (n == 0) {
+
+      // Nothing to do
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      outV.noNulls = true;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        resultLen[0] = UTF8StringLength(vector[0], start[0], length[0]);
+      } else if (batch.selectedInUse) {
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+        }
+        outV.isRepeating = false;
+      } else {
+        for (int i = 0; i != n; i++) {
+          resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+        }
+        outV.isRepeating = false;
+      }
+    } else {
+
+      /*
+       * Handle the case with nulls. Don't apply the function if the value is null, to save time,
+       * because calling the function can be expensive.
+       */
+      outV.noNulls = false;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        outV.isNull[0] = inputColVector.isNull[0];
+        if (!inputColVector.isNull[0]) {
+          resultLen[0] = UTF8StringLength(vector[0], start[0], length[0]);
+        }
+      } else if (batch.selectedInUse) {
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!inputColVector.isNull[i]) {
+            resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      } else {
+        for (int i = 0; i != n; i++) {
+          if (!inputColVector.isNull[i]) {
+            resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      }
+    }
+  }
+
+  /*
+   * Return the length in characters of the UTF-8 string in byte array s
+   * beginning at start that is len bytes long.
+   */
+  static long UTF8StringLength(byte[] s, int start, int len)
+  {
+    long resultLength = 0;
+    for (int i = start; i < start + len; i++) {
+
+      /* Byte bit patterns of the form 10xxxxxx are continuation
+       * bytes. All other bit patterns are the first byte of
+       * a character.
+       */
+      if ((s[i] & 0xc0) != 0x80) {
+        resultLength++;
+      }
+    }
+    return resultLength;
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return outputColumn;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "Long";
+  }
+
+
+}
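Reviewer note (illustrative, not part of the patch): the continuation-byte rule used by UTF8StringLength can be sanity-checked in isolation. The method below is a hypothetical sketch that would only compile if dropped into TestVectorStringExpressions (same package, since UTF8StringLength is package-private), reusing the JUnit imports that test class already has.

    @Test
    public void testUTF8StringLengthCountsLeadBytesOnly() {
      // "À" (U+00C0) is C3 80 in UTF-8: C3 is a lead byte (counted), 80 is a continuation byte (skipped).
      byte[] aGrave = {(byte) 0xC3, (byte) 0x80};
      Assert.assertEquals(1, StringLength.UTF8StringLength(aGrave, 0, aGrave.length));

      // "A" + "À" = 41 C3 80: two lead bytes, one continuation byte, so 2 characters.
      byte[] twoChars = {(byte) 0x41, (byte) 0xC3, (byte) 0x80};
      Assert.assertEquals(2, StringLength.UTF8StringLength(twoChars, 0, twoChars.length));
    }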
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java
new file mode 100644
index 0000000..5a9694f
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.udf.UDFLower;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+public class StringLower extends StringUnaryUDF {
+  StringLower(int colNum, int outputColumn) {
+    super(colNum, outputColumn, (IUDFUnaryString) new UDFLower());
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java
new file mode 100644
index 0000000..503011f
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+public class StringUnaryUDF extends VectorExpression {
+
+  int colNum;
+  int outputColumn;
+  IUDFUnaryString func;
+  Text s;
+
+  StringUnaryUDF(int colNum, int outputColumn, IUDFUnaryString func) {
+    this.colNum = colNum;
+    this.outputColumn = outputColumn;
+    this.func = func;
+    s = new Text();
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    int[] sel = batch.selected;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] start = inputColVector.start;
+    int[] length = inputColVector.length;
+    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+    byte[][] outputVector = outV.vector;
+    Text t;
+
+    if (n == 0) {
+      // Nothing to do
+      return;
+    }
+
+    // Design Note: In the future, if this function can be implemented
+    // directly to translate input to output without creating new
+    // objects, performance can probably be improved significantly.
+    // It's implemented in the simplest way now, just calling the
+    // existing built-in function.
+
+    if (inputColVector.noNulls) {
+      outV.noNulls = true;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        s.set(vector[0], start[0], length[0]);
+        t = func.evaluate(s);
+        outV.setRef(0, t.getBytes(), 0, t.getLength());
+      } else if (batch.selectedInUse) {
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          s.set(vector[i], start[i], length[i]);
+          t = func.evaluate(s);
+          outV.setRef(i, t.getBytes(), 0, t.getLength());
+        }
+        outV.isRepeating = false;
+      } else {
+        for (int i = 0; i != n; i++) {
+          s.set(vector[i], start[i], length[i]);
+          t = func.evaluate(s);
+          outV.setRef(i, t.getBytes(), 0, t.getLength());
+        }
+        outV.isRepeating = false;
+      }
+    } else {
+      // Handle the case with nulls. Don't apply the function if the value is null, to save time,
+      // because calling the function can be expensive.
+      outV.noNulls = false;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        outV.isNull[0] = inputColVector.isNull[0];
+        if (!inputColVector.isNull[0]) {
+          s.set(vector[0], start[0], length[0]);
+          t = func.evaluate(s);
+          outV.setRef(0, t.getBytes(), 0, t.getLength());
+        }
+      } else if (batch.selectedInUse) {
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!inputColVector.isNull[i]) {
+            s.set(vector[i], start[i], length[i]);
+            t = func.evaluate(s);
+            outV.setRef(i, t.getBytes(), 0, t.getLength());
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      } else {
+        for (int i = 0; i != n; i++) {
+          if (!inputColVector.isNull[i]) {
+            s.set(vector[i], start[i], length[i]);
+            t = func.evaluate(s);
+            outV.setRef(i, t.getBytes(), 0, t.getLength());
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return outputColumn;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "String";
+  }
+
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java
new file mode 100644
index 0000000..bf1c19e
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.udf.UDFUpper;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+public class StringUpper extends StringUnaryUDF {
+  StringUpper(int colNum, int outputColumn) {
+    super(colNum, outputColumn, (IUDFUnaryString) new UDFUpper());
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/templates/CodeGen.class b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/templates/CodeGen.class
index 8f84a5f..55703ce 100644
Binary files a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/templates/CodeGen.class and b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/templates/CodeGen.class differ
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java
new file mode 100644
index 0000000..017a89e
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java
@@ -0,0 +1,10 @@
+package org.apache.hadoop.hive.ql.udf;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * Interface to support use of standard UDFs inside the vectorized execution code path.
+ */
+public interface IUDFUnaryString {
+  Text evaluate(Text s);
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java
index f79cbdf..9bdef27 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java
@@ -30,7 +30,7 @@
     value = "_FUNC_(str) - Returns str with all characters changed to lowercase",
     extended = "Example:\n"
     + "  > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + "  'facebook'")
-public class UDFLower extends UDF {
+public class UDFLower extends UDF implements IUDFUnaryString {
   private Text t = new Text();
 
   public UDFLower() {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java
index 7dc682b..6b1bf66 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java
@@ -30,7 +30,7 @@
     value = "_FUNC_(str) - Returns str with all characters changed to uppercase",
     extended = "Example:\n"
     + "  > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + "  'FACEBOOK'")
-public class UDFUpper extends UDF {
+public class UDFUpper extends UDF implements IUDFUnaryString {
 
   Text t = new Text();
 
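Reviewer note (illustrative, not part of the patch): the same two-step pattern should extend to other Text-in/Text-out UDFs. The sketch below uses entirely hypothetical class names (UDFMyTransform, StringMyTransform); it assumes only the IUDFUnaryString interface and StringUnaryUDF adapter added by this patch, plus the existing org.apache.hadoop.hive.ql.exec.UDF base class.

    // File 1 (hypothetical): ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMyTransform.java
    // Step 1: the row-mode UDF declares IUDFUnaryString (no import needed, same package).
    package org.apache.hadoop.hive.ql.udf;

    import org.apache.hadoop.hive.ql.exec.UDF;
    import org.apache.hadoop.io.Text;

    public class UDFMyTransform extends UDF implements IUDFUnaryString {
      private final Text t = new Text();

      public Text evaluate(Text s) {
        if (s == null) {
          return null;
        }
        t.set(s.toString().trim()); // any Text -> Text transformation
        return t;
      }
    }

    // File 2 (hypothetical): .../ql/exec/vector/expressions/StringMyTransform.java
    // Step 2: expose it to the vectorized path through the StringUnaryUDF adapter.
    // It must live in this package because the StringUnaryUDF constructor is package-private.
    package org.apache.hadoop.hive.ql.exec.vector.expressions;

    import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
    import org.apache.hadoop.hive.ql.udf.UDFMyTransform;

    public class StringMyTransform extends StringUnaryUDF {
      StringMyTransform(int colNum, int outputColumn) {
        super(colNum, outputColumn, (IUDFUnaryString) new UDFMyTransform());
      }
    }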
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
new file mode 100644
index 0000000..fe34b11
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
@@ -0,0 +1,286 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import junit.framework.Assert;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Test;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import org.apache.hadoop.io.Text;
+
+public class TestVectorStringExpressions {
+
+  static byte[] red;
+  static byte[] red2; // second copy of red, different object
+  static byte[] green;
+  static byte[] emptyString;
+  static byte[] mixedUp;
+  static byte[] mixedUpLower;
+  static byte[] mixedUpUpper;
+  static byte[] multiByte;
+  static byte[] mixPercentPattern;
+
+  static {
+    try {
+      red = "red".getBytes("UTF-8");
+      green = "green".getBytes("UTF-8");
+      emptyString = "".getBytes("UTF-8");
+      mixedUp = "mixedUp".getBytes("UTF-8");
+      mixedUpLower = "mixedup".getBytes("UTF-8");
+      mixedUpUpper = "MIXEDUP".getBytes("UTF-8");
+      mixPercentPattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
+      multiByte = new byte[100];
+      addMultiByteChars(multiByte);
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace();
+    }
+    red2 = new byte[red.length];
+    System.arraycopy(red, 0, red2, 0, red.length);
+  }
+
+  // Add some multi-byte characters to test the length routine later.
+  // Total characters = 4; byte length = 10.
+  static void addMultiByteChars(byte[] b) {
+    int i = 0;
+    b[i++] = (byte) 0x41; // letter "A" (1 byte)
+    b[i++] = (byte) 0xC3; // Latin capital letter A with grave (2 bytes)
+    b[i++] = (byte) 0x80;
+    b[i++] = (byte) 0xE2; // Euro sign (3 bytes)
+    b[i++] = (byte) 0x82;
+    b[i++] = (byte) 0xAC;
+    b[i++] = (byte) 0xF0; // CJK character U+24B62 (4 bytes)
+    b[i++] = (byte) 0xA4;
+    b[i++] = (byte) 0xAD;
+    b[i++] = (byte) 0xA2;
+  }
+
+  @Test
+  // Load a BytesColumnVector by copying in large data, enough to force
+  // the buffer to expand.
+  public void testLoadBytesColumnVectorByValueLargeData() {
+    BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    bcv.initBuffer(10); // initialize with estimated element size 10
+    String s = "0123456789";
+    while (s.length() < 500) {
+      s += s;
+    }
+    byte[] b = null;
+    try {
+      b = s.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace();
+    }
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      bcv.setVal(i, b, 0, b.length);
+    }
+    Assert.assertTrue(bcv.bufferSize() >= b.length * VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  @Test
+  // Set values by reference, copy the data out, and verify equality.
+  public void testLoadBytesColumnVectorByRef() {
+    BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    String s = "red";
+    byte[] b = null;
+    try {
+      b = s.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace();
+    }
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      bcv.setRef(i, b, 0, b.length);
+    }
+    // verify
+    byte[] v = new byte[b.length];
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      Assert.assertTrue(bcv.length[i] == b.length);
+      System.arraycopy(bcv.vector[i], bcv.start[i], v, 0, b.length);
+      Assert.assertTrue(Arrays.equals(b, v));
+    }
+  }
+
+  VectorizedRowBatch makeStringBatch() {
+    // create a batch with one string ("Bytes") column
+    VectorizedRowBatch batch = new VectorizedRowBatch(1, VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    /*
+     * Add these 3 values:
+     *
+     * red
+     * green
+     * NULL
+     */
+    v.setRef(0, red, 0, red.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2, emptyString, 0, emptyString.length);
+    v.isNull[2] = true;
+
+    v.noNulls = false;
+
+    batch.size = 3;
+    return batch;
+  }
+
+  VectorizedRowBatch makeStringBatchMixedCase() {
+    // create a batch with two string ("Bytes") columns
+    VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    BytesColumnVector outV = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[1] = outV;
+    /*
+     * Add these 3 values:
+     *
+     * mixedUp
+     * green
+     * NULL
+     */
+    v.setRef(0, mixedUp, 0, mixedUp.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2, emptyString, 0, emptyString.length);
+    v.isNull[2] = true;
+    v.noNulls = false;
+
+    batch.size = 3;
+    return batch;
+  }
+
+  VectorizedRowBatch makeStringBatchMixedCharSize() {
+    // create a new batch with one string ("Bytes") column (for input)
+    // and one long column (for output)
+    VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[1] = outV;
+
+    /*
+     * Add these 4 values:
+     *
+     * mixedUp
+     * green
+     * NULL
+     * <4-character string with multi-byte chars>
+     */
+    v.setRef(0, mixedUp, 0, mixedUp.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2, emptyString, 0, emptyString.length);
+    v.isNull[2] = true;
+    v.noNulls = false;
+    v.setRef(3, multiByte, 0, 10);
+    v.isNull[3] = false;
+
+    batch.size = 4;
+    return batch;
+  }
+
+  @Test
+  public void testColLower() {
+    // has nulls, not repeating
+    VectorizedRowBatch batch = makeStringBatchMixedCase();
+    StringLower expr = new StringLower(0, 1);
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    int cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.isNull[2]);
+    int cmp2 = StringExpr.compare(green, 0, green.length, outCol.vector[1], outCol.start[1], outCol.length[1]);
+    Assert.assertEquals(0, cmp2);
+
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.noNulls);
+
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertFalse(outCol.noNulls);
+
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertTrue(outCol.noNulls);
+  }
+
+  @Test
+  public void testColUpper() {
+    // no nulls, not repeating
+
+    // We don't test all the combinations here because (at least currently)
+    // the logic is inherited from StringUnaryUDF, so it is the same as for lower case,
+    // and testColLower already checks all the cases.
+    VectorizedRowBatch batch = makeStringBatchMixedCase();
+    StringUpper expr = new StringUpper(0, 1);
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    int cmp = StringExpr.compare(mixedUpUpper, 0, mixedUpUpper.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.noNulls);
+  }
+
+  @Test
+  public void testStringLength() {
+
+    // has nulls, not repeating
+    VectorizedRowBatch batch = makeStringBatchMixedCharSize();
+    StringLength expr = new StringLength(0, 1);
+    expr.evaluate(batch);
+    LongColumnVector outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertEquals(5, outCol.vector[1]); // length of "green" is 5
+    Assert.assertTrue(outCol.isNull[2]);
+    Assert.assertEquals(4, outCol.vector[3]); // this one has the mixed-size chars
+
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertEquals(4, outCol.vector[3]); // this one has the mixed-size chars
+
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertFalse(outCol.noNulls);
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertTrue(outCol.noNulls);
+  }
+}
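Reviewer note (illustrative, not part of the patch): a minimal stand-alone sketch of how the new expressions are driven over a VectorizedRowBatch, mirroring what the tests above do. The class name is hypothetical, and it is placed in the expressions package only because the StringLower and StringLength constructors in this patch are package-private; in real use the expressions would presumably be constructed by the vectorized query planner rather than by hand.

    package org.apache.hadoop.hive.ql.exec.vector.expressions;

    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

    // Hypothetical demo driver, for illustration only.
    public class StringExpressionDemo {
      public static void main(String[] args) throws Exception {
        // One input string column (0) and two output columns: lower-cased (1) and length (2).
        VectorizedRowBatch batch = new VectorizedRowBatch(3, VectorizedRowBatch.DEFAULT_SIZE);
        BytesColumnVector in = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
        batch.cols[0] = in;
        batch.cols[1] = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
        batch.cols[2] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);

        byte[] s = "MixedCase".getBytes("UTF-8");
        in.setRef(0, s, 0, s.length);
        in.isNull[0] = false;
        in.noNulls = true;        // set flags explicitly rather than relying on defaults
        in.isRepeating = false;
        batch.size = 1;

        new StringLower(0, 1).evaluate(batch);   // column 0 -> column 1, lower-cased
        new StringLength(0, 2).evaluate(batch);  // column 0 -> column 2, length in characters

        BytesColumnVector lower = (BytesColumnVector) batch.cols[1];
        LongColumnVector len = (LongColumnVector) batch.cols[2];
        System.out.println(new String(lower.vector[0], lower.start[0], lower.length[0], "UTF-8")); // mixedcase
        System.out.println(len.vector[0]); // 9
      }
    }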