diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java new file mode 100644 index 0000000..24ba861 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java @@ -0,0 +1,157 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.hive.ql.udf.UDFLike; + +/** + * Evaluate LIKE filter on a batch for a vector of strings. + */ +public class FilterStringColLikeStringScalar extends VectorExpression { + private int colNum; + private Text likePattern; + private Text s; + private UDFLike likeFunc; + + public FilterStringColLikeStringScalar(int colNum, Text likePattern) { + this.colNum = colNum; + this.likePattern = likePattern; + likeFunc = new UDFLike(); + s = new Text(); + } + + /* + * This vectorized version of LIKE calls the standard LIKE + * function code. 
In the future, as an optimization, consider + * unwinding some of that logic here, e.g. to determine + * if the LIKE pattern is a simple one like 'abc%' so that it + * can be executed more efficiently as a special case. + */ + + private boolean like(byte[] bytes, int start, int len) { + s.set(bytes, start, len); + return (likeFunc.evaluate(s, likePattern)).get(); + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; + int[] sel = batch.selected; + boolean[] nullPos = inputColVector.isNull; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int[] length = inputColVector.length; + int[] start = inputColVector.start; + + + // return immediately if batch is empty + if (n == 0) { + return; + } + + if (inputColVector.noNulls) { + if (inputColVector.isRepeating) { + + // All must be selected otherwise size would be zero. Repeating property will not change. + if (!like(vector[0], start[0], length[0])) { + + //Entire batch is filtered out. + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j=0; j != n; j++) { + int i = sel[j]; + if (like(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (like(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } else { + if (inputColVector.isRepeating) { + + //All must be selected otherwise size would be zero. Repeating property will not change. + if (!nullPos[0]) { + if (!like(vector[0], start[0], length[0])) { + + //Entire batch is filtered out. 
+ batch.size = 0; + } + } else { + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j=0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (like(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + + //Change the selected vector + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (like(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + + /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline + * future operations. So selectedInUse will remain false. + */ + } + } + } + + @Override + public int getOutputColumn() { + return -1; + } + + @Override + public String getOutputType() { + return "boolean"; + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index fe34b11..aeb3c6c 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.hive.ql.exec.vector.expressions; import junit.framework.Assert; @@ -5,24 +23,31 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColEqualStringScalar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColLessStringScalar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen. + FilterStringColGreaterEqualStringScalar; import org.junit.Test; +import org.apache.hadoop.io.Text; import java.io.UnsupportedEncodingException; import java.util.Arrays; -import org.apache.hadoop.io.Text; +/** + * Test vectorized expression and filter evaluation for strings. 
+ */ public class TestVectorStringExpressions { - - static byte[] red; - static byte[] red2; // second copy of red, different object - static byte[] green; - static byte[] emptyString; - static byte[] mixedUp; - static byte[] mixedUpLower; - static byte[] mixedUpUpper; - static byte[] multiByte; - static byte[] mixPercentPattern; - + + private static byte[] red; + private static byte[] red2; // second copy of red, different object + private static byte[] green; + private static byte[] emptyString; + private static byte[] mixedUp; + private static byte[] mixedUpLower; + private static byte[] mixedUpUpper; + private static byte[] multiByte; + private static byte[] mixPercentPattern; + static { try { red = "red".getBytes("UTF-8"); @@ -40,14 +65,14 @@ red2 = new byte[red.length]; System.arraycopy(red, 0, red2, 0, red.length); } - + // add some multi-byte characters to test length routine later. // total characters = 4; byte length = 10 static void addMultiByteChars(byte[] b) { int i = 0; b[i++] = (byte) 0x41; // letter "A" (1 byte) b[i++] = (byte) 0xC3; // Latin capital A with grave (2 bytes) - b[i++] = (byte) 0x80; + b[i++] = (byte) 0x80; b[i++] = (byte) 0xE2; // Euro sign (3 bytes) b[i++] = (byte) 0x82; b[i++] = (byte) 0xAC; @@ -56,9 +81,9 @@ static void addMultiByteChars(byte[] b) { b[i++] = (byte) 0xAD; b[i++] = (byte) 0xA2; } - + @Test - // Load a BytesColumnVector by copying in large data, enough to force + // Load a BytesColumnVector by copying in large data, enough to force // the buffer to expand. 
public void testLoadBytesColumnVectorByValueLargeData() { BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); @@ -78,7 +103,7 @@ public void testLoadBytesColumnVectorByValueLargeData() { } Assert.assertTrue(bcv.bufferSize() >= b.length * VectorizedRowBatch.DEFAULT_SIZE); } - + @Test // set values by reference, copy the data out, and verify equality public void testLoadBytesColumnVectorByRef() { @@ -102,14 +127,44 @@ public void testLoadBytesColumnVectorByRef() { } } + @Test + // Test string column to string literal comparison + public void testStringColCompareStringScalarFilter() { + VectorizedRowBatch batch = makeStringBatch(); + VectorExpression expr; + expr = new FilterStringColEqualStringScalar(0, red2); + expr.evaluate(batch); + + // only red qualifies, and it's in entry 0 + Assert.assertTrue(batch.size == 1); + Assert.assertTrue(batch.selected[0] == 0); + + batch = makeStringBatch(); + expr = new FilterStringColLessStringScalar(0, red2); + expr.evaluate(batch); + + // only green qualifies, and it's in entry 1 + Assert.assertTrue(batch.size == 1); + Assert.assertTrue(batch.selected[0] == 1); + + batch = makeStringBatch(); + expr = new FilterStringColGreaterEqualStringScalar(0, green); + expr.evaluate(batch); + + // green and red qualify + Assert.assertTrue(batch.size == 2); + Assert.assertTrue(batch.selected[0] == 0); + Assert.assertTrue(batch.selected[1] == 1); + } + VectorizedRowBatch makeStringBatch() { // create a batch with one string ("Bytes") column - VectorizedRowBatch batch = new VectorizedRowBatch(1,VectorizedRowBatch.DEFAULT_SIZE); + VectorizedRowBatch batch = new VectorizedRowBatch(1, VectorizedRowBatch.DEFAULT_SIZE); BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[0] = v; /* * Add these 3 values: - * + * * red * green * NULL @@ -120,23 +175,23 @@ VectorizedRowBatch makeStringBatch() { v.isNull[1] = false; v.setRef(2, emptyString, 0, emptyString.length); v.isNull[2] = true; - + 
v.noNulls = false; - + batch.size = 3; return batch; } - + VectorizedRowBatch makeStringBatchMixedCase() { // create a batch with two string ("Bytes") columns - VectorizedRowBatch batch = new VectorizedRowBatch(2,VectorizedRowBatch.DEFAULT_SIZE); + VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[0] = v; BytesColumnVector outV = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[1] = outV; /* * Add these 3 values: - * + * * mixedUp * green * NULL @@ -148,23 +203,23 @@ VectorizedRowBatch makeStringBatchMixedCase() { v.setRef(2, emptyString, 0, emptyString.length); v.isNull[2] = true; v.noNulls = false; - + batch.size = 3; return batch; } - + VectorizedRowBatch makeStringBatchMixedCharSize() { - // create a new batch with one char column (for input) + // create a new batch with one char column (for input) // and one long column (for output) - VectorizedRowBatch batch = new VectorizedRowBatch(2,VectorizedRowBatch.DEFAULT_SIZE); + VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[0] = v; LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[1] = outV; - + /* * Add these 3 values: - * + * * mixedUp * green * NULL @@ -179,82 +234,88 @@ VectorizedRowBatch makeStringBatchMixedCharSize() { v.noNulls = false; v.setRef(3, multiByte, 0, 10); v.isNull[3] = false; - + batch.size = 4; return batch; } - + @Test public void testColLower() { // has nulls, not repeating VectorizedRowBatch batch = makeStringBatchMixedCase(); - StringLower expr = new StringLower(0,1); + StringLower expr = new StringLower(0, 1); expr.evaluate(batch); BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; - int cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], 
outCol.start[0], outCol.length[0]); - Assert.assertEquals(0,cmp); + int cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.isNull[2]); - int cmp2 = StringExpr.compare(green, 0, green.length, outCol.vector[1], outCol.start[1], outCol.length[1]); - Assert.assertEquals(0,cmp2); - + int cmp2 = StringExpr.compare(green, 0, green.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp2); + // no nulls, not repeating batch = makeStringBatchMixedCase(); batch.cols[0].noNulls = true; expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; - cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]); - Assert.assertEquals(0,cmp); + cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.noNulls); - + // has nulls, is repeating batch = makeStringBatchMixedCase(); batch.cols[0].isRepeating = true; expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; - cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]); - Assert.assertEquals(0,cmp); + cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.isRepeating); Assert.assertFalse(outCol.noNulls); - + // no nulls, is repeating batch = makeStringBatchMixedCase(); batch.cols[0].isRepeating = true; batch.cols[0].noNulls = true; expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; - cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]); - Assert.assertEquals(0,cmp); + cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, 
outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.isRepeating); - Assert.assertTrue(outCol.noNulls); + Assert.assertTrue(outCol.noNulls); } - + @Test public void testColUpper() { // no nulls, not repeating - + // We don't test all the combinations because (at least currently) // the logic is inherited to be the same as testColLower, which checks all the cases). VectorizedRowBatch batch = makeStringBatchMixedCase(); - StringUpper expr = new StringUpper(0,1); + StringUpper expr = new StringUpper(0, 1); batch.cols[0].noNulls = true; expr.evaluate(batch); BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; - int cmp = StringExpr.compare(mixedUpUpper, 0, mixedUpUpper.length, outCol.vector[0], outCol.start[0], outCol.length[0]); - Assert.assertEquals(0,cmp); + int cmp = StringExpr.compare(mixedUpUpper, 0, mixedUpUpper.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.noNulls); } - + @Test public void testStringLength() { - + // has nulls, not repeating VectorizedRowBatch batch = makeStringBatchMixedCharSize(); - StringLength expr = new StringLength(0,1); + StringLength expr = new StringLength(0, 1); expr.evaluate(batch); LongColumnVector outCol = (LongColumnVector) batch.cols[1]; - Assert.assertEquals(5,outCol.vector[1]); // length of green is 5 + Assert.assertEquals(5, outCol.vector[1]); // length of green is 5 Assert.assertTrue(outCol.isNull[2]); - Assert.assertEquals(4,outCol.vector[3]); // this one has the mixed-size chars + Assert.assertEquals(4, outCol.vector[3]); // this one has the mixed-size chars // no nulls, not repeating batch = makeStringBatchMixedCharSize(); @@ -262,8 +323,8 @@ public void testStringLength() { expr.evaluate(batch); outCol = (LongColumnVector) batch.cols[1]; Assert.assertTrue(outCol.noNulls); - Assert.assertEquals(4,outCol.vector[3]); // this one has the mixed-size chars - + Assert.assertEquals(4, 
outCol.vector[3]); // this one has the mixed-size chars + // has nulls, is repeating batch = makeStringBatchMixedCharSize(); batch.cols[0].isRepeating = true; @@ -272,7 +333,7 @@ public void testStringLength() { Assert.assertTrue(outCol.isRepeating); Assert.assertFalse(outCol.noNulls); Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp" - + // no nulls, is repeating batch = makeStringBatchMixedCharSize(); batch.cols[0].isRepeating = true; @@ -281,6 +342,60 @@ public void testStringLength() { outCol = (LongColumnVector) batch.cols[1]; Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp" Assert.assertTrue(outCol.isRepeating); - Assert.assertTrue(outCol.noNulls); + Assert.assertTrue(outCol.noNulls); + } + + @Test + public void testStringLike() { + + // has nulls, not repeating + VectorizedRowBatch batch; + Text pattern; + int initialBatchSize; + batch = makeStringBatchMixedCharSize(); + pattern = new Text(mixPercentPattern); + FilterStringColLikeStringScalar expr = new FilterStringColLikeStringScalar(0, pattern); + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // no nulls, not repeating + batch = makeStringBatchMixedCharSize(); + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // has nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + + // same, but repeating value is null + batch = makeStringBatchMixedCharSize(); + batch.cols[0].isRepeating = true; + batch.cols[0].isNull[0] = true; + expr.evaluate(batch); + + // no rows qualify + Assert.assertEquals(0, batch.size); + + // no 
nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); } }