diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
new file mode 100644
index 0000000..24ba861
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
@@ -0,0 +1,184 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+
+/**
+ * Evaluate LIKE filter on a batch for a vector of strings.
+ *
+ * Rows whose value is null or does not match the pattern are dropped from the
+ * batch by rewriting batch.selected, batch.size and batch.selectedInUse. This is
+ * a filter, so no output column is produced.
+ */
+public class FilterStringColLikeStringScalar extends VectorExpression {
+  private final int colNum;
+  private final Text likePattern;
+
+  // Scratch object refilled for each row so that no Text is allocated per row.
+  private final Text s;
+  private final UDFLike likeFunc;
+
+  /**
+   * @param colNum index in batch.cols of the string column to filter
+   * @param likePattern LIKE pattern that every row is compared against
+   */
+  public FilterStringColLikeStringScalar(int colNum, Text likePattern) {
+    this.colNum = colNum;
+    this.likePattern = likePattern;
+    likeFunc = new UDFLike();
+    s = new Text();
+  }
+
+  /*
+   * This vectorized version of LIKE calls the standard LIKE
+   * function code. In the future, as an optimization, consider
+   * unwinding some of that logic here, e.g. to determine
+   * if the LIKE pattern is a simple one like 'abc%' so that
+   * can be executed more efficiently as a special case.
+   */
+
+  // Match bytes[start .. start+len) against the pattern. Callers must skip null rows
+  // and must not call this when likePattern is null.
+  private boolean like(byte[] bytes, int start, int len) {
+    s.set(bytes, start, len);
+    return (likeFunc.evaluate(s, likePattern)).get();
+  }
+
+  /**
+   * Filter the batch in place, keeping only the rows whose value matches the pattern.
+   *
+   * @param batch batch to filter; its selected, size and selectedInUse fields may be modified
+   */
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] length = inputColVector.length;
+    int[] start = inputColVector.start;
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    /* x LIKE NULL evaluates to NULL, which never passes a filter. Handle it here because
+     * like() calls get() on whatever UDFLike.evaluate() returns, so a null result would throw.
+     */
+    if (likePattern == null) {
+      batch.size = 0;
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero. Repeating property will not change.
+        if (!like(vector[0], start[0], length[0])) {
+
+          // Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (like(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for (int i = 0; i != n; i++) {
+          if (like(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero. Repeating property will not change.
+        if (!nullPos[0]) {
+          if (!like(vector[0], start[0], length[0])) {
+
+            // Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+
+          // The repeated value is null, so no row can match.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+            if (like(vector[i], start[i], length[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+
+        // Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for (int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (like(vector[i], start[i], length[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+
+        /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
+         * future operations. So selectedInUse will remain false.
+         */
+      }
+    }
+  }
+
+  // This expression is a filter and writes no output column.
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index 9c894dc..c0e71e5 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -31,6 +31,7 @@ import java.io.UnsupportedEncodingException; import java.util.Arrays; +import org.apache.hadoop.io.Text; /** * Test vectorized expression and filter evaluation for strings. @@ -64,14 +65,14 @@ red2 = new byte[red.length]; System.arraycopy(red, 0, red2, 0, red.length); } - + // add some multi-byte characters to test length routine later.
// total characters = 4; byte length = 10 static void addMultiByteChars(byte[] b) { int i = 0; b[i++] = (byte) 0x41; // letter "A" (1 byte) b[i++] = (byte) 0xC3; // Latin capital A with grave (2 bytes) - b[i++] = (byte) 0x80; + b[i++] = (byte) 0x80; b[i++] = (byte) 0xE2; // Euro sign (3 bytes) b[i++] = (byte) 0x82; b[i++] = (byte) 0xAC; @@ -80,9 +81,9 @@ static void addMultiByteChars(byte[] b) { b[i++] = (byte) 0xAD; b[i++] = (byte) 0xA2; } - + @Test - // Load a BytesColumnVector by copying in large data, enough to force + // Load a BytesColumnVector by copying in large data, enough to force // the buffer to expand. public void testLoadBytesColumnVectorByValueLargeData() { BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); @@ -102,7 +103,7 @@ public void testLoadBytesColumnVectorByValueLargeData() { } Assert.assertTrue(bcv.bufferSize() >= b.length * VectorizedRowBatch.DEFAULT_SIZE); } - + @Test // set values by reference, copy the data out, and verify equality public void testLoadBytesColumnVectorByRef() { @@ -208,8 +209,8 @@ VectorizedRowBatch makeStringBatchMixedCase() { } VectorizedRowBatch makeStringBatchMixedCharSize() { - // create a new batch with one char column (for input) - // and one long column (for output) + + // create a new batch with one char column (for input) and one long column (for output) VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[0] = v; @@ -289,10 +290,12 @@ public void testColLower() { @Test public void testColUpper() { + // no nulls, not repeating - // We don't test all the combinations because (at least currently) - // the logic is inherited to be the same as testColLower, which checks all the cases). + /* We don't test all the combinations because (at least currently) + * the logic is inherited to be the same as testColLower, which checks all the cases.
+ */ VectorizedRowBatch batch = makeStringBatchMixedCase(); StringUpper expr = new StringUpper(0, 1); batch.cols[0].noNulls = true; @@ -332,7 +335,7 @@ public void testStringLength() { Assert.assertTrue(outCol.isRepeating); Assert.assertFalse(outCol.noNulls); Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp" - + // no nulls, is repeating batch = makeStringBatchMixedCharSize(); batch.cols[0].isRepeating = true; @@ -341,6 +344,60 @@ public void testStringLength() { outCol = (LongColumnVector) batch.cols[1]; Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp" Assert.assertTrue(outCol.isRepeating); - Assert.assertTrue(outCol.noNulls); + Assert.assertTrue(outCol.noNulls); + } + + @Test + public void testStringLike() { + + // has nulls, not repeating + VectorizedRowBatch batch; + Text pattern; + int initialBatchSize; + batch = makeStringBatchMixedCharSize(); + pattern = new Text(mixPercentPattern); + FilterStringColLikeStringScalar expr = new FilterStringColLikeStringScalar(0, pattern); + expr.evaluate(batch); + + // verify that the first entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // no nulls, not repeating + batch = makeStringBatchMixedCharSize(); + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // verify that the first entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // has nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + + // same, but repeating value is null + batch = makeStringBatchMixedCharSize(); + batch.cols[0].isRepeating = true; + batch.cols[0].isNull[0] = true; + expr.evaluate(batch); + + // no rows qualify + Assert.assertEquals(0, batch.size); + + // no nulls, is repeating + batch = 
makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); } }