diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java index 24ba861..dbef635 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java @@ -21,35 +21,245 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.io.Text; -import org.apache.hadoop.hive.ql.udf.UDFLike; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.regex.Pattern; + +import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp; /** * Evaluate LIKE filter on a batch for a vector of strings. */ public class FilterStringColLikeStringScalar extends VectorExpression { private int colNum; - private Text likePattern; - private Text s; - private UDFLike likeFunc; + private Pattern compiledPattern; + private final Text simplePattern = new Text(); + private ByteBuffer byteBuffer; + private CharBuffer charBuffer; + private CharsetDecoder decoder; + private PatternType type = PatternType.NONE; + + // Doing characters comparison directly instead of regular expression + // matching for simple patterns like "%abc%". + enum PatternType { + NONE, // "abc" + BEGIN, // "abc%" + END, // "%abc" + MIDDLE, // "%abc%" + COMPLEX, // all other cases, such as "ab%c_de" + } public FilterStringColLikeStringScalar(int colNum, Text likePattern) { this.colNum = colNum; - this.likePattern = likePattern; - likeFunc = new UDFLike(); - s = new Text(); + String stringLikePattern = likePattern.toString(); + parseSimplePattern(stringLikePattern); + if (type == PatternType.COMPLEX) { + compiledPattern = Pattern.compile(likePatternToRegExp(stringLikePattern)); + } + decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + byteBuffer = ByteBuffer.allocate(4); + charBuffer = CharBuffer.allocate(4); } - /* - * This vectorized version of LIKE calls the standard LIKE - * function code. In the future, as an optimization, consider - * unwinding some of that logic here, e.g. to determine - * if the LIKE pattern is a simple one like 'abc%' so that - * can be executed more efficiently as a special case. - */ + PatternType getType() { + return type; + } private boolean like(byte[] bytes, int start, int len) { - s.set(bytes, start, len); - return (likeFunc.evaluate(s, likePattern)).get(); + switch (type) { + case NONE: + return noneLike(bytes, start, len, simplePattern.getBytes()); + case BEGIN: + return beginLike(bytes, start, len, simplePattern.getBytes()); + case END: + return endLike(bytes, start, len, simplePattern.getBytes()); + case MIDDLE: + return midLike(bytes, start, len, simplePattern.getBytes()); + case COMPLEX: + return complexLike(bytes, start, len); + default: + return false; + } + } + + private static boolean noneLike(byte[] byteS, int start, int len, byte[] byteSub) { + int lenSub = byteSub.length; + if (len != lenSub) { + return false; + } + for (int i = start, j = 0; j < len; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + + private static boolean beginLike(byte[] byteS, int start, int len, byte[] byteSub) { + if (len < byteSub.length) { + return false; + } + for (int i = start, j = 0; j < byteSub.length; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + + private static boolean endLike(byte[] byteS, int start, int len, byte[] byteSub) { + int lenSub = byteSub.length; + if (len < lenSub) { + return false; + } + for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + + private static boolean midLike(byte[] byteS, int start, int len, byte[] byteSub) { + int lenSub = byteSub.length; + if (len < lenSub) { + return false; + } + int end = start + len - lenSub + 1; + boolean match = false; + for (int i = start; (i < end) && (!match); i++) { + match = true; + for (int j = 0; j < lenSub; j++) { + if (byteS[i + j] != byteSub[j]) { + match = false; + break; + } + } + } + return match; + } + + /** + * Matches the byte array against the complex like pattern. This method uses + * {@link #compiledPattern} to match. For decoding performance, it caches + * {@link #compiledPattern}, {@link #byteBuffer} and {@link #charBuffer}. + * When the length to decode is greater than the capacity of + * {@link #byteBuffer}, it creates new {@link #byteBuffer} and + * {@link #charBuffer}. The capacity of the new {@link #byteBuffer} is the + * double of the length, for fewer object creations and higher memory + * utilization. + * + * @param byteS + * A byte array that contains a UTF-8 string. + * @param start + * A position to start decoding. + * @param len + * A length to decode. + * @return + * true if the byte array matches the complex like pattern, + * otherwise false. + */ + private boolean complexLike(byte[] byteS, int start, int len) { + // Prepare buffers + if (byteBuffer.capacity() < len) { + byteBuffer = ByteBuffer.allocate(len * 2); + } + byteBuffer.clear(); + byteBuffer.put(byteS, start, len); + byteBuffer.flip(); + + int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte()); + if (charBuffer.capacity() < maxChars) { + charBuffer = CharBuffer.allocate(maxChars); + } + charBuffer.clear(); + + // Decode UTF-8 + decoder.reset(); + decoder.decode(byteBuffer, charBuffer, true); + decoder.flush(charBuffer); + charBuffer.flip(); + + // Match the given bytes with the like pattern + return compiledPattern.matcher(charBuffer).matches(); + } + + /** + * Parses the likePattern. Based on it is a simple pattern or not, the + * function might change two member variables. {@link #type} will be changed + * to the corresponding pattern type; {@link #simplePattern} will record the + * string in it for later pattern matching if it is a simple pattern. + *
+ * Examples:
+ *
+ *
+ * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
+ * and changes {@link #simplePattern} to "abc"
+ * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
+ * and does not change {@link #simplePattern}
+ *
+ *
+ *
+ *
+ * @param likePattern
+ * the input LIKE query pattern
+ */
+ private void parseSimplePattern(String likePattern) {
+ int length = likePattern.length();
+ int beginIndex = 0;
+ int endIndex = length;
+ char lastChar = 'a';
+ String strPattern = new String();
+ type = PatternType.NONE;
+
+ for (int i = 0; i < length; i++) {
+ char n = likePattern.charAt(i);
+ if (n == '_') { // such as "a_b"
+ if (lastChar != '\\') { // such as "a%bc"
+ type = PatternType.COMPLEX;
+ return;
+ } else { // such as "abc\%de%"
+ strPattern += likePattern.substring(beginIndex, i - 1);
+ beginIndex = i;
+ }
+ } else if (n == '%') {
+ if (i == 0) { // such as "%abc"
+ type = PatternType.END;
+ beginIndex = 1;
+ } else if (i < length - 1) {
+ if (lastChar != '\\') { // such as "a%bc"
+ type = PatternType.COMPLEX;
+ return;
+ } else { // such as "abc\%de%"
+ strPattern += likePattern.substring(beginIndex, i - 1);
+ beginIndex = i;
+ }
+ } else {
+ if (lastChar != '\\') {
+ endIndex = length - 1;
+ if (type == PatternType.END) { // such as "%abc%"
+ type = PatternType.MIDDLE;
+ } else {
+ type = PatternType.BEGIN; // such as "abc%"
+ }
+ } else { // such as "abc\%"
+ strPattern += likePattern.substring(beginIndex, i - 1);
+ beginIndex = i;
+ endIndex = length;
+ }
+ }
+ }
+ lastChar = n;
+ }
+
+ strPattern += likePattern.substring(beginIndex, endIndex);
+ simplePattern.set(strPattern);
}
@Override
@@ -61,7 +271,7 @@ public void evaluate(VectorizedRowBatch batch) {
byte[][] vector = inputColVector.vector;
int[] length = inputColVector.length;
int[] start = inputColVector.start;
-
+ byte[] simplePatternBytes = simplePattern.getBytes();
// return immediately if batch is empty
if (n == 0) {
@@ -74,25 +284,97 @@ public void evaluate(VectorizedRowBatch batch) {
// All must be selected otherwise size would be zero Repeating property will not change.
if (!like(vector[0], start[0], length[0])) {
- //Entire batch is filtered out.
+ // Entire batch is filtered out.
batch.size = 0;
}
} else if (batch.selectedInUse) {
int newSize = 0;
- for(int j=0; j != n; j++) {
- int i = sel[j];
- if (like(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
+
+ switch (type) {
+ case NONE:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case BEGIN:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case END:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case MIDDLE:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case COMPLEX:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (complexLike(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
}
+
batch.size = newSize;
} else {
int newSize = 0;
- for(int i = 0; i != n; i++) {
- if (like(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
+
+ switch (type) {
+ case NONE:
+ for (int i = 0; i != n; i++) {
+ if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case BEGIN:
+ for (int i = 0; i != n; i++) {
+ if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case END:
+ for (int i = 0; i != n; i++) {
+ if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case MIDDLE:
+ for (int i = 0; i != n; i++) {
+ if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
+ case COMPLEX:
+ for (int i = 0; i != n; i++) {
+ if (complexLike(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ break;
}
+
if (newSize < n) {
batch.size = newSize;
batch.selectedInUse = true;
@@ -113,26 +395,113 @@ public void evaluate(VectorizedRowBatch batch) {
}
} else if (batch.selectedInUse) {
int newSize = 0;
- for(int j=0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (like(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
+
+ switch (type) {
+ case NONE:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (!nullPos[i]) {
+ if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
+ case BEGIN:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (!nullPos[i]) {
+ if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
+ case END:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (!nullPos[i]) {
+ if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
+ case MIDDLE:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (!nullPos[i]) {
+ if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
+ case COMPLEX:
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (!nullPos[i]) {
+ if (complexLike(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
}
//Change the selected vector
batch.size = newSize;
} else {
int newSize = 0;
- for(int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (like(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
+
+ switch (type) {
+ case NONE:
+ for (int i = 0; i != n; i++) {
+ if (!nullPos[i]) {
+ if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
}
- }
+ break;
+ case BEGIN:
+ for (int i = 0; i != n; i++) {
+ if (!nullPos[i]) {
+ if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
+ case END:
+ for (int i = 0; i != n; i++) {
+ if (!nullPos[i]) {
+ if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
+ case MIDDLE:
+ for (int i = 0; i != n; i++) {
+ if (!nullPos[i]) {
+ if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
+ case COMPLEX:
+ for (int i = 0; i != n; i++) {
+ if (!nullPos[i]) {
+ if (complexLike(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ break;
}
+
if (newSize < n) {
batch.size = newSize;
batch.selectedInUse = true;
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/OldFilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/OldFilterStringColLikeStringScalar.java
new file mode 100644
index 0000000..aa134ba
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/OldFilterStringColLikeStringScalar.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+
+/**
+ * Evaluate LIKE filter on a batch for a vector of strings.
+ */
+public class OldFilterStringColLikeStringScalar extends VectorExpression {
+ private int colNum;
+ private Text likePattern;
+ private Text s;
+ private UDFLike likeFunc;
+
+ public OldFilterStringColLikeStringScalar(int colNum, Text likePattern) {
+ this.colNum = colNum;
+ this.likePattern = likePattern;
+ likeFunc = new UDFLike();
+ s = new Text();
+ }
+
+ /*
+ * This vectorized version of LIKE calls the standard LIKE
+ * function code. In the future, as an optimization, consider
+ * unwinding some of that logic here, e.g. to determine
+ * if the LIKE pattern is a simple one like 'abc%' so that
+ * can be executed more efficiently as a special case.
+ */
+
+ private boolean like(byte[] bytes, int start, int len) {
+ s.set(bytes, start, len);
+ return (likeFunc.evaluate(s, likePattern)).get();
+ }
+
+ @Override
+ public void evaluate(VectorizedRowBatch batch) {
+ BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+ int[] sel = batch.selected;
+ boolean[] nullPos = inputColVector.isNull;
+ int n = batch.size;
+ byte[][] vector = inputColVector.vector;
+ int[] length = inputColVector.length;
+ int[] start = inputColVector.start;
+
+
+ // return immediately if batch is empty
+ if (n == 0) {
+ return;
+ }
+
+ if (inputColVector.noNulls) {
+ if (inputColVector.isRepeating) {
+
+ // All must be selected otherwise size would be zero Repeating property will not change.
+ if (!like(vector[0], start[0], length[0])) {
+
+ //Entire batch is filtered out.
+ batch.size = 0;
+ }
+ } else if (batch.selectedInUse) {
+ int newSize = 0;
+ for(int j=0; j != n; j++) {
+ int i = sel[j];
+ if (like(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ batch.size = newSize;
+ } else {
+ int newSize = 0;
+ for(int i = 0; i != n; i++) {
+ if (like(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ if (newSize < n) {
+ batch.size = newSize;
+ batch.selectedInUse = true;
+ }
+ }
+ } else {
+ if (inputColVector.isRepeating) {
+
+ //All must be selected otherwise size would be zero. Repeating property will not change.
+ if (!nullPos[0]) {
+ if (!like(vector[0], start[0], length[0])) {
+
+ //Entire batch is filtered out.
+ batch.size = 0;
+ }
+ } else {
+ batch.size = 0;
+ }
+ } else if (batch.selectedInUse) {
+ int newSize = 0;
+ for(int j=0; j != n; j++) {
+ int i = sel[j];
+ if (!nullPos[i]) {
+ if (like(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+
+ //Change the selected vector
+ batch.size = newSize;
+ } else {
+ int newSize = 0;
+ for(int i = 0; i != n; i++) {
+ if (!nullPos[i]) {
+ if (like(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+ if (newSize < n) {
+ batch.size = newSize;
+ batch.selectedInUse = true;
+ }
+
+ /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
+ * future operations. So selectedInUse will remain false.
+ */
+ }
+ }
+ }
+
+ @Override
+ public int getOutputColumn() {
+ return -1;
+ }
+
+ @Override
+ public String getOutputType() {
+ return "boolean";
+ }
+}
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestConstantVectorExpression.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestConstantVectorExpression.java
index f2e5399..c656ce4 100644
--- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestConstantVectorExpression.java
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestConstantVectorExpression.java
@@ -27,6 +27,7 @@
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.util.VectorizedRowGroupGenUtil;
import org.junit.Test;
public class TestConstantVectorExpression {
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestFilterStringColLikeStringScalar.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestFilterStringColLikeStringScalar.java
new file mode 100644
index 0000000..2d51ff2
--- /dev/null
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestFilterStringColLikeStringScalar.java
@@ -0,0 +1,162 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import junit.framework.Assert;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+
+import java.io.UnsupportedEncodingException;
+import java.util.regex.Pattern;
+
+public class TestFilterStringColLikeStringScalar {
+ private static byte[] red;
+ private static byte[] red2; // second copy of red, different object
+ private static byte[] green;
+ private static byte[] emptyString;
+ private static byte[] mixedUp;
+ private static byte[] multiByte;
+ private static byte[] beginLikePattern;
+ private static byte[] endLikePattern;
+ private static byte[] midLikePattern;
+ private static byte[] complexLikePattern;
+ private static byte[] noneLikePattern;
+ private static final int SIZE = 100000;
+
+ static {
+ try {
+ red = "red".getBytes("UTF-8");
+ green = "green".getBytes("UTF-8");
+ emptyString = "".getBytes("UTF-8");
+ mixedUp = "mixedUp".getBytes("UTF-8");
+ beginLikePattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
+ endLikePattern = "%Up".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
+ midLikePattern = "%dU%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
+ complexLikePattern = "m%dU%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
+ noneLikePattern = "mixedUp".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
+ multiByte = new byte[100];
+ addMultiByteChars(multiByte);
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ }
+ red2 = new byte[red.length];
+ System.arraycopy(red, 0, red2, 0, red.length);
+ }
+
+ // add some multi-byte characters to test length routine later.
+ // total characters = 4; byte length = 10
+ static void addMultiByteChars(byte[] b) {
+ int i = 0;
+ b[i++] = (byte) 0x41; // letter "A" (1 byte)
+ b[i++] = (byte) 0xC3; // Latin capital A with grave (2 bytes)
+ b[i++] = (byte) 0x80;
+ b[i++] = (byte) 0xE2; // Euro sign (3 bytes)
+ b[i++] = (byte) 0x82;
+ b[i++] = (byte) 0xAC;
+ b[i++] = (byte) 0xF0; // Asian character U+24B62 (4 bytes)
+ b[i++] = (byte) 0xA4;
+ b[i++] = (byte) 0xAD;
+ b[i++] = (byte) 0xA2;
+ }
+
+ VectorizedRowBatch makeStringBatchMixedCharSize() {
+
+ // create a new batch with one char column (for input) and one long column (for output)
+ VectorizedRowBatch batch = new VectorizedRowBatch(2, SIZE);
+ BytesColumnVector v = new BytesColumnVector(SIZE);
+ batch.cols[0] = v;
+ LongColumnVector outV = new LongColumnVector(SIZE);
+ batch.cols[1] = outV;
+
+ /*
+ * Add these 3 values:
+ *
+ * mixedUp
+ * green
+ * NULL
+ * <4 char string with mult-byte chars>
+ */
+ for (int i = 0; i < SIZE; i += 4 ) {
+ v.setRef(i + 0, mixedUp, 0, mixedUp.length);
+ v.isNull[i + 0] = false;
+ v.setRef(i + 1, green, 0, green.length);
+ v.isNull[i + 1] = false;
+ v.setRef(i + 2, emptyString, 0, emptyString.length);
+ v.isNull[i + 2] = true;
+ v.noNulls = false;
+ v.setRef(i + 3, multiByte, 0, 10);
+ v.isNull[i + 3] = false;
+ }
+
+ batch.size = SIZE;
+ return batch;
+ }
+
+ private void testLike(byte[] patternBytes) {
+ long startTime;
+ Text pattern;
+ VectorizedRowBatch batch;
+ long newExprTime;
+ long oldExprTime;
+ pattern = new Text(patternBytes);
+ batch = makeStringBatchMixedCharSize();
+ Pattern.compile(pattern.toString()); // Warm up
+
+ System.out.println("----");
+ System.out.println(pattern);
+
+ // Prepare
+ FilterStringColLikeStringScalar newExpr = new FilterStringColLikeStringScalar(0, pattern);
+
+ // Test 100000 times
+ startTime = System.currentTimeMillis();
+ for (int i = 0; i < 1000; i++) {
+ newExpr.evaluate(batch);
+ Assert.assertEquals(SIZE / 4, batch.size);
+ Assert.assertEquals(0, batch.selected[0]);
+ }
+ newExprTime = System.currentTimeMillis() - startTime;
+ System.out.println("new " + newExprTime + "ms.");
+
+ // Prepare
+ OldFilterStringColLikeStringScalar oldExpr = new OldFilterStringColLikeStringScalar(0, pattern);
+
+ // Test 100000 times
+ startTime = System.currentTimeMillis();
+ for (int i = 0; i < 1000; i++) {
+ oldExpr.evaluate(batch);
+ Assert.assertEquals(SIZE / 4, batch.size);
+ Assert.assertEquals(0, batch.selected[0]);
+ }
+ oldExprTime = System.currentTimeMillis() - startTime;
+ System.out.println("old " + oldExprTime + "ms.");
+
+ System.out.println((oldExprTime / (float)newExprTime - 1) * 100 + "%\tfaster");
+ }
+
+ @Test
+ public void testBeginLike() {
+ testLike(beginLikePattern);
+ }
+
+ @Test
+ public void testEndLike() {
+ testLike(endLikePattern);
+ }
+
+ @Test
+ public void testMidLike() {
+ testLike(midLikePattern);
+ }
+
+ @Test
+ public void testComplexLike() {
+ testLike(complexLikePattern);
+ }
+
+ @Test
+ public void testNoneLike() {
+ testLike(noneLikePattern);
+ }
+}
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
index 6e26412..6b3db27 100644
--- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
@@ -624,6 +624,35 @@ public void testStringLike() {
Assert.assertEquals(initialBatchSize, batch.size);
}
+ public void testStringLikePatternType() {
+ FilterStringColLikeStringScalar expr;
+
+ // BEGIN pattern
+ expr = new FilterStringColLikeStringScalar(0, new Text("abc%"));
+ Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.BEGIN,
+ expr.getType());
+
+ // END pattern
+ expr = new FilterStringColLikeStringScalar(0, new Text("%abc"));
+ Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.END,
+ expr.getType());
+
+ // MIDDLE pattern
+ expr = new FilterStringColLikeStringScalar(0, new Text("%abc%"));
+ Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.MIDDLE,
+ expr.getType());
+
+ // COMPLEX pattern
+ expr = new FilterStringColLikeStringScalar(0, new Text("%abc%de"));
+ Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.COMPLEX,
+ expr.getType());
+
+ // NONE pattern
+ expr = new FilterStringColLikeStringScalar(0, new Text("abc"));
+ Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.NONE,
+ expr.getType());
+ }
+
@Test
public void testColConcatScalar() {