diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java index 24ba861..dbef635 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java @@ -21,35 +21,245 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.io.Text; -import org.apache.hadoop.hive.ql.udf.UDFLike; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.regex.Pattern; + +import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp; /** * Evaluate LIKE filter on a batch for a vector of strings. */ public class FilterStringColLikeStringScalar extends VectorExpression { private int colNum; - private Text likePattern; - private Text s; - private UDFLike likeFunc; + private Pattern compiledPattern; + private final Text simplePattern = new Text(); + private ByteBuffer byteBuffer; + private CharBuffer charBuffer; + private CharsetDecoder decoder; + private PatternType type = PatternType.NONE; + + // Doing characters comparison directly instead of regular expression + // matching for simple patterns like "%abc%". + enum PatternType { + NONE, // "abc" + BEGIN, // "abc%" + END, // "%abc" + MIDDLE, // "%abc%" + COMPLEX, // all other cases, such as "ab%c_de" + } public FilterStringColLikeStringScalar(int colNum, Text likePattern) { this.colNum = colNum; - this.likePattern = likePattern; - likeFunc = new UDFLike(); - s = new Text(); + String stringLikePattern = likePattern.toString(); + parseSimplePattern(stringLikePattern); + if (type == PatternType.COMPLEX) { + compiledPattern = Pattern.compile(likePatternToRegExp(stringLikePattern)); + } + decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + byteBuffer = ByteBuffer.allocate(4); + charBuffer = CharBuffer.allocate(4); } - /* - * This vectorized version of LIKE calls the standard LIKE - * function code. In the future, as an optimization, consider - * unwinding some of that logic here, e.g. to determine - * if the LIKE pattern is a simple one like 'abc%' so that - * can be executed more efficiently as a special case. - */ + PatternType getType() { + return type; + } private boolean like(byte[] bytes, int start, int len) { - s.set(bytes, start, len); - return (likeFunc.evaluate(s, likePattern)).get(); + switch (type) { + case NONE: + return noneLike(bytes, start, len, simplePattern.getBytes()); + case BEGIN: + return beginLike(bytes, start, len, simplePattern.getBytes()); + case END: + return endLike(bytes, start, len, simplePattern.getBytes()); + case MIDDLE: + return midLike(bytes, start, len, simplePattern.getBytes()); + case COMPLEX: + return complexLike(bytes, start, len); + default: + return false; + } + } + + private static boolean noneLike(byte[] byteS, int start, int len, byte[] byteSub) { + int lenSub = byteSub.length; + if (len != lenSub) { + return false; + } + for (int i = start, j = 0; j < len; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + + private static boolean beginLike(byte[] byteS, int start, int len, byte[] byteSub) { + if (len < byteSub.length) { + return false; + } + for (int i = start, j = 0; j < byteSub.length; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + + private static boolean endLike(byte[] byteS, int start, int len, byte[] byteSub) { + int lenSub = byteSub.length; + if (len < lenSub) { + return false; + } + for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + + private static boolean midLike(byte[] byteS, int start, int len, byte[] byteSub) { + int lenSub = byteSub.length; + if (len < lenSub) { + return false; + } + int end = start + len - lenSub + 1; + boolean match = false; + for (int i = start; (i < end) && (!match); i++) { + match = true; + for (int j = 0; j < lenSub; j++) { + if (byteS[i + j] != byteSub[j]) { + match = false; + break; + } + } + } + return match; + } + + /** + * Matches the byte array against the complex like pattern. This method uses + * {@link #compiledPattern} to match. For decoding performance, it caches + * {@link #compiledPattern}, {@link #byteBuffer} and {@link #charBuffer}. + * When the length to decode is greater than the capacity of + * {@link #byteBuffer}, it creates new {@link #byteBuffer} and + * {@link #charBuffer}. The capacity of the new {@link #byteBuffer} is the + * double of the length, for fewer object creations and higher memory + * utilization. + * + * @param byteS + * A byte array that contains a UTF-8 string. + * @param start + * A position to start decoding. + * @param len + * A length to decode. + * @return + * true if the byte array matches the complex like pattern, + * otherwise false. + */ + private boolean complexLike(byte[] byteS, int start, int len) { + // Prepare buffers + if (byteBuffer.capacity() < len) { + byteBuffer = ByteBuffer.allocate(len * 2); + } + byteBuffer.clear(); + byteBuffer.put(byteS, start, len); + byteBuffer.flip(); + + int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte()); + if (charBuffer.capacity() < maxChars) { + charBuffer = CharBuffer.allocate(maxChars); + } + charBuffer.clear(); + + // Decode UTF-8 + decoder.reset(); + decoder.decode(byteBuffer, charBuffer, true); + decoder.flush(charBuffer); + charBuffer.flip(); + + // Match the given bytes with the like pattern + return compiledPattern.matcher(charBuffer).matches(); + } + + /** + * Parses the likePattern. Based on it is a simple pattern or not, the + * function might change two member variables. {@link #type} will be changed + * to the corresponding pattern type; {@link #simplePattern} will record the + * string in it for later pattern matching if it is a simple pattern. + *

+ * Examples:

+ * + *
+   * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
+   * and changes {@link #simplePattern} to "abc"
+   * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
+   * and does not change {@link #simplePattern}
+   * 
+ * + *
+ * + * @param likePattern + * the input LIKE query pattern + */ + private void parseSimplePattern(String likePattern) { + int length = likePattern.length(); + int beginIndex = 0; + int endIndex = length; + char lastChar = 'a'; + String strPattern = new String(); + type = PatternType.NONE; + + for (int i = 0; i < length; i++) { + char n = likePattern.charAt(i); + if (n == '_') { // such as "a_b" + if (lastChar != '\\') { // such as "a%bc" + type = PatternType.COMPLEX; + return; + } else { // such as "abc\%de%" + strPattern += likePattern.substring(beginIndex, i - 1); + beginIndex = i; + } + } else if (n == '%') { + if (i == 0) { // such as "%abc" + type = PatternType.END; + beginIndex = 1; + } else if (i < length - 1) { + if (lastChar != '\\') { // such as "a%bc" + type = PatternType.COMPLEX; + return; + } else { // such as "abc\%de%" + strPattern += likePattern.substring(beginIndex, i - 1); + beginIndex = i; + } + } else { + if (lastChar != '\\') { + endIndex = length - 1; + if (type == PatternType.END) { // such as "%abc%" + type = PatternType.MIDDLE; + } else { + type = PatternType.BEGIN; // such as "abc%" + } + } else { // such as "abc\%" + strPattern += likePattern.substring(beginIndex, i - 1); + beginIndex = i; + endIndex = length; + } + } + } + lastChar = n; + } + + strPattern += likePattern.substring(beginIndex, endIndex); + simplePattern.set(strPattern); } @Override @@ -61,7 +271,7 @@ public void evaluate(VectorizedRowBatch batch) { byte[][] vector = inputColVector.vector; int[] length = inputColVector.length; int[] start = inputColVector.start; - + byte[] simplePatternBytes = simplePattern.getBytes(); // return immediately if batch is empty if (n == 0) { @@ -74,25 +284,97 @@ public void evaluate(VectorizedRowBatch batch) { // All must be selected otherwise size would be zero Repeating property will not change. if (!like(vector[0], start[0], length[0])) { - //Entire batch is filtered out. + // Entire batch is filtered out. batch.size = 0; } } else if (batch.selectedInUse) { int newSize = 0; - for(int j=0; j != n; j++) { - int i = sel[j]; - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } + + switch (type) { + case NONE: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case BEGIN: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case END: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case MIDDLE: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case COMPLEX: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (complexLike(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + break; } + batch.size = newSize; } else { int newSize = 0; - for(int i = 0; i != n; i++) { - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } + + switch (type) { + case NONE: + for (int i = 0; i != n; i++) { + if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case BEGIN: + for (int i = 0; i != n; i++) { + if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case END: + for (int i = 0; i != n; i++) { + if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case MIDDLE: + for (int i = 0; i != n; i++) { + if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + break; + case COMPLEX: + for (int i = 0; i != n; i++) { + if (complexLike(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + break; } + if (newSize < n) { batch.size = newSize; batch.selectedInUse = true; @@ -113,26 +395,113 @@ public void evaluate(VectorizedRowBatch batch) { } } else if (batch.selectedInUse) { int newSize = 0; - for(int j=0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } + + switch (type) { + case NONE: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + } + break; + case BEGIN: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + } + break; + case END: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + } + break; + case MIDDLE: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + } + break; + case COMPLEX: + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (complexLike(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + break; } //Change the selected vector batch.size = newSize; } else { int newSize = 0; - for(int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; + + switch (type) { + case NONE: + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } } - } + break; + case BEGIN: + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + } + break; + case END: + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + } + break; + case MIDDLE: + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { + sel[newSize++] = i; + } + } + } + break; + case COMPLEX: + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (complexLike(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + break; } + if (newSize < n) { batch.size = newSize; batch.selectedInUse = true; diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index 6e26412..c76060b 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -624,6 +624,52 @@ public void testStringLike() { Assert.assertEquals(initialBatchSize, batch.size); } + public void testStringLikePatternType() { + FilterStringColLikeStringScalar expr; + + // BEGIN pattern + expr = new FilterStringColLikeStringScalar(0, new Text("abc%")); + Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.BEGIN, + expr.getType()); + + // END pattern + expr = new FilterStringColLikeStringScalar(0, new Text("%abc")); + Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.END, + expr.getType()); + + // MIDDLE pattern + expr = new FilterStringColLikeStringScalar(0, new Text("%abc%")); + Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.MIDDLE, + expr.getType()); + + // COMPLEX pattern + expr = new FilterStringColLikeStringScalar(0, new Text("%abc%de")); + Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.COMPLEX, + expr.getType()); + + // NONE pattern + expr = new FilterStringColLikeStringScalar(0, new Text("abc")); + Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.NONE, + expr.getType()); + } + + public void testStringLikeMultiByte() { + FilterStringColLikeStringScalar expr; + VectorizedRowBatch batch; + + // verify that a multi byte LIKE expression matches a matching string + batch = makeStringBatchMixedCharSize(); + expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "%")); + expr.evaluate(batch); + Assert.assertEquals(batch.size, 1); + + // verify that a multi byte LIKE expression doesn't match a non-matching string + batch = makeStringBatchMixedCharSize(); + expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "x")); + expr.evaluate(batch); + Assert.assertEquals(batch.size, 0); + } + @Test public void testColConcatScalar() {