diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java new file mode 100644 index 0000000..da39b60 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java @@ -0,0 +1,365 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.io.Text; + +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp; + +abstract class AbstractFilterStringColLikeStringScalar extends VectorExpression { + private final int colNum; + private Checker checker; + + public AbstractFilterStringColLikeStringScalar(int colNum, Text likePattern) { + this.colNum = colNum; + checker = createChecker(likePattern.toString()); + } + + protected abstract List getCheckerFactories(); + + private Checker createChecker(String pattern) { + for (CheckerFactory checkerFactory : getCheckerFactories()) { + Checker checker = checkerFactory.tryCreate(pattern); + if (checker != null) { + return checker; + } + } + return null; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; + int[] sel = batch.selected; + boolean[] nullPos = inputColVector.isNull; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int[] length = inputColVector.length; + int[] start = inputColVector.start; + + // return immediately if batch is empty + if (n == 0) { + return; + } + + if (inputColVector.noNulls) { + if (inputColVector.isRepeating) { + + // All must be selected otherwise size would be zero Repeating property will not change. + if (!checker.check(vector[0], start[0], length[0])) { + + // Entire batch is filtered out. + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + + batch.size = newSize; + } else { + int newSize = 0; + for (int i = 0; i != n; i++) { + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } else { + if (inputColVector.isRepeating) { + + //All must be selected otherwise size would be zero. Repeating property will not change. + if (!nullPos[0]) { + if (!checker.check(vector[0], start[0], length[0])) { + + //Entire batch is filtered out. + batch.size = 0; + } + } else { + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + + //Change the selected vector + batch.size = newSize; + } else { + int newSize = 0; + + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + + /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline + * future operations. So selectedInUse will remain false. + */ + } + } + } + + @Override + public int getOutputColumn() { + return -1; + } + + @Override + public String getOutputType() { + return "boolean"; + } + + protected static interface Checker { + boolean check(byte[] byteS, int start, int len); + } + + protected static interface CheckerFactory { + Checker tryCreate(String pattern); + } + + protected static class NoneChecker implements Checker { + byte [] byteSub; + + public NoneChecker(String regExpPattern) { + byteSub = new Text(regExpPattern).getBytes(); + } + + public boolean check(byte[] byteS, int start, int len) { + int lenSub = byteSub.length; + if (len != lenSub) { + return false; + } + for (int i = start, j = 0; j < len; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + } + + protected static class BeginChecker implements Checker { + byte [] byteSub; + + public BeginChecker(String group) { + byteSub = new Text(group).getBytes(); + } + + public boolean check(byte[] byteS, int start, int len) { + if (len < byteSub.length) { + return false; + } + for (int i = start, j = 0; j < byteSub.length; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + } + + protected static class EndChecker implements Checker { + byte [] byteSub; + + public EndChecker(String group) { + byteSub = new Text(group).getBytes(); + } + + public boolean check(byte[] byteS, int start, int len) { + int lenSub = byteSub.length; + if (len < lenSub) { + return false; + } + for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + } + + protected static class MiddleChecker implements Checker { + byte [] byteSub; + int lenSub; + + public MiddleChecker(String group) { + try { + byteSub = group.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + lenSub = byteSub.length; + } + + public boolean check(byte[] byteS, int start, int len) { + if (len < lenSub) { + return false; + } + int end = start + len - lenSub + 1; + boolean match = false; + for (int i = start; i < end; i++) { + match = true; + for (int j = 0; j < lenSub; j++) { + if (byteS[i + j] != byteSub[j]) { + match = false; + break; + } + } + if (match) { + return true; + } + } + return match; + } + } + + protected static class AnyCharChecker implements Checker { + private String regExpPattern; + private FastUTF8Decoder decoder; + private int patternLength; + private char anySingleCharPattern; + + public AnyCharChecker(char anySingleCharPattern, String regExpPattern) { + this.regExpPattern = regExpPattern; + this.patternLength = regExpPattern.length(); + this.anySingleCharPattern = anySingleCharPattern; + this.decoder = new FastUTF8Decoder(); + } + + public boolean check(byte[] byteS, int start, int len) { + CharBuffer decoded = decoder.decodeUnsafely(byteS, start, len); + if (decoded.length() != patternLength) { + return false; + } + + for (int i = 0; i < patternLength; i++) { + char c = regExpPattern.charAt(i); + if (c == anySingleCharPattern) { + continue; + } + if (c != decoded.charAt(i)) { + return false; + } + } + + return true; + } + } + + protected static class ComplexChecker implements Checker { + private final Pattern compiledPattern; + private final Matcher matcher; + private final FastUTF8Decoder decoder; + + public ComplexChecker(String regExpPattern) { + compiledPattern = Pattern.compile(regExpPattern); + matcher = compiledPattern.matcher(""); + decoder = new FastUTF8Decoder(); + } + + public boolean check(byte[] byteS, int start, int len) { + // Match the given bytes with the like pattern + matcher.reset(decoder.decodeUnsafely(byteS, start, len)); + return matcher.matches(); + } + } + + private static class FastUTF8Decoder { + private final CharsetDecoder decoder; + + private ByteBuffer byteBuffer; + private CharBuffer charBuffer; + + public FastUTF8Decoder() { + decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + byteBuffer = ByteBuffer.allocate(4); + charBuffer = CharBuffer.allocate(4); + } + + public CharBuffer decodeUnsafely(byte[] byteS, int start, int len) { + // Prepare buffers + if (byteBuffer.capacity() < len) { + byteBuffer = ByteBuffer.allocate(len * 2); + } + byteBuffer.clear(); + byteBuffer.put(byteS, start, len); + byteBuffer.flip(); + + int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte()); + if (charBuffer.capacity() < maxChars) { + charBuffer = CharBuffer.allocate(maxChars); + } + charBuffer.clear(); + + // Decode UTF-8 + decoder.reset(); + decoder.decode(byteBuffer, charBuffer, true); + decoder.flush(charBuffer); + charBuffer.flip(); + + return charBuffer; + } + } + + String getCheckerName() { + return checker.getClass().getSimpleName(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java index dbef635..705af6f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java @@ -18,509 +18,99 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.udf.UDFLike; import org.apache.hadoop.io.Text; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp; - /** * Evaluate LIKE filter on a batch for a vector of strings. */ -public class FilterStringColLikeStringScalar extends VectorExpression { - private int colNum; - private Pattern compiledPattern; - private final Text simplePattern = new Text(); - private ByteBuffer byteBuffer; - private CharBuffer charBuffer; - private CharsetDecoder decoder; - private PatternType type = PatternType.NONE; +public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar { - // Doing characters comparison directly instead of regular expression - // matching for simple patterns like "%abc%". - enum PatternType { - NONE, // "abc" - BEGIN, // "abc%" - END, // "%abc" - MIDDLE, // "%abc%" - COMPLEX, // all other cases, such as "ab%c_de" - } + private static List checkerFactories = Arrays.asList( + new LikeBeginCheckerFactory(), + new LikeEndCheckerFactory(), + new LikeMiddleCheckerFactory(), + new LikeAnyCharCheckerFactory(), + new LikeNoneCheckerFactory(), + new LikeComplexCheckerFactory()); public FilterStringColLikeStringScalar(int colNum, Text likePattern) { - this.colNum = colNum; - String stringLikePattern = likePattern.toString(); - parseSimplePattern(stringLikePattern); - if (type == PatternType.COMPLEX) { - compiledPattern = Pattern.compile(likePatternToRegExp(stringLikePattern)); - } - decoder = Charset.forName("UTF-8").newDecoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - byteBuffer = ByteBuffer.allocate(4); - charBuffer = CharBuffer.allocate(4); + super(colNum, likePattern); } - PatternType getType() { - return type; - } - - private boolean like(byte[] bytes, int start, int len) { - switch (type) { - case NONE: - return noneLike(bytes, start, len, simplePattern.getBytes()); - case BEGIN: - return beginLike(bytes, start, len, simplePattern.getBytes()); - case END: - return endLike(bytes, start, len, simplePattern.getBytes()); - case MIDDLE: - return midLike(bytes, start, len, simplePattern.getBytes()); - case COMPLEX: - return complexLike(bytes, start, len); - default: - return false; - } + @Override + protected List getCheckerFactories() { + return checkerFactories; } - private static boolean noneLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len != lenSub) { - return false; - } - for (int i = start, j = 0; j < len; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; - } - } - return true; - } + private static class LikeBeginCheckerFactory implements CheckerFactory { + private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%"); - private static boolean beginLike(byte[] byteS, int start, int len, byte[] byteSub) { - if (len < byteSub.length) { - return false; - } - for (int i = start, j = 0; j < byteSub.length; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; + public Checker tryCreate(String pattern) { + Matcher matcher = BEGIN_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new BeginChecker(matcher.group(1)); } + return null; } - return true; } - private static boolean endLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len < lenSub) { - return false; - } - for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; - } - } - return true; - } + private static class LikeEndCheckerFactory implements CheckerFactory { + private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)"); - private static boolean midLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len < lenSub) { - return false; - } - int end = start + len - lenSub + 1; - boolean match = false; - for (int i = start; (i < end) && (!match); i++) { - match = true; - for (int j = 0; j < lenSub; j++) { - if (byteS[i + j] != byteSub[j]) { - match = false; - break; - } + public Checker tryCreate(String pattern) { + Matcher matcher = END_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new EndChecker(matcher.group(1)); } + return null; } - return match; } - /** - * Matches the byte array against the complex like pattern. This method uses - * {@link #compiledPattern} to match. For decoding performance, it caches - * {@link #compiledPattern}, {@link #byteBuffer} and {@link #charBuffer}. - * When the length to decode is greater than the capacity of - * {@link #byteBuffer}, it creates new {@link #byteBuffer} and - * {@link #charBuffer}. The capacity of the new {@link #byteBuffer} is the - * double of the length, for fewer object creations and higher memory - * utilization. - * - * @param byteS - * A byte array that contains a UTF-8 string. - * @param start - * A position to start decoding. - * @param len - * A length to decode. - * @return - * true if the byte array matches the complex like pattern, - * otherwise false. - */ - private boolean complexLike(byte[] byteS, int start, int len) { - // Prepare buffers - if (byteBuffer.capacity() < len) { - byteBuffer = ByteBuffer.allocate(len * 2); - } - byteBuffer.clear(); - byteBuffer.put(byteS, start, len); - byteBuffer.flip(); + private static class LikeMiddleCheckerFactory implements CheckerFactory { + private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%"); - int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte()); - if (charBuffer.capacity() < maxChars) { - charBuffer = CharBuffer.allocate(maxChars); + public Checker tryCreate(String pattern) { + Matcher matcher = MIDDLE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new MiddleChecker(matcher.group(1)); + } + return null; } - charBuffer.clear(); - - // Decode UTF-8 - decoder.reset(); - decoder.decode(byteBuffer, charBuffer, true); - decoder.flush(charBuffer); - charBuffer.flip(); - - // Match the given bytes with the like pattern - return compiledPattern.matcher(charBuffer).matches(); } - /** - * Parses the likePattern. Based on it is a simple pattern or not, the - * function might change two member variables. {@link #type} will be changed - * to the corresponding pattern type; {@link #simplePattern} will record the - * string in it for later pattern matching if it is a simple pattern. - *

- * Examples:

- * - *
-   * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
-   * and changes {@link #simplePattern} to "abc"
-   * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
-   * and does not change {@link #simplePattern}
-   * 
- * - *
- * - * @param likePattern - * the input LIKE query pattern - */ - private void parseSimplePattern(String likePattern) { - int length = likePattern.length(); - int beginIndex = 0; - int endIndex = length; - char lastChar = 'a'; - String strPattern = new String(); - type = PatternType.NONE; + private static class LikeAnyCharCheckerFactory implements CheckerFactory { + private static final Pattern ANY_CHAR_PATTERN = Pattern.compile("[^%_]*_[^%]*"); - for (int i = 0; i < length; i++) { - char n = likePattern.charAt(i); - if (n == '_') { // such as "a_b" - if (lastChar != '\\') { // such as "a%bc" - type = PatternType.COMPLEX; - return; - } else { // such as "abc\%de%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - } - } else if (n == '%') { - if (i == 0) { // such as "%abc" - type = PatternType.END; - beginIndex = 1; - } else if (i < length - 1) { - if (lastChar != '\\') { // such as "a%bc" - type = PatternType.COMPLEX; - return; - } else { // such as "abc\%de%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - } - } else { - if (lastChar != '\\') { - endIndex = length - 1; - if (type == PatternType.END) { // such as "%abc%" - type = PatternType.MIDDLE; - } else { - type = PatternType.BEGIN; // such as "abc%" - } - } else { // such as "abc\%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - endIndex = length; - } - } + public Checker tryCreate(String pattern) { + Matcher matcher = ANY_CHAR_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new AnyCharChecker('_', pattern); } - lastChar = n; + return null; } - - strPattern += likePattern.substring(beginIndex, endIndex); - simplePattern.set(strPattern); } - @Override - public void evaluate(VectorizedRowBatch batch) { - BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; - int[] sel = batch.selected; - boolean[] nullPos = inputColVector.isNull; - int n = batch.size; - byte[][] vector = inputColVector.vector; - int[] length = inputColVector.length; - int[] start = inputColVector.start; - byte[] simplePatternBytes = simplePattern.getBytes(); - - // return immediately if batch is empty - if (n == 0) { - return; - } - - if (inputColVector.noNulls) { - if (inputColVector.isRepeating) { - - // All must be selected otherwise size would be zero Repeating property will not change. - if (!like(vector[0], start[0], length[0])) { - - // Entire batch is filtered out. - batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - - switch (type) { - case NONE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case BEGIN: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case END: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case MIDDLE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case COMPLEX: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - break; - } - - batch.size = newSize; - } else { - int newSize = 0; - - switch (type) { - case NONE: - for (int i = 0; i != n; i++) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case BEGIN: - for (int i = 0; i != n; i++) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case END: - for (int i = 0; i != n; i++) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case MIDDLE: - for (int i = 0; i != n; i++) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case COMPLEX: - for (int i = 0; i != n; i++) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - break; - } - - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } - } - } else { - if (inputColVector.isRepeating) { + private static class LikeNoneCheckerFactory implements CheckerFactory { + private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+"); - //All must be selected otherwise size would be zero. Repeating property will not change. - if (!nullPos[0]) { - if (!like(vector[0], start[0], length[0])) { - - //Entire batch is filtered out. - batch.size = 0; - } - } else { - batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - - switch (type) { - case NONE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case BEGIN: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case END: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case MIDDLE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case COMPLEX: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - break; - } - - //Change the selected vector - batch.size = newSize; - } else { - int newSize = 0; - - switch (type) { - case NONE: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case BEGIN: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case END: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case MIDDLE: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case COMPLEX: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - break; - } - - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } - - /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline - * future operations. So selectedInUse will remain false. - */ + public Checker tryCreate(String pattern) { + Matcher matcher = NONE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new NoneChecker(pattern); } + return null; } } - @Override - public int getOutputColumn() { - return -1; - } - - @Override - public String getOutputType() { - return "boolean"; + private static class LikeComplexCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + return new ComplexChecker(UDFLike.likePatternToRegExp(pattern)); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java new file mode 100644 index 0000000..be624b8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java @@ -0,0 +1,225 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.io.Text; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Evaluate REGEXP filter on a batch for a vector of strings. + */ +public class FilterStringColRegExpStringScalar extends AbstractFilterStringColLikeStringScalar { + private static final String LITERAL_CHAR = "[^\\[\\]\\\\(){}*?+|$^.]"; + private static final String LITERAL_CHAR_OR_DOT = "[^\\[\\]\\\\(){}*?+|$^]"; + private static final String LITERAL_CHAR_GROUP = "(" + LITERAL_CHAR + "+)"; + + private static List checkerFactories = Arrays.asList( + new RegExpBeginCheckerFactory(), + new RegExpEndCheckerFactory(), + new RegExpMiddleCheckerFactory(), + new RegExpAnyCharCheckerFactory(), + new PhoneNumberCheckerFactory(), + new RegExpNoneCheckerFactory(), + new RegExpComplexCheckerFactory()); + + public FilterStringColRegExpStringScalar(int colNum, Text regExpPattern) { + super(colNum, regExpPattern); + } + + @Override + protected List getCheckerFactories() { + return checkerFactories; + } + + private static class RegExpBeginCheckerFactory implements CheckerFactory { + private static final Pattern BEGIN_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP + "\\.\\*"); + + public Checker tryCreate(String pattern) { + Matcher matcher = BEGIN_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new BeginChecker(matcher.group(1)); + } + return null; + } + } + + private static class RegExpEndCheckerFactory implements CheckerFactory { + private static final Pattern END_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP); + + public Checker tryCreate(String pattern) { + Matcher matcher = END_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new EndChecker(matcher.group(1)); + } + return null; + } + } + + private static class RegExpMiddleCheckerFactory implements CheckerFactory { + private static final Pattern MIDDLE_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP + "\\.\\*"); + + public Checker tryCreate(String pattern) { + Matcher matcher = MIDDLE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new MiddleChecker(matcher.group(1)); + } + return null; + } + } + + private static class PhoneNumberChecker implements Checker { + private boolean twoGroups; + private boolean threeGroups; + + public PhoneNumberChecker(boolean twoGroups, boolean threeGroups) { + this.twoGroups = twoGroups; + this.threeGroups = threeGroups; + } + + public boolean check(byte[] byteS, int start, int len) { + int end = start + len; + int state = 0; + for (int i = start; i < end; i++) { + char c = (char) byteS[i]; + /** + * + state 0 + * V + * + input [0-9] + * V + * + state 1 <-+ + * V | + * +-----------+ input [0-9] + * V + * + input - + * V + * + state 2 + * V + * + input [0-9] + * V + * + state 3 <-+ + * V | + * +-----------+ input [0-9] + * V + * + input - + * V + * + state 4 + * V + * + input [0-9] + * V + * + state 5 <-+ + * V | + * +-----------+ input [0-9] + */ + switch (state) { + + // [0-9]: next + // other: false + case 0: + case 2: + case 4: + if ('0' <= c && c <= '9') { + state++; + } else { + return false; + } + break; + + // [0-9]: self + // -: next + // other: false + case 1: + case 3: + if ('0' <= c && c <= '9') { + // self + } else if (c == '-') { + state++; + } else { + return false; + } + break; + + // [0-9]: self + // other: false + case 5: + if ('0' <= c && c <= '9') { + // self + } else { + return false; + } + } + } + if (twoGroups && state == 3) { + return true; + } + if (threeGroups && state == 5) { + return true; + } + return false; + } + } + + private static class PhoneNumberCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + if (pattern.equals("[0-9]+-[0-9]+")) { + return new PhoneNumberChecker(true, false); + } + if (pattern.equals("[0-9]+-[0-9]+-[0-9]+")) { + return new PhoneNumberChecker(false, true); + } + if (pattern.equals("([0-9]+-)?[0-9]+-[0-9]+")) { + return new PhoneNumberChecker(true, true); + } + return null; + } + } + + private static class RegExpAnyCharCheckerFactory implements CheckerFactory { + private static final Pattern ANY_CHAR_PATTERN = Pattern.compile(LITERAL_CHAR + "*\\." + LITERAL_CHAR_OR_DOT + "*"); + + public Checker tryCreate(String pattern) { + Matcher matcher = ANY_CHAR_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new AnyCharChecker('.', pattern); + } + return null; + } + } + + private static class RegExpNoneCheckerFactory implements CheckerFactory { + private static final Pattern NONE_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP); + + public Checker tryCreate(String pattern) { + Matcher matcher = NONE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new NoneChecker(pattern); + } + return null; + } + } + + private static class RegExpComplexCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + return new ComplexChecker(pattern); + } + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index 973a155..cf4d876 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -51,7 +51,9 @@ private static byte[] mixedUpLower; private static byte[] mixedUpUpper; private static byte[] multiByte; - private static byte[] mixPercentPattern; + private static byte[] shortPhoneNumber; + private static byte[] longPhoneNumber; + private static byte[] incompletePhoneNumber; static { try { @@ -66,8 +68,10 @@ mixedUp = "mixedUp".getBytes("UTF-8"); mixedUpLower = "mixedup".getBytes("UTF-8"); mixedUpUpper = "MIXEDUP".getBytes("UTF-8"); - mixPercentPattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE multiByte = new byte[100]; + shortPhoneNumber = "0123-4567".getBytes("UTF-8"); + longPhoneNumber = "012-3456-7890".getBytes("UTF-8"); + incompletePhoneNumber = "012-3456-".getBytes("UTF-8"); addMultiByteChars(multiByte); } catch (UnsupportedEncodingException e) { e.printStackTrace(); @@ -577,7 +581,7 @@ public void testStringLike() { Text pattern; int initialBatchSize; batch = makeStringBatchMixedCharSize(); - pattern = new Text(mixPercentPattern); + pattern = new Text("mix%"); FilterStringColLikeStringScalar expr = new FilterStringColLikeStringScalar(0, pattern); expr.evaluate(batch); @@ -623,50 +627,234 @@ public void testStringLike() { Assert.assertEquals(initialBatchSize, batch.size); } + @Test public void testStringLikePatternType() { FilterStringColLikeStringScalar expr; + VectorizedRowBatch batch; + + // BeginChecker + expr = new FilterStringColLikeStringScalar(0, new Text("mixed%")); + Assert.assertEquals("BeginChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // EndChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%Up")); + Assert.assertEquals("EndChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // MiddleChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%xed%")); + Assert.assertEquals("MiddleChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // AnyCharChecker + expr = new FilterStringColLikeStringScalar(0, new Text("m_x_d_p")); + Assert.assertEquals("AnyCharChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // ComplexChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%ix%Up")); + Assert.assertEquals("ComplexChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); - // BEGIN pattern - expr = new FilterStringColLikeStringScalar(0, new Text("abc%")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.BEGIN, - expr.getType()); - - // END pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.END, - expr.getType()); - - // MIDDLE pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc%")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.MIDDLE, - expr.getType()); - - // COMPLEX pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc%de")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.COMPLEX, - expr.getType()); - - // NONE pattern - expr = new FilterStringColLikeStringScalar(0, new Text("abc")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.NONE, - expr.getType()); + // NoneChecker + expr= new FilterStringColLikeStringScalar(0, new Text("mixedUp")); + Assert.assertEquals("NoneChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); } - public void testStringLikeMultiByte() { + @Test + public void testStringLikeMultiByte() throws UnsupportedEncodingException { FilterStringColLikeStringScalar expr; VectorizedRowBatch batch; + byte[] percentBytes = "%".getBytes("UTF-8"); + byte[] xBytes = "x".getBytes("UTF-8"); // verify that a multi byte LIKE expression matches a matching string batch = makeStringBatchMixedCharSize(); - expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "%")); + Text text = new Text(); + text.append(percentBytes, 0, percentBytes.length); + text.append(multiByte, 0, 10); + text.append(percentBytes, 0, percentBytes.length); + expr = new FilterStringColLikeStringScalar(0, text); expr.evaluate(batch); - Assert.assertEquals(batch.size, 1); + Assert.assertEquals(1, batch.size); // verify that a multi byte LIKE expression doesn't match a non-matching string + text.clear(); + text.append(percentBytes, 0, percentBytes.length); + text.append(multiByte, 0, 10); + text.append(xBytes, 0, xBytes.length); batch = makeStringBatchMixedCharSize(); - expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "x")); + expr = new FilterStringColLikeStringScalar(0, text); expr.evaluate(batch); - Assert.assertEquals(batch.size, 0); + Assert.assertEquals(0, batch.size); + } + + @Test + public void testStringRegExp() { + + // has nulls, not repeating + VectorizedRowBatch batch; + Text pattern; + int initialBatchSize; + batch = makeStringBatchMixedCharSize(); + pattern = new Text("mix.*"); + FilterStringColRegExpStringScalar expr = new FilterStringColRegExpStringScalar(0, pattern); + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // no nulls, not repeating + batch = makeStringBatchMixedCharSize(); + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // has nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + + // same, but repeating value is null + batch = makeStringBatchMixedCharSize(); + batch.cols[0].isRepeating = true; + batch.cols[0].isNull[0] = true; + expr.evaluate(batch); + + // no rows qualify + Assert.assertEquals(0, batch.size); + + // no nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + } + + @Test + public void testStringRegExpPatternType() { + FilterStringColRegExpStringScalar expr; + VectorizedRowBatch batch; + + // BeginChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("mixed.*")); + Assert.assertEquals("BeginChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // EndChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*Up")); + Assert.assertEquals("EndChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // MiddleChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*xed.*")); + Assert.assertEquals("MiddleChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // AnyCharChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("m.x.d.p")); + Assert.assertEquals("AnyCharChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // ComplexChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*ix.*Up")); + Assert.assertEquals("ComplexChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // PhoneNumberChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("([0-9]+-)?[0-9]+-[0-9]+")); + Assert.assertEquals("PhoneNumberChecker", expr.getCheckerName()); + batch = makeStringBatchPhoneNumber(); + expr.evaluate(batch); + Assert.assertEquals(2, batch.size); + Assert.assertEquals(0, batch.selected[0]); + Assert.assertEquals(1, batch.selected[1]); + + // NoneChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("mixedUp")); + Assert.assertEquals("NoneChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + } + + private VectorizedRowBatch makeStringBatchPhoneNumber() { + + // create a new batch with one char column (for input) and one long column (for output) + VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); + BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = v; + LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[1] = outV; + + /* + * Add these 3 values: + * + * mixedUp + * green + * NULL + * <4 char string with multi-byte chars> + */ + v.setRef(0, shortPhoneNumber, 0, shortPhoneNumber.length); + v.isNull[0] = false; + v.setRef(1, longPhoneNumber, 0, longPhoneNumber.length); + v.isNull[1] = false; + v.setRef(2, incompletePhoneNumber, 0, incompletePhoneNumber.length); + v.isNull[2] = true; + v.noNulls = false; + v.setRef(3, green, 0, green.length); + v.isNull[3] = false; + + batch.size = 4; + return batch; } @Test