diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java new file mode 100644 index 0000000..47ecf79 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java @@ -0,0 +1,414 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.io.Text; + +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * An abstract class for LIKE and REGEXP expressions. LIKE and REGEXP expression share similar + * functions, but they have different grammars. AbstractFilterStringColLikeStringScalar class + * provides shared classes and methods. Each subclass handles its grammar. + */ +public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpression { + private static final long serialVersionUID = 1L; + + private int colNum; + private String pattern; + transient Checker checker; + + public AbstractFilterStringColLikeStringScalar() { + super(); + } + + public AbstractFilterStringColLikeStringScalar(int colNum, Text pattern) { + super(); + setColNum(colNum); + setPattern(pattern.toString()); + } + + protected abstract List getCheckerFactories(); + + /** + * Selects an optimized checker for a given string. + * @param pattern + * @return + */ + private Checker createChecker(String pattern) { + for (CheckerFactory checkerFactory : getCheckerFactories()) { + Checker checker = checkerFactory.tryCreate(pattern); + if (checker != null) { + return checker; + } + } + return null; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + checker = createChecker(pattern); + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; + int[] sel = batch.selected; + boolean[] nullPos = inputColVector.isNull; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int[] length = inputColVector.length; + int[] start = inputColVector.start; + + // return immediately if batch is empty + if (n == 0) { + return; + } + + if (inputColVector.noNulls) { + if (inputColVector.isRepeating) { + + // All must be selected otherwise size would be zero Repeating property will not change. + if (!checker.check(vector[0], start[0], length[0])) { + + // Entire batch is filtered out. + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + + batch.size = newSize; + } else { + int newSize = 0; + for (int i = 0; i != n; i++) { + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } else { + if (inputColVector.isRepeating) { + + //All must be selected otherwise size would be zero. Repeating property will not change. + if (!nullPos[0]) { + if (!checker.check(vector[0], start[0], length[0])) { + + //Entire batch is filtered out. + batch.size = 0; + } + } else { + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + + //Change the selected vector + batch.size = newSize; + } else { + int newSize = 0; + + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (checker.check(vector[i], start[i], length[i])) { + sel[newSize++] = i; + } + } + } + + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + + /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline + * future operations. So selectedInUse will remain false. + */ + } + } + } + + @Override + public int getOutputColumn() { + return -1; + } + + @Override + public String getOutputType() { + return "boolean"; + } + + /** + * A Checker contains a pattern and checks whether a given string matches or not. + */ + public interface Checker { + /** + * Checks whether the given string matches with its pattern. + * @param byteS The byte array that contains the string + * @param start The start position of the string + * @param len The length of the string + * @return Whether it matches or not. + */ + boolean check(byte[] byteS, int start, int len); + } + + /** + * A CheckerFactory creates checkers of its kind. + */ + protected interface CheckerFactory { + /** + * If the given pattern is acceptable for its checker class, it creates and returns a checker. + * Otherwise, it returns null. + * @param pattern + * @return If the pattern is acceptable, a Checker object. Otherwise + * null. + */ + Checker tryCreate(String pattern); + } + + /** + * Matches the whole string to its pattern. + */ + protected static class NoneChecker implements Checker { + byte [] byteSub; + + NoneChecker(String pattern) { + try { + byteSub = pattern.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + public boolean check(byte[] byteS, int start, int len) { + int lenSub = byteSub.length; + if (len != lenSub) { + return false; + } + for (int i = start, j = 0; j < len; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + } + + /** + * Matches the beginning of each string to a pattern. + */ + protected static class BeginChecker implements Checker { + byte[] byteSub; + + BeginChecker(String pattern) { + try { + byteSub = pattern.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + public boolean check(byte[] byteS, int start, int len) { + if (len < byteSub.length) { + return false; + } + for (int i = start, j = 0; j < byteSub.length; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + } + + /** + * Matches the ending of each string to its pattern. + */ + protected static class EndChecker implements Checker { + byte[] byteSub; + + EndChecker(String pattern) { + try { + byteSub = pattern.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + public boolean check(byte[] byteS, int start, int len) { + int lenSub = byteSub.length; + if (len < lenSub) { + return false; + } + for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) { + if (byteS[i] != byteSub[j]) { + return false; + } + } + return true; + } + } + + /** + * Matches the middle of each string to its pattern. + */ + protected static class MiddleChecker implements Checker { + byte[] byteSub; + int lenSub; + + MiddleChecker(String pattern) { + try { + byteSub = pattern.getBytes("UTF-8"); + lenSub = byteSub.length; + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + public boolean check(byte[] byteS, int start, int len) { + if (len < lenSub) { + return false; + } + int end = start + len - lenSub + 1; + boolean match = false; + for (int i = start; i < end; i++) { + match = true; + for (int j = 0; j < lenSub; j++) { + if (byteS[i + j] != byteSub[j]) { + match = false; + break; + } + } + if (match) { + return true; + } + } + return match; + } + } + + /** + * Matches each string to a pattern with Java regular expression package. + */ + protected static class ComplexChecker implements Checker { + Pattern compiledPattern; + Matcher matcher; + FastUTF8Decoder decoder; + + ComplexChecker(String pattern) { + compiledPattern = Pattern.compile(pattern); + matcher = compiledPattern.matcher(""); + decoder = new FastUTF8Decoder(); + } + + public boolean check(byte[] byteS, int start, int len) { + // Match the given bytes with the like pattern + matcher.reset(decoder.decodeUnsafely(byteS, start, len)); + return matcher.matches(); + } + } + + /** + * A fast UTF-8 decoder that caches necessary objects for decoding. + */ + protected static class FastUTF8Decoder { + CharsetDecoder decoder; + ByteBuffer byteBuffer; + CharBuffer charBuffer; + + FastUTF8Decoder() { + decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + byteBuffer = ByteBuffer.allocate(4); + charBuffer = CharBuffer.allocate(4); + } + + CharBuffer decodeUnsafely(byte[] byteS, int start, int len) { + // Prepare buffers + if (byteBuffer.capacity() < len) { + byteBuffer = ByteBuffer.allocate(len * 2); + } + byteBuffer.clear(); + byteBuffer.put(byteS, start, len); + byteBuffer.flip(); + + int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte()); + if (charBuffer.capacity() < maxChars) { + charBuffer = CharBuffer.allocate(maxChars); + } + charBuffer.clear(); + + // Decode UTF-8 + decoder.reset(); + decoder.decode(byteBuffer, charBuffer, true); + decoder.flush(charBuffer); + charBuffer.flip(); + + return charBuffer; + } + } + + public int getColNum() { + return colNum; + } + + public void setColNum(int colNum) { + this.colNum = colNum; + } + + public String getPattern() { + return pattern; + } + + public void setPattern(String pattern) { + this.pattern = pattern; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java index f26c211..c17a51b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java @@ -18,552 +18,106 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; -import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp; +import org.apache.hadoop.hive.ql.udf.UDFLike; +import org.apache.hadoop.io.Text; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.io.Text; - /** * Evaluate LIKE filter on a batch for a vector of strings. */ -public class FilterStringColLikeStringScalar extends VectorExpression { +public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar { private static final long serialVersionUID = 1L; - private int colNum; - private Pattern compiledPattern; - private PatternType type = PatternType.NONE; - private String simpleStringPattern; - private final Text simplePattern = new Text(); - - private transient ByteBuffer byteBuffer; - private transient CharBuffer charBuffer; - private transient final CharsetDecoder decoder; - - // Doing characters comparison directly instead of regular expression - // matching for simple patterns like "%abc%". - public enum PatternType { - NONE, // "abc" - BEGIN, // "abc%" - END, // "%abc" - MIDDLE, // "%abc%" - COMPLEX, // all other cases, such as "ab%c_de" - } - public FilterStringColLikeStringScalar(int colNum, Text likePattern) { - this(); - this.colNum = colNum; - String stringLikePattern = likePattern.toString(); - parseSimplePattern(stringLikePattern); - if (type == PatternType.COMPLEX) { - compiledPattern = Pattern.compile(likePatternToRegExp(stringLikePattern)); - } - } + private transient final static List CHECKER_FACTORIES = Arrays.asList( + new BeginCheckerFactory(), + new EndCheckerFactory(), + new MiddleCheckerFactory(), + new NoneCheckerFactory(), + new ComplexCheckerFactory()); public FilterStringColLikeStringScalar() { super(); - decoder = Charset.forName("UTF-8").newDecoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - byteBuffer = ByteBuffer.allocate(4); - charBuffer = CharBuffer.allocate(4); } - public PatternType getType() { - return type; + public FilterStringColLikeStringScalar(int colNum, Text likePattern) { + super(colNum, likePattern); } - private boolean like(byte[] bytes, int start, int len) { - switch (type) { - case NONE: - return noneLike(bytes, start, len, simplePattern.getBytes()); - case BEGIN: - return beginLike(bytes, start, len, simplePattern.getBytes()); - case END: - return endLike(bytes, start, len, simplePattern.getBytes()); - case MIDDLE: - return midLike(bytes, start, len, simplePattern.getBytes()); - case COMPLEX: - return complexLike(bytes, start, len); - default: - return false; - } + @Override + protected List getCheckerFactories() { + return CHECKER_FACTORIES; } - private static boolean noneLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len != lenSub) { - return false; - } - for (int i = start, j = 0; j < len; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; - } - } - return true; - } + /** + * Accepts simple LIKE patterns like "abc%" and creates corresponding checkers. + */ + private static class BeginCheckerFactory implements CheckerFactory { + private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%"); - private static boolean beginLike(byte[] byteS, int start, int len, byte[] byteSub) { - if (len < byteSub.length) { - return false; - } - for (int i = start, j = 0; j < byteSub.length; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; + public Checker tryCreate(String pattern) { + Matcher matcher = BEGIN_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new BeginChecker(matcher.group(1)); } + return null; } - return true; } - private static boolean endLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len < lenSub) { - return false; - } - for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; - } - } - return true; - } + /** + * Accepts simple LIKE patterns like "%abc" and creates a corresponding checkers. + */ + private static class EndCheckerFactory implements CheckerFactory { + private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)"); - private static boolean midLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len < lenSub) { - return false; - } - int end = start + len - lenSub + 1; - boolean match = false; - for (int i = start; (i < end) && (!match); i++) { - match = true; - for (int j = 0; j < lenSub; j++) { - if (byteS[i + j] != byteSub[j]) { - match = false; - break; - } + public Checker tryCreate(String pattern) { + Matcher matcher = END_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new EndChecker(matcher.group(1)); } + return null; } - return match; } /** - * Matches the byte array against the complex like pattern. This method uses - * {@link #compiledPattern} to match. For decoding performance, it caches - * {@link #compiledPattern}, {@link #byteBuffer} and {@link #charBuffer}. - * When the length to decode is greater than the capacity of - * {@link #byteBuffer}, it creates new {@link #byteBuffer} and - * {@link #charBuffer}. The capacity of the new {@link #byteBuffer} is the - * double of the length, for fewer object creations and higher memory - * utilization. - * - * @param byteS - * A byte array that contains a UTF-8 string. - * @param start - * A position to start decoding. - * @param len - * A length to decode. - * @return - * true if the byte array matches the complex like pattern, - * otherwise false. + * Accepts simple LIKE patterns like "%abc%" and creates a corresponding checkers. */ - private boolean complexLike(byte[] byteS, int start, int len) { - // Prepare buffers - if (byteBuffer.capacity() < len) { - byteBuffer = ByteBuffer.allocate(len * 2); - } - byteBuffer.clear(); - byteBuffer.put(byteS, start, len); - byteBuffer.flip(); + private static class MiddleCheckerFactory implements CheckerFactory { + private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%"); - int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte()); - if (charBuffer.capacity() < maxChars) { - charBuffer = CharBuffer.allocate(maxChars); + public Checker tryCreate(String pattern) { + Matcher matcher = MIDDLE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new MiddleChecker(matcher.group(1)); + } + return null; } - charBuffer.clear(); - - // Decode UTF-8 - decoder.reset(); - decoder.decode(byteBuffer, charBuffer, true); - decoder.flush(charBuffer); - charBuffer.flip(); - - // Match the given bytes with the like pattern - return compiledPattern.matcher(charBuffer).matches(); } /** - * Parses the likePattern. Based on it is a simple pattern or not, the - * function might change two member variables. {@link #type} will be changed - * to the corresponding pattern type; {@link #simplePattern} will record the - * string in it for later pattern matching if it is a simple pattern. - *

- * Examples:

- * - *
-   * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
-   * and changes {@link #simplePattern} to "abc"
-   * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
-   * and does not change {@link #simplePattern}
-   * 
- * - *
- * - * @param likePattern - * the input LIKE query pattern + * Accepts simple LIKE patterns like "abc" and creates corresponding checkers. */ - private void parseSimplePattern(String likePattern) { - int length = likePattern.length(); - int beginIndex = 0; - int endIndex = length; - char lastChar = 'a'; - String strPattern = new String(); - type = PatternType.NONE; + private static class NoneCheckerFactory implements CheckerFactory { + private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+"); - for (int i = 0; i < length; i++) { - char n = likePattern.charAt(i); - if (n == '_') { // such as "a_b" - if (lastChar != '\\') { // such as "a%bc" - type = PatternType.COMPLEX; - return; - } else { // such as "abc\%de%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - } - } else if (n == '%') { - if (i == 0) { // such as "%abc" - type = PatternType.END; - beginIndex = 1; - } else if (i < length - 1) { - if (lastChar != '\\') { // such as "a%bc" - type = PatternType.COMPLEX; - return; - } else { // such as "abc\%de%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - } - } else { - if (lastChar != '\\') { - endIndex = length - 1; - if (type == PatternType.END) { // such as "%abc%" - type = PatternType.MIDDLE; - } else { - type = PatternType.BEGIN; // such as "abc%" - } - } else { // such as "abc\%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - endIndex = length; - } - } + public Checker tryCreate(String pattern) { + Matcher matcher = NONE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new NoneChecker(pattern); } - lastChar = n; + return null; } - - strPattern += likePattern.substring(beginIndex, endIndex); - simpleStringPattern = strPattern; - simplePattern.set(simpleStringPattern); } - @Override - public void evaluate(VectorizedRowBatch batch) { - - if (childExpressions != null) { - super.evaluateChildren(batch); - } - - BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; - int[] sel = batch.selected; - boolean[] nullPos = inputColVector.isNull; - int n = batch.size; - byte[][] vector = inputColVector.vector; - int[] length = inputColVector.length; - int[] start = inputColVector.start; - byte[] simplePatternBytes = simplePattern.getBytes(); - - // return immediately if batch is empty - if (n == 0) { - return; - } - - if (inputColVector.noNulls) { - if (inputColVector.isRepeating) { - - // All must be selected otherwise size would be zero Repeating property will not change. - if (!like(vector[0], start[0], length[0])) { - - // Entire batch is filtered out. - batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - - switch (type) { - case NONE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case BEGIN: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case END: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case MIDDLE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case COMPLEX: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - break; - } - - batch.size = newSize; - } else { - int newSize = 0; - - switch (type) { - case NONE: - for (int i = 0; i != n; i++) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case BEGIN: - for (int i = 0; i != n; i++) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case END: - for (int i = 0; i != n; i++) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case MIDDLE: - for (int i = 0; i != n; i++) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case COMPLEX: - for (int i = 0; i != n; i++) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - break; - } - - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } - } - } else { - if (inputColVector.isRepeating) { - - //All must be selected otherwise size would be zero. Repeating property will not change. - if (!nullPos[0]) { - if (!like(vector[0], start[0], length[0])) { - - //Entire batch is filtered out. - batch.size = 0; - } - } else { - batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - - switch (type) { - case NONE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case BEGIN: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case END: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case MIDDLE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case COMPLEX: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - break; - } - - //Change the selected vector - batch.size = newSize; - } else { - int newSize = 0; - - switch (type) { - case NONE: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case BEGIN: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case END: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case MIDDLE: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case COMPLEX: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - break; - } - - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } - - /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline - * future operations. So selectedInUse will remain false. - */ - } + /** + * Accepts any LIKE patterns and creates corresponding checkers. + */ + private static class ComplexCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + return new ComplexChecker(UDFLike.likePatternToRegExp(pattern)); } } - - @Override - public int getOutputColumn() { - return -1; - } - - @Override - public String getOutputType() { - return "boolean"; - } - - public int getColNum() { - return colNum; - } - - public void setColNum(int colNum) { - this.colNum = colNum; - } - - public Pattern getCompiledPattern() { - return compiledPattern; - } - - public void setCompiledPattern(Pattern compiledPattern) { - this.compiledPattern = compiledPattern; - } - - public void setType(PatternType type) { - this.type = type; - } - - public String getSimpleStringPattern() { - return simpleStringPattern; - } - - public void setSimpleStringPattern(String simpleStringPattern) { - this.simpleStringPattern = simpleStringPattern; - simplePattern.set(simpleStringPattern); - } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java new file mode 100644 index 0000000..d9e2cad --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java @@ -0,0 +1,182 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.io.Text; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Evaluate REGEXP filter on a batch for a vector of strings. + */ +public class FilterStringColRegExpStringScalar extends AbstractFilterStringColLikeStringScalar { + private static final long serialVersionUID = 1L; + + private static final String LITERAL_CHAR = "[^\\[\\]\\\\(){}*?+|$^.]"; + private static final String LITERAL_CHAR_GROUP = "(" + LITERAL_CHAR + "+)"; + + private transient final static List CHECKER_FACTORIES = Arrays.asList( + new BeginCheckerFactory(), + new EndCheckerFactory(), + new MiddleCheckerFactory(), + new PhoneNumberCheckerFactory(), + new NoneCheckerFactory(), + new ComplexCheckerFactory()); + + public FilterStringColRegExpStringScalar() { + super(); + } + + public FilterStringColRegExpStringScalar(int colNum, Text regExpPattern) { + super(colNum, regExpPattern); + } + + @Override + protected List getCheckerFactories() { + return CHECKER_FACTORIES; + } + + /** + * Accepts simple REGEXP patterns like "abc.*" and creates corresponding checkers. + */ + private static class BeginCheckerFactory implements CheckerFactory { + private static final Pattern BEGIN_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP + "\\.\\*"); + + public Checker tryCreate(String pattern) { + Matcher matcher = BEGIN_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new BeginChecker(matcher.group(1)); + } + return null; + } + } + + /** + * Accepts simple REGEXP patterns like ".*abc" and creates corresponding checkers. + */ + private static class EndCheckerFactory implements CheckerFactory { + private static final Pattern END_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP); + + public Checker tryCreate(String pattern) { + Matcher matcher = END_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new EndChecker(matcher.group(1)); + } + return null; + } + } + + /** + * Accepts simple REGEXP patterns like ".*abc.*" and creates corresponding checkers. + */ + private static class MiddleCheckerFactory implements CheckerFactory { + private static final Pattern MIDDLE_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP + "\\.\\*"); + + public Checker tryCreate(String pattern) { + Matcher matcher = MIDDLE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new MiddleChecker(matcher.group(1)); + } + return null; + } + } + + /** + * Accepts simple phone number regular expressions consisted only with "\(", "\)", "-", " ", "\d". + * For example, it accepts "(\d\d\d) \d\d\d-\d\d\d\d" then matches "(012) 345-6789". + */ + protected static class PhoneNumberChecker implements Checker { + byte[] byteSub; + + PhoneNumberChecker(String pattern) { + this.byteSub = pattern.getBytes(); + } + + public boolean check(byte[] byteS, int start, int len) { + for (int i = 0; i < len; i++) { + byte c = byteS[start + i]; + byte p = byteSub[i]; + switch (p) { + // For pattern 'd', find digits. + case 'd': + if (!('0' <= c && c <= '9')) { + return false; + } + break; + + // For other registered patterns, find exact matches. + case '-': + case ' ': + case '(': + case ')': + if (c != p) { + return false; + } + break; + + // For unregistered patterns, fail. + default: + return false; + } + } + return true; + } + } + + /** + * Accepts phone number REGEXP patterns like "\(\d\d\d\) \d\d\d-\d\d\d\d" and creates + * corresponding checkers. + */ + private static class PhoneNumberCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + if (pattern.matches("(\\\\d|\\\\\\(|\\\\\\)|-| )+")) { + return new PhoneNumberChecker( + pattern.replaceAll("\\\\d", "d").replaceAll("\\\\\\(", "(").replaceAll("\\\\\\)", ")")); + } + return null; + } + } + + /** + * Accepts simple REGEXP patterns like "abc" and creates corresponding checkers. + */ + private static class NoneCheckerFactory implements CheckerFactory { + private static final Pattern NONE_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP); + + public Checker tryCreate(String pattern) { + Matcher matcher = NONE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new NoneChecker(matcher.group(1)); + } + return null; + } + } + + /** + * Accepts any REGEXP patterns and creates corresponding checkers. + */ + private static class ComplexCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + return new ComplexChecker(pattern); + } + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index 1d44abf..59d098f 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -54,11 +54,13 @@ private static byte[] mixedUpLower; private static byte[] mixedUpUpper; private static byte[] multiByte; - private static byte[] mixPercentPattern; private static byte[] blanksLeft; private static byte[] blanksRight; private static byte[] blanksBoth; private static byte[] blankString; + private static byte[] shortPhoneNumber; + private static byte[] longPhoneNumber; + private static byte[] incompletePhoneNumber; static { try { @@ -73,8 +75,10 @@ mixedUp = "mixedUp".getBytes("UTF-8"); mixedUpLower = "mixedup".getBytes("UTF-8"); mixedUpUpper = "MIXEDUP".getBytes("UTF-8"); - mixPercentPattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE multiByte = new byte[100]; + shortPhoneNumber = "0123-4567".getBytes("UTF-8"); + longPhoneNumber = "(012) 345-6789".getBytes("UTF-8"); + incompletePhoneNumber = "012-3456-".getBytes("UTF-8"); addMultiByteChars(multiByte); blanksLeft = " foo".getBytes("UTF-8"); blanksRight = "foo ".getBytes("UTF-8"); @@ -619,7 +623,7 @@ public void testStringLike() { Text pattern; int initialBatchSize; batch = makeStringBatchMixedCharSize(); - pattern = new Text(mixPercentPattern); + pattern = new Text("mix%"); FilterStringColLikeStringScalar expr = new FilterStringColLikeStringScalar(0, pattern); expr.evaluate(batch); @@ -665,50 +669,231 @@ public void testStringLike() { Assert.assertEquals(initialBatchSize, batch.size); } + @Test public void testStringLikePatternType() { FilterStringColLikeStringScalar expr; + VectorizedRowBatch batch; + + // BeginChecker + expr = new FilterStringColLikeStringScalar(0, new Text("mixed%")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.BeginChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // EndChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%Up")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.EndChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // MiddleChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%xed%")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.MiddleChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); - // BEGIN pattern - expr = new FilterStringColLikeStringScalar(0, new Text("abc%")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.BEGIN, - expr.getType()); - - // END pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.END, - expr.getType()); - - // MIDDLE pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc%")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.MIDDLE, - expr.getType()); - - // COMPLEX pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc%de")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.COMPLEX, - expr.getType()); - - // NONE pattern - expr = new FilterStringColLikeStringScalar(0, new Text("abc")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.NONE, - expr.getType()); + // ComplexChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%ix%Up")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.ComplexChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // NoneChecker + expr= new FilterStringColLikeStringScalar(0, new Text("mixedUp")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.NoneChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); } - public void testStringLikeMultiByte() { + @Test + public void testStringLikeMultiByte() throws UnsupportedEncodingException { FilterStringColLikeStringScalar expr; VectorizedRowBatch batch; + byte[] percentBytes = "%".getBytes("UTF-8"); + byte[] xBytes = "x".getBytes("UTF-8"); // verify that a multi byte LIKE expression matches a matching string batch = makeStringBatchMixedCharSize(); - expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "%")); + Text text = new Text(); + text.append(percentBytes, 0, percentBytes.length); + text.append(multiByte, 0, 10); + text.append(percentBytes, 0, percentBytes.length); + expr = new FilterStringColLikeStringScalar(0, text); expr.evaluate(batch); - Assert.assertEquals(batch.size, 1); + Assert.assertEquals(1, batch.size); // verify that a multi byte LIKE expression doesn't match a non-matching string + text.clear(); + text.append(percentBytes, 0, percentBytes.length); + text.append(multiByte, 0, 10); + text.append(xBytes, 0, xBytes.length); + batch = makeStringBatchMixedCharSize(); + expr = new FilterStringColLikeStringScalar(0, text); + expr.evaluate(batch); + Assert.assertEquals(0, batch.size); + } + + @Test + public void testStringRegExp() { + + // has nulls, not repeating + VectorizedRowBatch batch; + Text pattern; + int initialBatchSize; + batch = makeStringBatchMixedCharSize(); + pattern = new Text("mix.*"); + FilterStringColRegExpStringScalar expr = new FilterStringColRegExpStringScalar(0, pattern); + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // no nulls, not repeating + batch = makeStringBatchMixedCharSize(); + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // has nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + + // same, but repeating value is null batch = makeStringBatchMixedCharSize(); - expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "x")); + batch.cols[0].isRepeating = true; + batch.cols[0].isNull[0] = true; expr.evaluate(batch); - Assert.assertEquals(batch.size, 0); + + // no rows qualify + Assert.assertEquals(0, batch.size); + + // no nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + } + + @Test + public void testStringRegExpPatternType() { + FilterStringColRegExpStringScalar expr; + VectorizedRowBatch batch; + + // BeginChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("mixed.*")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.BeginChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // EndChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*Up")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.EndChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // MiddleChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*xed.*")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.MiddleChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // ComplexChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*ix.*Up")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.ComplexChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // PhoneNumberChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("\\(\\d\\d\\d\\) \\d\\d\\d-\\d\\d\\d\\d")); + Assert.assertEquals( + FilterStringColRegExpStringScalar.PhoneNumberChecker.class, + expr.checker.getClass()); + batch = makeStringBatchPhoneNumber(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(1, batch.selected[0]); + + // NoneChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("mixedUp")); + Assert.assertEquals( + AbstractFilterStringColLikeStringScalar.NoneChecker.class, + expr.checker.getClass()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + } + + private VectorizedRowBatch makeStringBatchPhoneNumber() { + + // create a new batch with one char column (for input) and one long column (for output) + VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); + BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = v; + LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[1] = outV; + + v.setRef(0, shortPhoneNumber, 0, shortPhoneNumber.length); + v.isNull[0] = false; + v.setRef(1, longPhoneNumber, 0, longPhoneNumber.length); + v.isNull[1] = false; + v.setRef(2, incompletePhoneNumber, 0, incompletePhoneNumber.length); + v.isNull[2] = true; + v.noNulls = false; + v.setRef(3, green, 0, green.length); + v.isNull[3] = false; + + batch.size = 4; + return batch; } @Test