null.
@@ -210,16 +214,10 @@ public String getOutputType() {
/**
* Matches the whole string to its pattern.
*/
- protected static class NoneChecker implements Checker {
- byte [] byteSub;
-
- public NoneChecker(String pattern) {
- try {
- byteSub = pattern.getBytes("UTF-8");
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(e);
- }
- }
+ public static class NoneChecker implements Checker {
+ private static final long serialVersionUID = 1L;
+ private String pattern;
+ private transient byte [] byteSub;
public boolean check(byte[] byteS, int start, int len) {
int lenSub = byteSub.length;
@@ -233,15 +231,9 @@ public boolean check(byte[] byteS, int start, int len) {
}
return true;
}
- }
- /**
- * Matches the beginning of each string to a pattern.
- */
- protected static class BeginChecker implements Checker {
- byte [] byteSub;
-
- public BeginChecker(String pattern) {
+ public void setPattern(String pattern) {
+ this.pattern = pattern;
try {
byteSub = pattern.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
@@ -249,6 +241,19 @@ public BeginChecker(String pattern) {
}
}
+ /**
+ * Returns the pattern string most recently stored by {@code setPattern}.
+ *
+ * @return the stored pattern, or {@code null} if none has been set
+ */
+ @Override
+ public String getPattern() {
+ return pattern;
+ }
+ }
+
+ /**
+ * Matches the beginning of each string to a pattern.
+ */
+ public static class BeginChecker implements Checker {
+ private static final long serialVersionUID = 1L;
+ private String pattern;
+ private transient byte[] byteSub;
+
public boolean check(byte[] byteS, int start, int len) {
if (len < byteSub.length) {
return false;
@@ -260,15 +265,33 @@ public boolean check(byte[] byteS, int start, int len) {
}
return true;
}
+
+ /**
+ * Stores the pattern and caches its UTF-8 encoding in {@code byteSub}.
+ *
+ * @param pattern the literal prefix to look for
+ * @throws RuntimeException wrapping the UnsupportedEncodingException if
+ * the "UTF-8" charset name is rejected
+ */
+ @Override
+ public void setPattern(String pattern) {
+ // Record the string form first, exactly as before, then derive the bytes.
+ this.pattern = pattern;
+ final byte[] encoded;
+ try {
+ encoded = pattern.getBytes("UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ this.byteSub = encoded;
+ }
+
+ /**
+ * Returns the pattern string most recently stored by {@code setPattern}.
+ *
+ * @return the stored pattern, or {@code null} if none has been set
+ */
+ @Override
+ public String getPattern() {
+ return this.pattern;
+ }
}
/**
* Matches the ending of each string to its pattern.
*/
- protected static class EndChecker implements Checker {
- byte [] byteSub;
+ public static class EndChecker implements Checker {
+ private static final long serialVersionUID = 1L;
+ private String pattern;
+ private transient byte[] byteSub;
- public EndChecker(String pattern) {
+ public void setPattern(String pattern) {
+ this.pattern = pattern;
try {
byteSub = pattern.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
@@ -276,6 +299,10 @@ public EndChecker(String pattern) {
}
}
+ /**
+ * Returns the pattern string most recently stored by {@code setPattern}.
+ *
+ * @return the stored pattern, or {@code null} if none has been set
+ */
+ @Override
+ public String getPattern() {
+ return pattern;
+ }
+
public boolean check(byte[] byteS, int start, int len) {
int lenSub = byteSub.length;
if (len < lenSub) {
@@ -288,23 +315,17 @@ public boolean check(byte[] byteS, int start, int len) {
}
return true;
}
+
}
/**
* Matches the middle of each string to its pattern.
*/
- protected static class MiddleChecker implements Checker {
- byte [] byteSub;
- int lenSub;
-
- public MiddleChecker(String pattern) {
- try {
- byteSub = pattern.getBytes("UTF-8");
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(e);
- }
- lenSub = byteSub.length;
- }
+ public static class MiddleChecker implements Checker {
+ private static final long serialVersionUID = 1L;
+ private String pattern;
+ private transient byte[] byteSub;
+ private transient int lenSub;
public boolean check(byte[] byteS, int start, int len) {
if (len < lenSub) {
@@ -326,37 +347,58 @@ public boolean check(byte[] byteS, int start, int len) {
}
return match;
}
+
+ /**
+ * Stores the pattern and rebuilds the two transient caches derived from it:
+ * the UTF-8 bytes in {@code byteSub} and their length in {@code lenSub}.
+ *
+ * @param pattern the literal substring to look for
+ * @throws RuntimeException wrapping the UnsupportedEncodingException if
+ * the "UTF-8" charset name is rejected
+ */
+ @Override
+ public void setPattern(String pattern) {
+ this.pattern = pattern;
+ try {
+ byteSub = pattern.getBytes("UTF-8");
+ // Cached so check() can read the length without touching the array.
+ lenSub = byteSub.length;
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Returns the pattern string most recently stored by {@code setPattern}.
+ *
+ * @return the stored pattern, or {@code null} if none has been set
+ */
+ @Override
+ public String getPattern() {
+ return pattern;
+ }
}
/**
* Matches each string to a pattern with Java regular expression package.
*/
- protected static class ComplexChecker implements Checker {
- private final Pattern compiledPattern;
- private final Matcher matcher;
- private final FastUTF8Decoder decoder;
-
- public ComplexChecker(String regExpPattern) {
- compiledPattern = Pattern.compile(regExpPattern);
- matcher = compiledPattern.matcher("");
- decoder = new FastUTF8Decoder();
- }
+ public static class ComplexChecker implements Checker {
+ private static final long serialVersionUID = 1L;
+ // The regular expression text handed to setPattern; the only non-transient state.
+ private String pattern;
+ // Everything below is transient and is (re)created by setPattern, so
+ // check() must not be called before setPattern has run.
+ private transient Pattern compiledPattern;
+ // A single Matcher is reset and reused for every check() call.
+ // NOTE(review): that makes an instance unsafe to share across threads - confirm callers.
+ private transient Matcher matcher;
+ private transient FastUTF8Decoder decoder;
public boolean check(byte[] byteS, int start, int len) {
// Match the given bytes with the like pattern
matcher.reset(decoder.decodeUnsafely(byteS, start, len));
return matcher.matches();
}
+
+ /**
+ * Stores the pattern, compiles it with {@link Pattern#compile(String)} and
+ * creates the reusable matcher and decoder.
+ *
+ * @param pattern a Java regular expression (not a raw LIKE pattern)
+ */
+ @Override
+ public void setPattern(String pattern) {
+ this.pattern = pattern;
+ compiledPattern = Pattern.compile(pattern);
+ matcher = compiledPattern.matcher("");
+ decoder = new FastUTF8Decoder();
+ }
+
+ /**
+ * Returns the regular expression most recently stored by {@code setPattern}.
+ *
+ * @return the stored pattern, or {@code null} if none has been set
+ */
+ @Override
+ public String getPattern() {
+ return pattern;
+ }
}
/**
* A fast UTF-8 decoder that caches necessary objects for decoding.
*/
- protected static class FastUTF8Decoder {
- private final CharsetDecoder decoder;
-
- private ByteBuffer byteBuffer;
- private CharBuffer charBuffer;
+ public static class FastUTF8Decoder {
+ private static final long serialVersionUID = 1L;
+ private transient CharsetDecoder decoder;
+ private transient ByteBuffer byteBuffer;
+ private transient CharBuffer charBuffer;
public FastUTF8Decoder() {
decoder = Charset.forName("UTF-8").newDecoder()
@@ -391,11 +433,19 @@ public CharBuffer decodeUnsafely(byte[] byteS, int start, int len) {
}
}
- /**
- * Returns inner checker name. It is for test purpose only.
- * @return
- */
- String getCheckerName() {
- return checker.getClass().getSimpleName();
+ public Checker getChecker() { // Returns the checker currently held in the checker field.
+ return checker;
+ }
+
+ public void setChecker(Checker checker) { // Replaces the checker field; no validation is done here.
+ this.checker = checker;
+ }
+
+ public int getColNum() { // Returns the value of the colNum field.
+ return colNum;
+ }
+
+ public void setColNum(int colNum) { // Overwrites the colNum field; no range check is done here.
+ this.colNum = colNum;
}
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
index eb3e511..015ed8d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
@@ -18,552 +18,112 @@
package org.apache.hadoop.hive.ql.exec.vector.expressions;
-import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.io.Text;
-
/**
* Evaluate LIKE filter on a batch for a vector of strings.
*/
-public class FilterStringColLikeStringScalar extends VectorExpression {
+public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar {
private static final long serialVersionUID = 1L;
- private int colNum;
- private Pattern compiledPattern;
- private PatternType type = PatternType.NONE;
- private String simpleStringPattern;
- private transient Text simplePattern = new Text();
- private transient ByteBuffer byteBuffer;
- private transient CharBuffer charBuffer;
- private transient CharsetDecoder decoder;
-
- // Doing characters comparison directly instead of regular expression
- // matching for simple patterns like "%abc%".
- public enum PatternType {
- NONE, // "abc"
- BEGIN, // "abc%"
- END, // "%abc"
- MIDDLE, // "%abc%"
- COMPLEX, // all other cases, such as "ab%c_de"
- }
+ private transient static List- * Examples:
- *
- *
- * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
- * and changes {@link #simplePattern} to "abc"
- * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
- * and does not change {@link #simplePattern}
- *
- *
- *
- *
- * @param likePattern
- * the input LIKE query pattern
+ * Accepts simple LIKE patterns like "abc" and creates corresponding checkers.
*/
- private void parseSimplePattern(String likePattern) {
- int length = likePattern.length();
- int beginIndex = 0;
- int endIndex = length;
- char lastChar = 'a';
- String strPattern = new String();
- type = PatternType.NONE;
-
- for (int i = 0; i < length; i++) {
- char n = likePattern.charAt(i);
- if (n == '_') { // such as "a_b"
- if (lastChar != '\\') { // such as "a%bc"
- type = PatternType.COMPLEX;
- return;
- } else { // such as "abc\%de%"
- strPattern += likePattern.substring(beginIndex, i - 1);
- beginIndex = i;
- }
- } else if (n == '%') {
- if (i == 0) { // such as "%abc"
- type = PatternType.END;
- beginIndex = 1;
- } else if (i < length - 1) {
- if (lastChar != '\\') { // such as "a%bc"
- type = PatternType.COMPLEX;
- return;
- } else { // such as "abc\%de%"
- strPattern += likePattern.substring(beginIndex, i - 1);
- beginIndex = i;
- }
- } else {
- if (lastChar != '\\') {
- endIndex = length - 1;
- if (type == PatternType.END) { // such as "%abc%"
- type = PatternType.MIDDLE;
- } else {
- type = PatternType.BEGIN; // such as "abc%"
- }
- } else { // such as "abc\%"
- strPattern += likePattern.substring(beginIndex, i - 1);
- beginIndex = i;
- endIndex = length;
- }
- }
+ private static class NoneCheckerFactory implements CheckerFactory {
+ // One or more characters, none of which is '%' or '_'.
+ private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+");
+
+ /**
+ * Builds a {@code NoneChecker} when the whole pattern matches {@code NONE_PATTERN}.
+ *
+ * @param pattern the LIKE pattern to examine
+ * @return a {@code NoneChecker} configured with {@code pattern}, or {@code null}
+ * when the pattern is empty or contains '%' or '_'
+ */
+ public Checker tryCreate(String pattern) {
+ // Guard clause: anything with a wildcard (or nothing at all) is not ours.
+ if (!NONE_PATTERN.matcher(pattern).matches()) {
+ return null;
}
- lastChar = n;
+ NoneChecker literalChecker = new NoneChecker();
+ literalChecker.setPattern(pattern);
+ return literalChecker;
}
-
- strPattern += likePattern.substring(beginIndex, endIndex);
- simpleStringPattern = strPattern;
- simplePattern.set(simpleStringPattern);
}
- @Override
- public void evaluate(VectorizedRowBatch batch) {
-
- if (childExpressions != null) {
- super.evaluateChildren(batch);
- }
-
- BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
- int[] sel = batch.selected;
- boolean[] nullPos = inputColVector.isNull;
- int n = batch.size;
- byte[][] vector = inputColVector.vector;
- int[] length = inputColVector.length;
- int[] start = inputColVector.start;
- byte[] simplePatternBytes = simplePattern.getBytes();
-
- // return immediately if batch is empty
- if (n == 0) {
- return;
- }
-
- if (inputColVector.noNulls) {
- if (inputColVector.isRepeating) {
-
- // All must be selected otherwise size would be zero Repeating property will not change.
- if (!like(vector[0], start[0], length[0])) {
-
- // Entire batch is filtered out.
- batch.size = 0;
- }
- } else if (batch.selectedInUse) {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case BEGIN:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case END:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case MIDDLE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case COMPLEX:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- break;
- }
-
- batch.size = newSize;
- } else {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int i = 0; i != n; i++) {
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case BEGIN:
- for (int i = 0; i != n; i++) {
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case END:
- for (int i = 0; i != n; i++) {
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case MIDDLE:
- for (int i = 0; i != n; i++) {
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case COMPLEX:
- for (int i = 0; i != n; i++) {
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- break;
- }
-
- if (newSize < n) {
- batch.size = newSize;
- batch.selectedInUse = true;
- }
- }
- } else {
- if (inputColVector.isRepeating) {
-
- //All must be selected otherwise size would be zero. Repeating property will not change.
- if (!nullPos[0]) {
- if (!like(vector[0], start[0], length[0])) {
-
- //Entire batch is filtered out.
- batch.size = 0;
- }
- } else {
- batch.size = 0;
- }
- } else if (batch.selectedInUse) {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case BEGIN:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case END:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case MIDDLE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case COMPLEX:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- }
-
- //Change the selected vector
- batch.size = newSize;
- } else {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case BEGIN:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case END:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case MIDDLE:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case COMPLEX:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- }
-
- if (newSize < n) {
- batch.size = newSize;
- batch.selectedInUse = true;
- }
-
- /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
- * future operations. So selectedInUse will remain false.
- */
- }
+ /**
+ * Accepts any LIKE patterns and creates corresponding checkers.
+ */
+ private static class ComplexCheckerFactory implements CheckerFactory {
+ /**
+ * Always succeeds: converts the LIKE pattern with
+ * {@code UDFLike.likePatternToRegExp} and wraps the result in a
+ * {@code ComplexChecker}.
+ *
+ * @param pattern the LIKE pattern to convert
+ * @return a {@code ComplexChecker} configured with the converted pattern
+ */
+ public Checker tryCreate(String pattern) {
+ String regExp = UDFLike.likePatternToRegExp(pattern);
+ ComplexChecker regExpChecker = new ComplexChecker();
+ regExpChecker.setPattern(regExp);
+ return regExpChecker;
}
}
-
- @Override
- public int getOutputColumn() {
- return -1;
- }
-
- @Override
- public String getOutputType() {
- return "boolean";
- }
-
- public int getColNum() {
- return colNum;
- }
-
- public void setColNum(int colNum) {
- this.colNum = colNum;
- }
-
- public Pattern getCompiledPattern() {
- return compiledPattern;
- }
-
- public void setCompiledPattern(Pattern compiledPattern) {
- this.compiledPattern = compiledPattern;
- }
-
- public void setType(PatternType type) {
- this.type = type;
- }
-
- public String getSimpleStringPattern() {
- return simpleStringPattern;
- }
-
- public void setSimpleStringPattern(String simpleStringPattern) {
- this.simpleStringPattern = simpleStringPattern;
- simplePattern.set(simpleStringPattern);
- }
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java
index 2304925..1b88bb2 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java
@@ -30,11 +30,12 @@
* Evaluate REGEXP filter on a batch for a vector of strings.
*/
public class FilterStringColRegExpStringScalar extends AbstractFilterStringColLikeStringScalar {
+ private static final long serialVersionUID = 1L;
+
private static final String LITERAL_CHAR = "[^\\[\\]\\\\(){}*?+|$^.]";
- private static final String LITERAL_CHAR_OR_DOT = "[^\\[\\]\\\\(){}*?+|$^]";
private static final String LITERAL_CHAR_GROUP = "(" + LITERAL_CHAR + "+)";
- private static List