diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java new file mode 100644 index 0000000..da39b60 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java @@ -0,0 +1,365 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp;
+
+/**
+ * Filters a batch on a string column that is matched against a constant pattern.
+ *
+ * <p>The pattern is analysed once, in the constructor: every {@link CheckerFactory} supplied by
+ * the subclass is offered the pattern in list order and the first {@link Checker} that is
+ * produced is used for all rows. The simple checkers compare bytes directly; the others decode
+ * the value as UTF-8 first.
+ */
+abstract class AbstractFilterStringColLikeStringScalar extends VectorExpression {
+  private final int colNum;
+  private final Checker checker;
+
+  /**
+   * @param colNum index of the input column in the batch
+   * @param likePattern the constant pattern; its string form is handed to the checker factories
+   */
+  public AbstractFilterStringColLikeStringScalar(int colNum, Text likePattern) {
+    this.colNum = colNum;
+    checker = createChecker(likePattern.toString());
+  }
+
+  /**
+   * Returns the factories to try, most specific first. This is called from the constructor of
+   * this class, so implementations must not rely on instance state of the subclass.
+   */
+  protected abstract List<CheckerFactory> getCheckerFactories();
+
+  /**
+   * Picks the first checker that one of the factories is willing to create for the pattern.
+   *
+   * @throws IllegalArgumentException if no factory accepts the pattern; failing here is clearer
+   *         than the NullPointerException that a null checker would cause in evaluate()
+   */
+  private Checker createChecker(String pattern) {
+    for (CheckerFactory checkerFactory : getCheckerFactories()) {
+      Checker candidate = checkerFactory.tryCreate(pattern);
+      if (candidate != null) {
+        return candidate;
+      }
+    }
+    throw new IllegalArgumentException("No checker can handle pattern: " + pattern);
+  }
+
+  /**
+   * Keeps only the rows whose value passes the checker. Null values never qualify. The result is
+   * reported through batch.size, batch.selected and batch.selectedInUse.
+   */
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] length = inputColVector.length;
+    int[] start = inputColVector.start;
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero. Repeating property will not change.
+        if (!checker.check(vector[0], start[0], length[0])) {
+
+          // Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (checker.check(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for (int i = 0; i != n; i++) {
+          if (checker.check(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+
+        // Only switch to the selection vector when at least one row was dropped.
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero. Repeating property will not change.
+        if (!nullPos[0]) {
+          if (!checker.check(vector[0], start[0], length[0])) {
+
+            // Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+
+          // The repeated value is null, so no row qualifies.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+            if (checker.check(vector[i], start[i], length[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+
+        // Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+
+        for (int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (checker.check(vector[i], start[i], length[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+
+        /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
+         * future operations. So selectedInUse will remain false.
+         */
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+
+  /**
+   * Encodes a pattern fragment as exactly its UTF-8 bytes.
+   *
+   * <p>Do not use {@code new Text(s).getBytes()} for this: Text.getBytes() returns the backing
+   * array, of which only the first getLength() bytes are valid, so the array length can be
+   * larger than the encoded string and every length comparison in the checkers would be off.
+   */
+  private static byte[] utf8Bytes(String s) {
+    try {
+      return s.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Decides whether the bytes byteS[start .. start + len) satisfy a pattern. */
+  protected static interface Checker {
+    boolean check(byte[] byteS, int start, int len);
+  }
+
+  /** Creates a {@link Checker} for a pattern, or returns null if the pattern is not its kind. */
+  protected static interface CheckerFactory {
+    Checker tryCreate(String pattern);
+  }
+
+  /** Accepts values whose bytes are exactly the bytes of the pattern (no wildcards). */
+  protected static class NoneChecker implements Checker {
+    byte [] byteSub;
+
+    public NoneChecker(String regExpPattern) {
+      byteSub = utf8Bytes(regExpPattern);
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      int lenSub = byteSub.length;
+      if (len != lenSub) {
+        return false;
+      }
+      for (int i = start, j = 0; j < len; i++, j++) {
+        if (byteS[i] != byteSub[j]) {
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  /** Accepts values that start with the given bytes. */
+  protected static class BeginChecker implements Checker {
+    byte [] byteSub;
+
+    public BeginChecker(String group) {
+      byteSub = utf8Bytes(group);
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      if (len < byteSub.length) {
+        return false;
+      }
+      for (int i = start, j = 0; j < byteSub.length; i++, j++) {
+        if (byteS[i] != byteSub[j]) {
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  /** Accepts values that end with the given bytes. */
+  protected static class EndChecker implements Checker {
+    byte [] byteSub;
+
+    public EndChecker(String group) {
+      byteSub = utf8Bytes(group);
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      int lenSub = byteSub.length;
+      if (len < lenSub) {
+        return false;
+      }
+
+      // Compare the last lenSub bytes of the value with the suffix.
+      for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) {
+        if (byteS[i] != byteSub[j]) {
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  /** Accepts values that contain the given bytes anywhere. */
+  protected static class MiddleChecker implements Checker {
+    byte [] byteSub;
+    int lenSub;
+
+    public MiddleChecker(String group) {
+      byteSub = utf8Bytes(group);
+      lenSub = byteSub.length;
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      if (len < lenSub) {
+        return false;
+      }
+
+      // Try every offset at which the needle still fits into the value.
+      int end = start + len - lenSub + 1;
+      for (int i = start; i < end; i++) {
+        boolean match = true;
+        for (int j = 0; j < lenSub; j++) {
+          if (byteS[i + j] != byteSub[j]) {
+            match = false;
+            break;
+          }
+        }
+        if (match) {
+          return true;
+        }
+      }
+      return false;
+    }
+  }
+
+  /**
+   * Accepts values that equal the pattern character by character, where the designated wildcard
+   * character in the pattern matches any single character of the value.
+   */
+  protected static class AnyCharChecker implements Checker {
+    private String regExpPattern;
+    private FastUTF8Decoder decoder;
+    private int patternLength;
+    private char anySingleCharPattern;
+
+    public AnyCharChecker(char anySingleCharPattern, String regExpPattern) {
+      this.regExpPattern = regExpPattern;
+      this.patternLength = regExpPattern.length();
+      this.anySingleCharPattern = anySingleCharPattern;
+      this.decoder = new FastUTF8Decoder();
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      CharBuffer decoded = decoder.decodeUnsafely(byteS, start, len);
+      int decodedLength = decoded.length();
+
+      // Walk both sequences by code point rather than by char: a character outside the BMP
+      // occupies two chars, and a single wildcard has to consume all of it.
+      int p = 0;
+      int d = 0;
+      while (p < patternLength && d < decodedLength) {
+        int expected = regExpPattern.codePointAt(p);
+        int actual = Character.codePointAt(decoded, d);
+        if (expected != anySingleCharPattern && expected != actual) {
+          return false;
+        }
+        p += Character.charCount(expected);
+        d += Character.charCount(actual);
+      }
+
+      // Both must be used up, otherwise one of them is longer than the other.
+      return p == patternLength && d == decodedLength;
+    }
+  }
+
+  /** Accepts values that match a regular expression as a whole. */
+  protected static class ComplexChecker implements Checker {
+    private final Pattern compiledPattern;
+    private final Matcher matcher;
+    private final FastUTF8Decoder decoder;
+
+    public ComplexChecker(String regExpPattern) {
+      compiledPattern = Pattern.compile(regExpPattern);
+
+      // One matcher is created up front and reset for every value.
+      matcher = compiledPattern.matcher("");
+      decoder = new FastUTF8Decoder();
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      // Match the given bytes with the like pattern
+      matcher.reset(decoder.decodeUnsafely(byteS, start, len));
+      return matcher.matches();
+    }
+  }
+
+  /**
+   * UTF-8 decoder that reuses its buffers between calls. Malformed and unmappable input is
+   * replaced instead of reported.
+   */
+  private static class FastUTF8Decoder {
+    private final CharsetDecoder decoder;
+
+    private ByteBuffer byteBuffer;
+    private CharBuffer charBuffer;
+
+    public FastUTF8Decoder() {
+      decoder = Charset.forName("UTF-8").newDecoder()
+          .onMalformedInput(CodingErrorAction.REPLACE)
+          .onUnmappableCharacter(CodingErrorAction.REPLACE);
+      byteBuffer = ByteBuffer.allocate(4);
+      charBuffer = CharBuffer.allocate(4);
+    }
+
+    /**
+     * Decodes byteS[start .. start + len). The returned buffer is the decoder's internal buffer
+     * and is overwritten by the next call, so it must be consumed before decoding again.
+     */
+    public CharBuffer decodeUnsafely(byte[] byteS, int start, int len) {
+      // Prepare buffers
+      if (byteBuffer.capacity() < len) {
+        byteBuffer = ByteBuffer.allocate(len * 2);
+      }
+      byteBuffer.clear();
+      byteBuffer.put(byteS, start, len);
+      byteBuffer.flip();
+
+      int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte());
+      if (charBuffer.capacity() < maxChars) {
+        charBuffer = CharBuffer.allocate(maxChars);
+      }
+      charBuffer.clear();
+
+      // Decode UTF-8
+      decoder.reset();
+      decoder.decode(byteBuffer, charBuffer, true);
+      decoder.flush(charBuffer);
+      charBuffer.flip();
+
+      return charBuffer;
+    }
+  }
+
+  /** Returns the simple class name of the checker chosen for the pattern. */
+  String getCheckerName() {
+    return checker.getClass().getSimpleName();
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
index 24ba861..705af6f 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
@@ -18,140 +18,99 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * Evaluate LIKE filter on a batch for a vector of strings.
*/ -public class FilterStringColLikeStringScalar extends VectorExpression { - private int colNum; - private Text likePattern; - private Text s; - private UDFLike likeFunc; +public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar { + + private static List checkerFactories = Arrays.asList( + new LikeBeginCheckerFactory(), + new LikeEndCheckerFactory(), + new LikeMiddleCheckerFactory(), + new LikeAnyCharCheckerFactory(), + new LikeNoneCheckerFactory(), + new LikeComplexCheckerFactory()); public FilterStringColLikeStringScalar(int colNum, Text likePattern) { - this.colNum = colNum; - this.likePattern = likePattern; - likeFunc = new UDFLike(); - s = new Text(); + super(colNum, likePattern); } - /* - * This vectorized version of LIKE calls the standard LIKE - * function code. In the future, as an optimization, consider - * unwinding some of that logic here, e.g. to determine - * if the LIKE pattern is a simple one like 'abc%' so that - * can be executed more efficiently as a special case. 
- */ - - private boolean like(byte[] bytes, int start, int len) { - s.set(bytes, start, len); - return (likeFunc.evaluate(s, likePattern)).get(); + @Override + protected List getCheckerFactories() { + return checkerFactories; } - @Override - public void evaluate(VectorizedRowBatch batch) { - BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; - int[] sel = batch.selected; - boolean[] nullPos = inputColVector.isNull; - int n = batch.size; - byte[][] vector = inputColVector.vector; - int[] length = inputColVector.length; - int[] start = inputColVector.start; - - - // return immediately if batch is empty - if (n == 0) { - return; + private static class LikeBeginCheckerFactory implements CheckerFactory { + private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%"); + + public Checker tryCreate(String pattern) { + Matcher matcher = BEGIN_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new BeginChecker(matcher.group(1)); + } + return null; } + } + + private static class LikeEndCheckerFactory implements CheckerFactory { + private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)"); - if (inputColVector.noNulls) { - if (inputColVector.isRepeating) { - - // All must be selected otherwise size would be zero Repeating property will not change. - if (!like(vector[0], start[0], length[0])) { - - //Entire batch is filtered out. 
- batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - for(int j=0; j != n; j++) { - int i = sel[j]; - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - batch.size = newSize; - } else { - int newSize = 0; - for(int i = 0; i != n; i++) { - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } + public Checker tryCreate(String pattern) { + Matcher matcher = END_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new EndChecker(matcher.group(1)); } - } else { - if (inputColVector.isRepeating) { - - //All must be selected otherwise size would be zero. Repeating property will not change. - if (!nullPos[0]) { - if (!like(vector[0], start[0], length[0])) { - - //Entire batch is filtered out. - batch.size = 0; - } - } else { - batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - for(int j=0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - - //Change the selected vector - batch.size = newSize; - } else { - int newSize = 0; - for(int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (like(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } - - /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline - * future operations. So selectedInUse will remain false. 
- */ + return null; + } + } + + private static class LikeMiddleCheckerFactory implements CheckerFactory { + private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%"); + + public Checker tryCreate(String pattern) { + Matcher matcher = MIDDLE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new MiddleChecker(matcher.group(1)); } + return null; } } - @Override - public int getOutputColumn() { - return -1; + private static class LikeAnyCharCheckerFactory implements CheckerFactory { + private static final Pattern ANY_CHAR_PATTERN = Pattern.compile("[^%_]*_[^%]*"); + + public Checker tryCreate(String pattern) { + Matcher matcher = ANY_CHAR_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new AnyCharChecker('_', pattern); + } + return null; + } } - @Override - public String getOutputType() { - return "boolean"; + private static class LikeNoneCheckerFactory implements CheckerFactory { + private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+"); + + public Checker tryCreate(String pattern) { + Matcher matcher = NONE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new NoneChecker(pattern); + } + return null; + } + } + + private static class LikeComplexCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + return new ComplexChecker(UDFLike.likePatternToRegExp(pattern)); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java new file mode 100644 index 0000000..be624b8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java @@ -0,0 +1,225 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Evaluate REGEXP filter on a batch for a vector of strings.
+ *
+ * <p>A few common shapes of regular expression are recognised when the filter is constructed
+ * and evaluated without the regex engine; every other pattern is compiled as a
+ * java.util.regex pattern.
+ */
+public class FilterStringColRegExpStringScalar extends AbstractFilterStringColLikeStringScalar {
+  // One character other than the regex metacharacters [ ] \ ( ) { } * ? + | $ ^ and '.'.
+  private static final String LITERAL_CHAR = "[^\\[\\]\\\\(){}*?+|$^.]";
+  // Same as LITERAL_CHAR, except that '.' is admitted as well.
+  private static final String LITERAL_CHAR_OR_DOT = "[^\\[\\]\\\\(){}*?+|$^]";
+  // One or more literal characters, captured as a group.
+  private static final String LITERAL_CHAR_GROUP = "(" + LITERAL_CHAR + "+)";
+
+  // Tried in this order; the last factory accepts every pattern.
+  private static List<CheckerFactory> checkerFactories = Arrays.<CheckerFactory>asList(
+      new RegExpBeginCheckerFactory(),
+      new RegExpEndCheckerFactory(),
+      new RegExpMiddleCheckerFactory(),
+      new RegExpAnyCharCheckerFactory(),
+      new PhoneNumberCheckerFactory(),
+      new RegExpNoneCheckerFactory(),
+      new RegExpComplexCheckerFactory());
+
+  public FilterStringColRegExpStringScalar(int colNum, Text regExpPattern) {
+    super(colNum, regExpPattern);
+  }
+
+  @Override
+  protected List<CheckerFactory> getCheckerFactories() {
+    return checkerFactories;
+  }
+
+  /** Handles 'abc.*': literal characters followed by a single trailing '.*'. */
+  private static class RegExpBeginCheckerFactory implements CheckerFactory {
+    private static final Pattern BEGIN_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP + "\\.\\*");
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = BEGIN_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new BeginChecker(matcher.group(1));
+      }
+      return null;
+    }
+  }
+
+  /** Handles '.*abc': a single leading '.*' followed by literal characters. */
+  private static class RegExpEndCheckerFactory implements CheckerFactory {
+    private static final Pattern END_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP);
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = END_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new EndChecker(matcher.group(1));
+      }
+      return null;
+    }
+  }
+
+  /** Handles '.*abc.*': literal characters enclosed by '.*' on both sides. */
+  private static class RegExpMiddleCheckerFactory implements CheckerFactory {
+    private static final Pattern MIDDLE_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP + "\\.\\*");
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new MiddleChecker(matcher.group(1));
+      }
+      return null;
+    }
+  }
+
+  /**
+   * Hand-written state machine for two or three groups of decimal digits separated by '-'.
+   *
+   * <pre>
+   * state 0 --[0-9]--> state 1 --[0-9]--> state 1
+   * state 1 ----'-'--> state 2
+   * state 2 --[0-9]--> state 3 --[0-9]--> state 3
+   * state 3 ----'-'--> state 4
+   * state 4 --[0-9]--> state 5 --[0-9]--> state 5
+   * </pre>
+   *
+   * Ending in state 3 means two complete groups were read, ending in state 5 means three. Any
+   * other input character rejects the value.
+   */
+  private static class PhoneNumberChecker implements Checker {
+    private final boolean twoGroups;
+    private final boolean threeGroups;
+
+    /**
+     * @param twoGroups whether a value with exactly two digit groups is accepted
+     * @param threeGroups whether a value with exactly three digit groups is accepted
+     */
+    public PhoneNumberChecker(boolean twoGroups, boolean threeGroups) {
+      this.twoGroups = twoGroups;
+      this.threeGroups = threeGroups;
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      int end = start + len;
+      int state = 0;
+      for (int i = start; i < end; i++) {
+
+        // The value is examined byte by byte. A negative byte turns into a char that is neither
+        // a digit nor '-', so it is rejected below.
+        char c = (char) byteS[i];
+        switch (state) {
+
+        // [0-9]: next
+        // other: false
+        case 0:
+        case 2:
+        case 4:
+          if ('0' <= c && c <= '9') {
+            state++;
+          } else {
+            return false;
+          }
+          break;
+
+        // [0-9]: self
+        // -: next
+        // other: false
+        case 1:
+        case 3:
+          if ('0' <= c && c <= '9') {
+            // self
+          } else if (c == '-') {
+            state++;
+          } else {
+            return false;
+          }
+          break;
+
+        // [0-9]: self
+        // other: false
+        case 5:
+          if ('0' <= c && c <= '9') {
+            // self
+          } else {
+            return false;
+          }
+        }
+      }
+      if (twoGroups && state == 3) {
+        return true;
+      }
+      if (threeGroups && state == 5) {
+        return true;
+      }
+      return false;
+    }
+  }
+
+  /** Recognises exactly three spellings of digit-group patterns; anything else is declined. */
+  private static class PhoneNumberCheckerFactory implements CheckerFactory {
+    public Checker tryCreate(String pattern) {
+      if (pattern.equals("[0-9]+-[0-9]+")) {
+        return new PhoneNumberChecker(true, false);
+      }
+      if (pattern.equals("[0-9]+-[0-9]+-[0-9]+")) {
+        return new PhoneNumberChecker(false, true);
+      }
+      if (pattern.equals("([0-9]+-)?[0-9]+-[0-9]+")) {
+        return new PhoneNumberChecker(true, true);
+      }
+      return null;
+    }
+  }
+
+  /** Handles patterns such as 'a.c': literal characters with at least one '.' wildcard. */
+  private static class RegExpAnyCharCheckerFactory implements CheckerFactory {
+    private static final Pattern ANY_CHAR_PATTERN = Pattern.compile(LITERAL_CHAR + "*\\." + LITERAL_CHAR_OR_DOT + "*");
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = ANY_CHAR_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new AnyCharChecker('.', pattern);
+      }
+      return null;
+    }
+  }
+
+  /** Handles patterns made of literal characters only, i.e. plain equality. */
+  private static class RegExpNoneCheckerFactory implements CheckerFactory {
+    private static final Pattern NONE_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP);
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = NONE_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new NoneChecker(pattern);
+      }
+      return null;
+    }
+  }
+
+  /** Fallback for every other pattern: compile it as a regular expression as is. */
+  private static class RegExpComplexCheckerFactory implements CheckerFactory {
+    public Checker tryCreate(String pattern) {
+      return new ComplexChecker(pattern);
+    }
+  }
+}
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
index 6e26412..73cf90c 100644
---
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
@@ -52,7 +52,9 @@
   private static byte[] mixedUpLower;
   private static byte[] mixedUpUpper;
   private static byte[] multiByte;
-  private static byte[] mixPercentPattern;
+  private static byte[] shortPhoneNumber;
+  private static byte[] longPhoneNumber;
+  private static byte[] incompletePhoneNumber;
 
   static {
     try {
@@ -67,8 +69,10 @@
       mixedUp = "mixedUp".getBytes("UTF-8");
       mixedUpLower = "mixedup".getBytes("UTF-8");
       mixedUpUpper = "MIXEDUP".getBytes("UTF-8");
-      mixPercentPattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
       multiByte = new byte[100];
+      shortPhoneNumber = "0123-4567".getBytes("UTF-8");
+      longPhoneNumber = "012-3456-7890".getBytes("UTF-8");
+      incompletePhoneNumber = "012-3456-".getBytes("UTF-8");
       addMultiByteChars(multiByte);
     } catch (UnsupportedEncodingException e) {
       e.printStackTrace();
@@ -189,7 +193,7 @@ public void testStringColCompareStringColFilter() {
     expr.evaluate(batch);
     Assert.assertEquals(1, batch.size);
     Assert.assertEquals(0, batch.selected[0]);
-    
+
     // no nulls possible
     batch = makeStringBatchForColColCompare();
     batch.cols[0].noNulls = true;
@@ -578,7 +582,7 @@ public void testStringLike() {
     Text pattern;
     int initialBatchSize;
     batch = makeStringBatchMixedCharSize();
-    pattern = new Text(mixPercentPattern);
+    pattern = new Text("mix%");
     FilterStringColLikeStringScalar expr = new FilterStringColLikeStringScalar(0, pattern);
     expr.evaluate(batch);
 
@@ -625,6 +629,236 @@ public void testStringLike() {
   }
 
   @Test
+  public void testStringLikePatternType() {
+    FilterStringColLikeStringScalar expr;
+    VectorizedRowBatch batch;
+
+    // BeginChecker
+    expr = new FilterStringColLikeStringScalar(0, new Text("mixed%"));
+    Assert.assertEquals("BeginChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // EndChecker
+    expr = new FilterStringColLikeStringScalar(0, new Text("%Up"));
+    Assert.assertEquals("EndChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // MiddleChecker
+    expr = new FilterStringColLikeStringScalar(0, new Text("%xed%"));
+    Assert.assertEquals("MiddleChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // AnyCharChecker
+    expr = new FilterStringColLikeStringScalar(0, new Text("m_x_d_p"));
+    Assert.assertEquals("AnyCharChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // ComplexChecker
+    expr = new FilterStringColLikeStringScalar(0, new Text("%ix%Up"));
+    Assert.assertEquals("ComplexChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // NoneChecker
+    expr= new FilterStringColLikeStringScalar(0, new Text("mixedUp"));
+    Assert.assertEquals("NoneChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+  }
+
+  @Test
+  public void testStringLikeMultiByte() throws UnsupportedEncodingException {
+    FilterStringColLikeStringScalar expr;
+    VectorizedRowBatch batch;
+    byte[] percentBytes = "%".getBytes("UTF-8");
+    byte[] xBytes = "x".getBytes("UTF-8");
+
+    // verify that a multi byte LIKE expression matches a matching string
+    batch = makeStringBatchMixedCharSize();
+    Text text = new Text();
+    text.append(percentBytes, 0, percentBytes.length);
+    text.append(multiByte, 0, 10);
+    text.append(percentBytes, 0, percentBytes.length);
+    expr = new FilterStringColLikeStringScalar(0, text);
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+
+    // verify that a multi byte LIKE expression doesn't match a non-matching string
+    text.clear();
+    text.append(percentBytes, 0, percentBytes.length);
+    text.append(multiByte, 0, 10);
+    text.append(xBytes, 0, xBytes.length);
+    batch = makeStringBatchMixedCharSize();
+    expr = new FilterStringColLikeStringScalar(0, text);
+    expr.evaluate(batch);
+    Assert.assertEquals(0, batch.size);
+  }
+
+  @Test
+  public void testStringRegExp() {
+
+    // has nulls, not repeating
+    VectorizedRowBatch batch;
+    Text pattern;
+    int initialBatchSize;
+    batch = makeStringBatchMixedCharSize();
+    pattern = new Text("mix.*");
+    FilterStringColRegExpStringScalar expr = new FilterStringColRegExpStringScalar(0, pattern);
+    expr.evaluate(batch);
+
+    // verify that the beginning entry is the only one that matches
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+
+    // verify that the beginning entry is the only one that matches
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    initialBatchSize = batch.size;
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+
+    // all rows qualify
+    Assert.assertEquals(initialBatchSize, batch.size);
+
+    // same, but repeating value is null
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].isNull[0] = true;
+    expr.evaluate(batch);
+
+    // no rows qualify
+    Assert.assertEquals(0, batch.size);
+
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    initialBatchSize = batch.size;
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+
+    // all rows qualify
+    Assert.assertEquals(initialBatchSize, batch.size);
+  }
+
+  @Test
+  public void testStringRegExpPatternType() {
+    FilterStringColRegExpStringScalar expr;
+    VectorizedRowBatch batch;
+
+    // BeginChecker
+    expr = new FilterStringColRegExpStringScalar(0, new Text("mixed.*"));
+    Assert.assertEquals("BeginChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // EndChecker
+    expr = new FilterStringColRegExpStringScalar(0, new Text(".*Up"));
+    Assert.assertEquals("EndChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // MiddleChecker
+    expr = new FilterStringColRegExpStringScalar(0, new Text(".*xed.*"));
+    Assert.assertEquals("MiddleChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // AnyCharChecker
+    expr = new FilterStringColRegExpStringScalar(0, new Text("m.x.d.p"));
+    Assert.assertEquals("AnyCharChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // ComplexChecker
+    expr = new FilterStringColRegExpStringScalar(0, new Text(".*ix.*Up"));
+    Assert.assertEquals("ComplexChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // PhoneNumberChecker
+    expr = new FilterStringColRegExpStringScalar(0, new Text("([0-9]+-)?[0-9]+-[0-9]+"));
+    Assert.assertEquals("PhoneNumberChecker", expr.getCheckerName());
+    batch = makeStringBatchPhoneNumber();
+    expr.evaluate(batch);
+    Assert.assertEquals(2, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+    Assert.assertEquals(1, batch.selected[1]);
+
+    // NoneChecker
+    expr = new FilterStringColRegExpStringScalar(0, new Text("mixedUp"));
+    Assert.assertEquals("NoneChecker", expr.getCheckerName());
+    batch = makeStringBatchMixedCharSize();
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+  }
+
+  private VectorizedRowBatch makeStringBatchPhoneNumber() {
+
+    // create a new batch with one string column (for input) and one long column (for output)
+    VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[1] = outV;
+
+    /*
+     * Add these 4 rows:
+     *
+     * 0123-4567 (two digit groups)
+     * 012-3456-7890 (three digit groups)
+     * NULL (the bytes are the incomplete number 012-3456-, but the row is flagged null)
+     * green (not a phone number)
+     */
+    v.setRef(0, shortPhoneNumber, 0, shortPhoneNumber.length);
+    v.isNull[0] = false;
+    v.setRef(1, longPhoneNumber, 0, longPhoneNumber.length);
+    v.isNull[1] = false;
+    v.setRef(2, incompletePhoneNumber, 0, incompletePhoneNumber.length);
+    v.isNull[2] = true;
+    v.noNulls = false;
+    v.setRef(3, green, 0, green.length);
+    v.isNull[3] = false;
+
+    batch.size = 4;
+    return batch;
+  }
+
+  @Test
   public void testColConcatScalar() {
 
     // has nulls, not repeating