diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java index 3779091..e51f00a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java @@ -18,514 +18,101 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.udf.UDFLike; import org.apache.hadoop.io.Text; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp; - /** * Evaluate LIKE filter on a batch for a vector of strings. */ -public class FilterStringColLikeStringScalar extends VectorExpression { - private int colNum; - private Pattern compiledPattern; - private final Text simplePattern = new Text(); - private ByteBuffer byteBuffer; - private CharBuffer charBuffer; - private CharsetDecoder decoder; - private PatternType type = PatternType.NONE; +public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar { - // Doing characters comparison directly instead of regular expression - // matching for simple patterns like "%abc%". - enum PatternType { - NONE, // "abc" - BEGIN, // "abc%" - END, // "%abc" - MIDDLE, // "%abc%" - COMPLEX, // all other cases, such as "ab%c_de" - } + private static List checkerFactories = Arrays.asList( + new BeginCheckerFactory(), + new EndCheckerFactory(), + new MiddleCheckerFactory(), + new NoneCheckerFactory(), + new ComplexCheckerFactory()); public FilterStringColLikeStringScalar(int colNum, Text likePattern) { - this.colNum = colNum; - String stringLikePattern = likePattern.toString(); - parseSimplePattern(stringLikePattern); - if (type == PatternType.COMPLEX) { - compiledPattern = Pattern.compile(likePatternToRegExp(stringLikePattern)); - } - decoder = Charset.forName("UTF-8").newDecoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - byteBuffer = ByteBuffer.allocate(4); - charBuffer = CharBuffer.allocate(4); + super(colNum, likePattern); } - PatternType getType() { - return type; - } - - private boolean like(byte[] bytes, int start, int len) { - switch (type) { - case NONE: - return noneLike(bytes, start, len, simplePattern.getBytes()); - case BEGIN: - return beginLike(bytes, start, len, simplePattern.getBytes()); - case END: - return endLike(bytes, start, len, simplePattern.getBytes()); - case MIDDLE: - return midLike(bytes, start, len, simplePattern.getBytes()); - case COMPLEX: - return complexLike(bytes, start, len); - default: - return false; - } + @Override + protected List getCheckerFactories() { + return checkerFactories; } - private static boolean noneLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len != lenSub) { - return false; - } - for (int i = start, j = 0; j < len; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; - } - } - return true; - } + /** + * Accepts simple LIKE patterns like "abc%" and creates corresponding checkers. + */ + private static class BeginCheckerFactory implements CheckerFactory { + private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%"); - private static boolean beginLike(byte[] byteS, int start, int len, byte[] byteSub) { - if (len < byteSub.length) { - return false; - } - for (int i = start, j = 0; j < byteSub.length; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; + public Checker tryCreate(String pattern) { + Matcher matcher = BEGIN_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new BeginChecker(matcher.group(1)); } + return null; } - return true; } - private static boolean endLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len < lenSub) { - return false; - } - for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) { - if (byteS[i] != byteSub[j]) { - return false; - } - } - return true; - } + /** + * Accepts simple LIKE patterns like "%abc" and creates a corresponding checkers. + */ + private static class EndCheckerFactory implements CheckerFactory { + private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)"); - private static boolean midLike(byte[] byteS, int start, int len, byte[] byteSub) { - int lenSub = byteSub.length; - if (len < lenSub) { - return false; - } - int end = start + len - lenSub + 1; - boolean match = false; - for (int i = start; (i < end) && (!match); i++) { - match = true; - for (int j = 0; j < lenSub; j++) { - if (byteS[i + j] != byteSub[j]) { - match = false; - break; - } + public Checker tryCreate(String pattern) { + Matcher matcher = END_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new EndChecker(matcher.group(1)); } + return null; } - return match; } /** - * Matches the byte array against the complex like pattern. This method uses - * {@link #compiledPattern} to match. For decoding performance, it caches - * {@link #compiledPattern}, {@link #byteBuffer} and {@link #charBuffer}. - * When the length to decode is greater than the capacity of - * {@link #byteBuffer}, it creates new {@link #byteBuffer} and - * {@link #charBuffer}. The capacity of the new {@link #byteBuffer} is the - * double of the length, for fewer object creations and higher memory - * utilization. - * - * @param byteS - * A byte array that contains a UTF-8 string. - * @param start - * A position to start decoding. - * @param len - * A length to decode. - * @return - * true if the byte array matches the complex like pattern, - * otherwise false. + * Accepts simple LIKE patterns like "%abc%" and creates a corresponding checkers. */ - private boolean complexLike(byte[] byteS, int start, int len) { - // Prepare buffers - if (byteBuffer.capacity() < len) { - byteBuffer = ByteBuffer.allocate(len * 2); - } - byteBuffer.clear(); - byteBuffer.put(byteS, start, len); - byteBuffer.flip(); + private static class MiddleCheckerFactory implements CheckerFactory { + private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%"); - int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte()); - if (charBuffer.capacity() < maxChars) { - charBuffer = CharBuffer.allocate(maxChars); + public Checker tryCreate(String pattern) { + Matcher matcher = MIDDLE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new MiddleChecker(matcher.group(1)); + } + return null; } - charBuffer.clear(); - - // Decode UTF-8 - decoder.reset(); - decoder.decode(byteBuffer, charBuffer, true); - decoder.flush(charBuffer); - charBuffer.flip(); - - // Match the given bytes with the like pattern - return compiledPattern.matcher(charBuffer).matches(); } /** - * Parses the likePattern. Based on it is a simple pattern or not, the - * function might change two member variables. {@link #type} will be changed - * to the corresponding pattern type; {@link #simplePattern} will record the - * string in it for later pattern matching if it is a simple pattern. - *

- * Examples:

- * - *
-   * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
-   * and changes {@link #simplePattern} to "abc"
-   * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
-   * and does not change {@link #simplePattern}
-   * 
- * - *
- * - * @param likePattern - * the input LIKE query pattern + * Accepts simple LIKE patterns like "abc" and creates corresponding checkers. */ - private void parseSimplePattern(String likePattern) { - int length = likePattern.length(); - int beginIndex = 0; - int endIndex = length; - char lastChar = 'a'; - String strPattern = new String(); - type = PatternType.NONE; + private static class NoneCheckerFactory implements CheckerFactory { + private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+"); - for (int i = 0; i < length; i++) { - char n = likePattern.charAt(i); - if (n == '_') { // such as "a_b" - if (lastChar != '\\') { // such as "a%bc" - type = PatternType.COMPLEX; - return; - } else { // such as "abc\%de%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - } - } else if (n == '%') { - if (i == 0) { // such as "%abc" - type = PatternType.END; - beginIndex = 1; - } else if (i < length - 1) { - if (lastChar != '\\') { // such as "a%bc" - type = PatternType.COMPLEX; - return; - } else { // such as "abc\%de%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - } - } else { - if (lastChar != '\\') { - endIndex = length - 1; - if (type == PatternType.END) { // such as "%abc%" - type = PatternType.MIDDLE; - } else { - type = PatternType.BEGIN; // such as "abc%" - } - } else { // such as "abc\%" - strPattern += likePattern.substring(beginIndex, i - 1); - beginIndex = i; - endIndex = length; - } - } + public Checker tryCreate(String pattern) { + Matcher matcher = NONE_PATTERN.matcher(pattern); + if (matcher.matches()) { + return new NoneChecker(pattern); } - lastChar = n; + return null; } - - strPattern += likePattern.substring(beginIndex, endIndex); - simplePattern.set(strPattern); } - @Override - public void evaluate(VectorizedRowBatch batch) { - - if (childExpressions != null) { - super.evaluateChildren(batch); - } - - BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; - int[] sel = batch.selected; - boolean[] nullPos = inputColVector.isNull; - int n = batch.size; - byte[][] vector = inputColVector.vector; - int[] length = inputColVector.length; - int[] start = inputColVector.start; - byte[] simplePatternBytes = simplePattern.getBytes(); - - // return immediately if batch is empty - if (n == 0) { - return; - } - - if (inputColVector.noNulls) { - if (inputColVector.isRepeating) { - - // All must be selected otherwise size would be zero Repeating property will not change. - if (!like(vector[0], start[0], length[0])) { - - // Entire batch is filtered out. - batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - - switch (type) { - case NONE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case BEGIN: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case END: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case MIDDLE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case COMPLEX: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - break; - } - - batch.size = newSize; - } else { - int newSize = 0; - - switch (type) { - case NONE: - for (int i = 0; i != n; i++) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case BEGIN: - for (int i = 0; i != n; i++) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case END: - for (int i = 0; i != n; i++) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case MIDDLE: - for (int i = 0; i != n; i++) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - break; - case COMPLEX: - for (int i = 0; i != n; i++) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - break; - } - - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } - } - } else { - if (inputColVector.isRepeating) { - - //All must be selected otherwise size would be zero. Repeating property will not change. - if (!nullPos[0]) { - if (!like(vector[0], start[0], length[0])) { - - //Entire batch is filtered out. - batch.size = 0; - } - } else { - batch.size = 0; - } - } else if (batch.selectedInUse) { - int newSize = 0; - - switch (type) { - case NONE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case BEGIN: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case END: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case MIDDLE: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case COMPLEX: - for (int j = 0; j != n; j++) { - int i = sel[j]; - if (!nullPos[i]) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - break; - } - - //Change the selected vector - batch.size = newSize; - } else { - int newSize = 0; - - switch (type) { - case NONE: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case BEGIN: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case END: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (endLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case MIDDLE: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (midLike(vector[i], start[i], length[i], simplePatternBytes)) { - sel[newSize++] = i; - } - } - } - break; - case COMPLEX: - for (int i = 0; i != n; i++) { - if (!nullPos[i]) { - if (complexLike(vector[i], start[i], length[i])) { - sel[newSize++] = i; - } - } - } - break; - } - - if (newSize < n) { - batch.size = newSize; - batch.selectedInUse = true; - } - - /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline - * future operations. So selectedInUse will remain false. - */ - } + /** + * Accepts any LIKE patterns and creates corresponding checkers. + */ + private static class ComplexCheckerFactory implements CheckerFactory { + public Checker tryCreate(String pattern) { + return new ComplexChecker(UDFLike.likePatternToRegExp(pattern)); } } - - @Override - public int getOutputColumn() { - return -1; - } - - @Override - public String getOutputType() { - return "boolean"; - } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index 3efc1b1..04e9508 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -54,7 +54,9 @@ private static byte[] mixedUpLower; private static byte[] mixedUpUpper; private static byte[] multiByte; - private static byte[] mixPercentPattern; + private static byte[] shortPhoneNumber; + private static byte[] longPhoneNumber; + private static byte[] incompletePhoneNumber; static { try { @@ -69,8 +71,10 @@ mixedUp = "mixedUp".getBytes("UTF-8"); mixedUpLower = "mixedup".getBytes("UTF-8"); mixedUpUpper = "MIXEDUP".getBytes("UTF-8"); - mixPercentPattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE multiByte = new byte[100]; + shortPhoneNumber = "0123-4567".getBytes("UTF-8"); + longPhoneNumber = "(012) 345-6789".getBytes("UTF-8"); + incompletePhoneNumber = "012-3456-".getBytes("UTF-8"); addMultiByteChars(multiByte); } catch (UnsupportedEncodingException e) { e.printStackTrace(); @@ -610,7 +614,7 @@ public void testStringLike() { Text pattern; int initialBatchSize; batch = makeStringBatchMixedCharSize(); - pattern = new Text(mixPercentPattern); + pattern = new Text("mix%"); FilterStringColLikeStringScalar expr = new FilterStringColLikeStringScalar(0, pattern); expr.evaluate(batch); @@ -656,50 +660,209 @@ public void testStringLike() { Assert.assertEquals(initialBatchSize, batch.size); } + @Test public void testStringLikePatternType() { FilterStringColLikeStringScalar expr; + VectorizedRowBatch batch; + + // BeginChecker + expr = new FilterStringColLikeStringScalar(0, new Text("mixed%")); + Assert.assertEquals("BeginChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // EndChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%Up")); + Assert.assertEquals("EndChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // MiddleChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%xed%")); + Assert.assertEquals("MiddleChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); - // BEGIN pattern - expr = new FilterStringColLikeStringScalar(0, new Text("abc%")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.BEGIN, - expr.getType()); - - // END pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.END, - expr.getType()); - - // MIDDLE pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc%")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.MIDDLE, - expr.getType()); - - // COMPLEX pattern - expr = new FilterStringColLikeStringScalar(0, new Text("%abc%de")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.COMPLEX, - expr.getType()); - - // NONE pattern - expr = new FilterStringColLikeStringScalar(0, new Text("abc")); - Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.NONE, - expr.getType()); + // ComplexChecker + expr = new FilterStringColLikeStringScalar(0, new Text("%ix%Up")); + Assert.assertEquals("ComplexChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // NoneChecker + expr= new FilterStringColLikeStringScalar(0, new Text("mixedUp")); + Assert.assertEquals("NoneChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); } - public void testStringLikeMultiByte() { + @Test + public void testStringLikeMultiByte() throws UnsupportedEncodingException { FilterStringColLikeStringScalar expr; VectorizedRowBatch batch; + byte[] percentBytes = "%".getBytes("UTF-8"); + byte[] xBytes = "x".getBytes("UTF-8"); // verify that a multi byte LIKE expression matches a matching string batch = makeStringBatchMixedCharSize(); - expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "%")); + Text text = new Text(); + text.append(percentBytes, 0, percentBytes.length); + text.append(multiByte, 0, 10); + text.append(percentBytes, 0, percentBytes.length); + expr = new FilterStringColLikeStringScalar(0, text); expr.evaluate(batch); - Assert.assertEquals(batch.size, 1); + Assert.assertEquals(1, batch.size); // verify that a multi byte LIKE expression doesn't match a non-matching string + text.clear(); + text.append(percentBytes, 0, percentBytes.length); + text.append(multiByte, 0, 10); + text.append(xBytes, 0, xBytes.length); batch = makeStringBatchMixedCharSize(); - expr = new FilterStringColLikeStringScalar(0, new Text("%" + multiByte + "x")); + expr = new FilterStringColLikeStringScalar(0, text); expr.evaluate(batch); - Assert.assertEquals(batch.size, 0); + Assert.assertEquals(0, batch.size); + } + + @Test + public void testStringRegExp() { + + // has nulls, not repeating + VectorizedRowBatch batch; + Text pattern; + int initialBatchSize; + batch = makeStringBatchMixedCharSize(); + pattern = new Text("mix.*"); + FilterStringColRegExpStringScalar expr = new FilterStringColRegExpStringScalar(0, pattern); + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // no nulls, not repeating + batch = makeStringBatchMixedCharSize(); + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // verify that the beginning entry is the only one that matches + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // has nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + + // same, but repeating value is null + batch = makeStringBatchMixedCharSize(); + batch.cols[0].isRepeating = true; + batch.cols[0].isNull[0] = true; + expr.evaluate(batch); + + // no rows qualify + Assert.assertEquals(0, batch.size); + + // no nulls, is repeating + batch = makeStringBatchMixedCharSize(); + initialBatchSize = batch.size; + batch.cols[0].isRepeating = true; + batch.cols[0].noNulls = true; + expr.evaluate(batch); + + // all rows qualify + Assert.assertEquals(initialBatchSize, batch.size); + } + + @Test + public void testStringRegExpPatternType() { + FilterStringColRegExpStringScalar expr; + VectorizedRowBatch batch; + + // BeginChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("mixed.*")); + Assert.assertEquals("BeginChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // EndChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*Up")); + Assert.assertEquals("EndChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // MiddleChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*xed.*")); + Assert.assertEquals("MiddleChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // ComplexChecker + expr = new FilterStringColRegExpStringScalar(0, new Text(".*ix.*Up")); + Assert.assertEquals("ComplexChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + + // PhoneNumberChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("\\(\\d\\d\\d\\) \\d\\d\\d-\\d\\d\\d\\d")); + Assert.assertEquals("PhoneNumberChecker", expr.getCheckerName()); + batch = makeStringBatchPhoneNumber(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(1, batch.selected[0]); + + // NoneChecker + expr = new FilterStringColRegExpStringScalar(0, new Text("mixedUp")); + Assert.assertEquals("NoneChecker", expr.getCheckerName()); + batch = makeStringBatchMixedCharSize(); + expr.evaluate(batch); + Assert.assertEquals(1, batch.size); + Assert.assertEquals(0, batch.selected[0]); + } + + private VectorizedRowBatch makeStringBatchPhoneNumber() { + + // create a new batch with one char column (for input) and one long column (for output) + VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); + BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = v; + LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[1] = outV; + + v.setRef(0, shortPhoneNumber, 0, shortPhoneNumber.length); + v.isNull[0] = false; + v.setRef(1, longPhoneNumber, 0, longPhoneNumber.length); + v.isNull[1] = false; + v.setRef(2, incompletePhoneNumber, 0, incompletePhoneNumber.length); + v.isNull[2] = true; + v.noNulls = false; + v.setRef(3, green, 0, green.length); + v.isNull[3] = false; + + batch.size = 4; + return batch; } @Test