diff --git itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
index 94af3e0..879b437 100644
--- itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
+++ itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
@@ -13,6 +13,7 @@
  */
 package org.apache.hive.benchmark.vectorization;
 
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
@@ -35,7 +36,7 @@
 @BenchmarkMode(Mode.AverageTime)
 @Fork(1)
 @State(Scope.Thread)
-@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
 public abstract class AbstractExpression {
   private static final int DEFAULT_ITER_TIME = 1000000;
   protected VectorExpression expression;
@@ -59,6 +60,9 @@ protected VectorizedRowBatch buildRowBatch(ColumnVector output, int colNum, Colu
   @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
   public void bench() {
     for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+      rowBatch.selectedInUse = false;
+      rowBatch.size = VectorizedRowBatch.DEFAULT_SIZE;
+
       expression.evaluate(rowBatch);
     }
   }
@@ -147,4 +151,18 @@ protected DoubleColumnVector getDoubleColumnVectorWithNull() {
     return columnVector;
   }
 
+  protected BytesColumnVector getBytesColumnVector() {
+    BytesColumnVector columnVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    Random random = new Random();
+    int length = 16;
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      columnVector.vector[i] = new byte[length];
+      columnVector.start[i] = 0;
+      columnVector.length[i] = length;
+      for (int j = 0; j < length; j++) {
+        columnVector.vector[i][j] = (byte) (random.nextInt('c' - 'a' + 1) + 'a');
+      }
+    }
+    return columnVector;
+  }
 }
diff --git itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedLikeBench.java itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedLikeBench.java
new file mode 100644
index 0000000..136c01b
--- /dev/null
+++ itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedLikeBench.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.benchmark.vectorization;
+
+import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * This test measures the performance for vectorization.
+ *
+ * This test uses JMH framework for benchmarking.
+ * You may execute this benchmark tool using JMH command line in different ways:
+ *
+ * To use the settings shown in the main() function, use:
+ * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedLikeBench
+ *
+ * To use the default settings used by JMH, use:
+ * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedLikeBench
+ *
+ * To specify different parameters, use:
+ * - This command will use 10 warm-up iterations, 5 test iterations, and 2 forks. And it will
+ *   display the Average Time (avgt) in Microseconds (us)
+ * - Benchmark mode. Available modes are:
+ *   [Throughput/thrpt, AverageTime/avgt, SampleTime/sample, SingleShotTime/ss, All/all]
+ * - Output time unit. Available time units are: [m, s, ms, us, ns].
+ *
+ * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedLikeBench
+ *  -wi 10 -i 5 -f 2 -bm avgt -tu us
+ */
+@State(Scope.Benchmark)
+public class VectorizedLikeBench {
+  public static class FilterStringColLikeStringScalarBench extends AbstractExpression {
+    @Override
+    public void setup() {
+      rowBatch = buildRowBatch(null, 1, getBytesColumnVector());
+      expression = new FilterStringColLikeStringScalar(0, "%aabb%".getBytes(StandardCharsets.UTF_8));
+    }
+  }
+
+  public static void main(String[] args) throws RunnerException {
+    Options opt = new OptionsBuilder().include(".*" + VectorizedLikeBench.class.getSimpleName() +
+        ".*").build();
+    new Runner(opt).run();
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
index c50af8d..85609c7 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
@@ -21,9 +21,9 @@
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
-import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.StringTokenizer;
@@ -296,16 +296,10 @@ public boolean check(byte[] byteS, int start, int len) {
    * Matches the middle of each string to its pattern.
    */
   protected static final class MiddleChecker implements Checker {
-    final byte[] byteSub;
-    final int lenSub;
+    final StringExpr.Finder finder;
 
     MiddleChecker(String pattern) {
-      try {
-        byteSub = pattern.getBytes("UTF-8");
-        lenSub = byteSub.length;
-      } catch (UnsupportedEncodingException e) {
-        throw new RuntimeException(e);
-      }
+      finder = StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
     }
 
     public boolean check(byte[] byteS, int start, int len) {
@@ -316,16 +310,7 @@ public boolean check(byte[] byteS, int start, int len) {
      * Returns absolute offset of the match
      */
     public int index(byte[] byteS, int start, int len) {
-      if (len < lenSub) {
-        return -1;
-      }
-      int end = start + len - lenSub + 1;
-      for (int i = start; i < end; i++) {
-        if (StringExpr.equal(byteSub, 0, lenSub, byteS, i, lenSub)) {
-          return i;
-        }
-      }
-      return -1;
+      return finder.find(byteS, start, len);
     }
   }
 
@@ -469,7 +454,7 @@ public boolean check(byte[] byteS, int start, int len) {
     CharBuffer charBuffer;
 
     public FastUTF8Decoder() {
-      decoder = Charset.forName("UTF-8").newDecoder()
+      decoder = StandardCharsets.UTF_8.newDecoder()
          .onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE);
       byteBuffer = ByteBuffer.allocate(4);
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
index ad9f75e..6f58160 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
@@ -18,10 +18,9 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
-import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.UDFLike;
-import org.apache.hadoop.io.Text;
+import java.nio.charset.StandardCharsets;
 
 import java.util.Arrays;
 import java.util.List;
 import java.util.regex.Matcher;
@@ -45,13 +44,9 @@ public FilterStringColLikeStringScalar() {
     super();
   }
 
-  public FilterStringColLikeStringScalar(int colNum, byte[] likePattern) throws HiveException {
+  public FilterStringColLikeStringScalar(int colNum, byte[] likePattern) {
     super(colNum, null);
-    try {
-      super.setPattern(new String(likePattern, "UTF-8"));
-    } catch (Exception ex) {
-      throw new HiveException(ex);
-    }
+    super.setPattern(new String(likePattern, StandardCharsets.UTF_8));
   }
 
   @Override
diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
index 90817a5..f2ae9bc 100644
--- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
+++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
@@ -351,4 +351,64 @@ public static void rightTrimAndTruncate(BytesColumnVector outV, int i, byte[] by
       return Arrays.copyOf(bytes, j);
     }
   }
+
+  /*
+   * Compiles the given pattern with a proper algorithm.
+   */
+  public static Finder compile(byte[] pattern) {
+    return new BoyerMooreHorspool(pattern);
+  }
+
+  /*
+   * A finder finds the first index of its pattern in a given byte array.
+   * Its thread-safety depends on its implementation.
+   */
+  public interface Finder {
+    int find(byte[] input, int start, int len);
+  }
+
+  /*
+   * StringExpr uses Boyer Moore Horspool algorithm to find faster.
+   * It is thread-safe, because it holds final member instances only.
+   * See https://en.wikipedia.org/wiki/Boyer–Moore–Horspool_algorithm .
+   */
+  private static class BoyerMooreHorspool implements Finder {
+    private static final int MAX_BYTE = 0xff;
+    private final long[] shift = new long[MAX_BYTE];
+    private final byte[] pattern;
+    private final int plen;
+
+    public BoyerMooreHorspool(byte[] pattern) {
+      this.pattern = pattern;
+      this.plen = pattern.length;
+      Arrays.fill(shift, plen);
+      for (int i = 0; i < plen - 1; i++) {
+        shift[pattern[i] & MAX_BYTE] = plen - i - 1;
+      }
+    }
+
+    public int find(byte[] input, int start, int len) {
+      if (pattern.length == 0) {
+        return 0;
+      }
+
+      final int end = start + len;
+      int next = start + plen - 1;
+      final int plen = this.plen;
+      final byte[] pattern = this.pattern;
+      while (next < end) {
+        int s_tmp = next;
+        int p_tmp = plen - 1;
+        while (input[s_tmp] == pattern[p_tmp]) {
+          p_tmp--;
+          if (p_tmp < 0) {
+            return s_tmp;
+          }
+          s_tmp--;
+        }
+        next += shift[input[next] & MAX_BYTE];
+      }
+      return -1;
+    }
+  }
 }
diff --git storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
new file mode 100644
index 0000000..63c210a
--- /dev/null
+++ storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.*;
+
+public class TestStringExpr {
+  @Test
+  public void test() throws Exception {
+    StringExpr.Finder pattern = compile("pattern");
+    assertNotNull(pattern);
+
+    StringExpr.Finder patternOneChar = compile("g");
+    assertNotNull(patternOneChar);
+
+    StringExpr.Finder patternZero = compile("");
+    assertNotNull(patternZero);
+
+    String input1 = "string that contains a patterN...";
+    String input2 = "string that contains a pattern...";
+    String input3 = "pattern at the start of a string";
+    String input4 = "string that ends with a pattern";
+
+    assertEquals("Testing invalid match", -1, find(pattern, input1));
+    assertEquals("Testing valid match", 23, find(pattern, input2));
+    assertEquals("Testing single-character match", 5, find(patternOneChar, input1));
+    assertEquals("Testing zero-length pattern", 0, find(patternZero, input1));
+    assertEquals("Testing match at start of string", 0, find(pattern, input3));
+    assertEquals("Testing match at end of string", 24, find(pattern, input4));
+  }
+
+  private StringExpr.Finder compile(String pattern) {
+    return StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
+  }
+
+  private int find(StringExpr.Finder finder, String string) {
+    byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
+    return finder.find(bytes, 0, bytes.length);
+  }
+}
\ No newline at end of file
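
The StringExpr change above replaces MiddleChecker's position-by-position StringExpr.equal() scan with a Boyer-Moore-Horspool search behind the new Finder interface, and TestStringExpr exercises it through StringExpr.compile(). As a further illustration only (not part of the patch; the class and method names below are invented for the example), here is a minimal, self-contained sketch of the same skip-table idea, reusing one of the test's inputs:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class HorspoolSketch {
  // Returns the index of the first occurrence of needle in haystack, or -1.
  static int find(byte[] haystack, byte[] needle) {
    if (needle.length == 0) {
      return 0;
    }
    // Skip table: when the window's last byte mismatches (or the tail matches
    // but an earlier byte does not), slide the window by the distance from
    // that byte's last occurrence in the pattern to the pattern's end.
    int[] shift = new int[256];
    Arrays.fill(shift, needle.length);
    for (int i = 0; i < needle.length - 1; i++) {
      shift[needle[i] & 0xff] = needle.length - 1 - i;
    }
    int pos = 0;
    while (pos + needle.length <= haystack.length) {
      int j = needle.length - 1;
      while (j >= 0 && haystack[pos + j] == needle[j]) {
        j--;                                     // compare right to left
      }
      if (j < 0) {
        return pos;                              // every byte matched
      }
      pos += shift[haystack[pos + needle.length - 1] & 0xff];
    }
    return -1;
  }

  public static void main(String[] args) {
    byte[] text = "string that contains a pattern...".getBytes(StandardCharsets.UTF_8);
    byte[] pat = "pattern".getBytes(StandardCharsets.UTF_8);
    System.out.println(find(text, pat));         // prints 23, as asserted in TestStringExpr
  }
}

The precomputed table lets the search window jump up to pattern-length bytes on a mismatch, which is where the speed-up over the removed one-position-at-a-time loop comes from.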