diff --git itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java index 94af3e0..cccd810 100644 --- itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java +++ itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java @@ -13,6 +13,7 @@ */ package org.apache.hive.benchmark.vectorization; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -147,4 +148,21 @@ protected DoubleColumnVector getDoubleColumnVectorWithNull() { return columnVector; } + /** + * Builds a bytes column with {@code VectorizedRowBatch.DEFAULT_SIZE} rows in which every row is + * its own 64-byte slice of one shared backing array that is allocated here and never written. + */ + protected BytesColumnVector getBytesColumnVector() { + final int rowCount = VectorizedRowBatch.DEFAULT_SIZE; + final int width = 64; + final byte[] backing = new byte[width * rowCount]; + final BytesColumnVector result = new BytesColumnVector(rowCount); + for (int row = 0, offset = 0; row < rowCount; row++, offset += width) { + result.vector[row] = backing; + result.start[row] = offset; + result.length[row] = width; + } + return result; + } + } diff --git itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedBytesBench.java itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedBytesBench.java new file mode 100644 index 0000000..4c38530 --- /dev/null +++ itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedBytesBench.java @@ -0,0 +1,73 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hive.benchmark.vectorization; + +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.StringGroupColEqualStringGroupColumn; +import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.StringGroupColGreaterEqualStringGroupColumn; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +/** + * This test measures the performance for vectorization. + *

+ * This test uses the JMH framework for benchmarking. + * You may execute this benchmark tool using the JMH command line in different ways: + *

+ * To use the settings shown in the main() function, use: + * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedBytesBench + *

+ * To use the default settings used by JMH, use: + * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedBytesBench + *

+ * To specify different parameters, use: + * - The command below uses 10 warm-up iterations, 5 measurement iterations, and 2 forks, and + * displays the average time (avgt) in microseconds (us). + * - Benchmark mode. Available modes are: + * [Throughput/thrpt, AverageTime/avgt, SampleTime/sample, SingleShotTime/ss, All/all] + * - Output time unit. Available time units are: [m, s, ms, us, ns]. + *

+ * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedBytesBench + * -wi 10 -i 5 -f 2 -bm avgt -tu us + */ +@State(Scope.Benchmark) +public class VectorizedBytesBench { + public static class StringColEqualStringColumnBench extends AbstractExpression { /* Sets up StringGroupColEqualStringGroupColumn over two bytes columns. */ + @Override + public void setup() { + rowBatch = buildRowBatch(new LongColumnVector(), 2, getBytesColumnVector(), getBytesColumnVector()); /* NOTE(review): presumably (output vector, number of input columns, input vectors...) - confirm against AbstractExpression.buildRowBatch */ + rowBatch.size = 128; /* NOTE(review): presumably limits evaluation to the first 128 rows of the batch - confirm */ + expression = new StringGroupColEqualStringGroupColumn(0, 1, 2); /* NOTE(review): arguments look like (left input column, right input column, output column) - confirm */ + } + } + + public static class StringColGreaterEqualStringColumnBench extends AbstractExpression { /* Same setup as the equality bench above, but with StringGroupColGreaterEqualStringGroupColumn. */ + @Override + public void setup() { + rowBatch = buildRowBatch(new LongColumnVector(), 2, getBytesColumnVector(), getBytesColumnVector()); /* NOTE(review): presumably (output vector, number of input columns, input vectors...) - confirm against AbstractExpression.buildRowBatch */ + rowBatch.size = 128; /* NOTE(review): presumably limits evaluation to the first 128 rows of the batch - confirm */ + expression = new StringGroupColGreaterEqualStringGroupColumn(0, 1, 2); /* NOTE(review): arguments look like (left input column, right input column, output column) - confirm */ + } + } + + public static void main(String[] args) throws RunnerException { /* Runs every JMH benchmark whose name matches ".*VectorizedBytesBench.*". */ + Options opt = new OptionsBuilder().include(".*" + VectorizedBytesBench.class.getSimpleName() + + ".*").build(); + new Runner(opt).run(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FastByteComparisons.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FastByteComparisons.java new file mode 100644 index 0000000..9f9d171 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FastByteComparisons.java @@ -0,0 +1,238 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import java.lang.reflect.Field; +import java.nio.ByteOrder; +import java.security.AccessController; +import java.security.PrivilegedAction; + +import com.google.common.primitives.UnsignedBytes; +import sun.misc.Unsafe; + +/** + * Utility code to do optimized byte-array comparison. + * This is borrowed from Hadoop's {@code FastByteComparisons} class, + * which is borrowed and slightly modified from Guava's {@code + * FastByteComparisons} class to be able to compare arrays that start at + * non-zero offsets. + */ +abstract class FastByteComparisons { + + /** + * Lexicographically compare two byte arrays, treating every byte as an unsigned value: returns 0 when the l1 bytes of b1 starting at s1 equal the l2 bytes of b2 starting at s2, a negative value when the first range sorts before the second, and a positive value otherwise. + */ + public static int compareTo(byte[] b1, int s1, int l1, byte[] b2, int s2, + int l2) { + return LexicographicalComparerHolder.BEST_COMPARER.compareTo( + b1, s1, l1, b2, s2, l2); + } + + + interface Comparer<T> { // T is the buffer type being compared (byte[] for both implementations in this file) + abstract public int compareTo(T buffer1, int offset1, int length1, + T buffer2, int offset2, int length2); + } + + private static Comparer<byte[]> lexicographicalComparerJavaImpl() { + return LexicographicalComparerHolder.PureJavaComparer.INSTANCE; + } + + + /** + * Provides a lexicographical comparer implementation; either a Java + * implementation or a faster implementation based on {@link Unsafe}. + * + *
<p>
Uses reflection to gracefully fall back to the Java implementation if + * {@code Unsafe} isn't available. + */ + static class LexicographicalComparerHolder { + static final String UNSAFE_COMPARER_NAME = + LexicographicalComparerHolder.class.getName() + "$UnsafeComparer"; + + static final Comparer<byte[]> BEST_COMPARER = getBestComparer(); + /** + * Returns the Unsafe-using Comparer, or falls back to the pure-Java + * implementation if unable to do so. + */ + static Comparer<byte[]> getBestComparer() { + try { + Class<?> theClass = Class.forName(UNSAFE_COMPARER_NAME); + + // yes, UnsafeComparer does implement Comparer<byte[]> + @SuppressWarnings("unchecked") + Comparer<byte[]> comparer = + (Comparer<byte[]>) theClass.getEnumConstants()[0]; + return comparer; + } catch (Throwable t) { // ensure we really catch *everything*: any failure to load or initialize UnsafeComparer simply selects the pure-Java fallback below + } + + return lexicographicalComparerJavaImpl(); + } + + enum PureJavaComparer implements Comparer<byte[]> { + INSTANCE; + + @Override + public int compareTo(byte[] buffer1, int offset1, int length1, + byte[] buffer2, int offset2, int length2) { + // Short circuit equal case: the very same range of the very same array + if (buffer1 == buffer2 && + offset1 == offset2 && + length1 == length2) { + return 0; + } + // Bring WritableComparator code local: walk both ranges in step and compare bytes as unsigned values (hence the & 0xff) + int end1 = offset1 + length1; + int end2 = offset2 + length2; + for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) { + int a = (buffer1[i] & 0xff); + int b = (buffer2[j] & 0xff); + if (a != b) { + return a - b; + } + } + return length1 - length2; + } + } + @SuppressWarnings("unused") // used via reflection + enum UnsafeComparer implements Comparer<byte[]> { + INSTANCE; + + static final Unsafe theUnsafe; + + /** The offset to the first element in a byte array. 
*/ + static final int BYTE_ARRAY_BASE_OFFSET; + + static { + theUnsafe = (Unsafe) AccessController.doPrivileged( + new PrivilegedAction<Object>() { + @Override + public Object run() { + try { + Field f = Unsafe.class.getDeclaredField("theUnsafe"); + f.setAccessible(true); + return f.get(null); + } catch (NoSuchFieldException e) { + // It doesn't matter what we throw; + // it's swallowed in getBestComparer(). The cause is kept only to aid debugging when this enum is loaded directly. + throw new Error(e); + } catch (IllegalAccessException e) { + throw new Error(e); + } + } + }); + + BYTE_ARRAY_BASE_OFFSET = theUnsafe.arrayBaseOffset(byte[].class); + + // sanity check - this should never fail + if (theUnsafe.arrayIndexScale(byte[].class) != 1) { + throw new AssertionError(); + } + } + + static final boolean littleEndian = + ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN); + + /** + * Returns true if x1 is less than x2, when both values are treated as + * unsigned. + */ + static boolean lessThanUnsigned(long x1, long x2) { + return (x1 + Long.MIN_VALUE) < (x2 + Long.MIN_VALUE); + } + + /** + * Lexicographically compare two arrays. The word-at-a-time reads go through {@code Unsafe} and are not bounds-checked, so the offsets and lengths must describe valid ranges of the buffers. + * + * @param buffer1 left operand + * @param buffer2 right operand + * @param offset1 Where to start comparing in the left buffer + * @param offset2 Where to start comparing in the right buffer + * @param length1 How much to compare from the left buffer + * @param length2 How much to compare from the right buffer + * @return 0 if equal, &lt; 0 if left is less than right, etc. + */ + @Override + public int compareTo(byte[] buffer1, int offset1, int length1, + byte[] buffer2, int offset2, int length2) { + // Short circuit equal case: the very same range of the very same array + if (buffer1 == buffer2 && + offset1 == offset2 && + length1 == length2) { + return 0; + } + int unit = 8; + int minLength = Math.min(length1, length2); + int minWords = minLength / unit; + int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET; + int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET; + + /* + * Compare 8 bytes at a time. 
Benchmarking shows comparing 8 bytes at a + * time is no slower than comparing 4 bytes at a time even on 32-bit. + * On the other hand, it is substantially faster on 64-bit. + */ + for (int i = 0; i < minWords * unit; i += unit) { + long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i); + long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i); + long diff = lw ^ rw; + + if (diff != 0) { + if (!littleEndian) { + return lessThanUnsigned(lw, rw) ? -1 : 1; + } + + // Use binary search to find n, the bit offset of the lowest-order differing byte; on little-endian that is the first differing byte in array order + int n = 0; + int y; + int x = (int) diff; + if (x == 0) { + x = (int) (diff >>> 32); + n = 32; + } + + y = x << 16; + if (y == 0) { + n += 16; + } else { + x = y; + } + + y = x << 8; + if (y == 0) { + n += 8; + } + return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL)); + } + } + + // The epilogue to cover the last (minLength % 8) elements. + for (int i = minWords * unit; i < minLength; i++) { + int result = UnsignedBytes.compare( + buffer1[offset1 + i], + buffer2[offset2 + i]); + if (result != 0) { + return result; + } + } + return length1 - length2; + } + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java index 90817a5..c177a27 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java @@ -35,15 +35,7 @@ * positive if arg1 > arg2. */ public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) { - for (int i = 0; i < len1 && i < len2; i++) { - // Note the "& 0xff" is just a way to convert unsigned bytes to signed integer. 
- int b1 = arg1[i + start1] & 0xff; - int b2 = arg2[i + start2] & 0xff; - if (b1 != b2) { - return b1 - b2; - } - } - return len1 - len2; + return FastByteComparisons.compareTo(arg1, start1, len1, arg2, start2, len2); } /* Determine if two strings are equal from two byte arrays each @@ -53,48 +45,9 @@ public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int st */ public static boolean equal(byte[] arg1, final int start1, final int len1, byte[] arg2, final int start2, final int len2) { - if (len1 != len2) { - return false; - } - if (len1 == 0) { - return true; - } - - // do bounds check for OOB exception - if (arg1[start1] != arg2[start2] - || arg1[start1 + len1 - 1] != arg2[start2 + len2 - 1]) { - return false; - } - - if (len1 == len2) { - // prove invariant to the compiler: len1 = len2 - // all array access between (start1, start1+len1) - // and (start2, start2+len2) are valid - // no more OOB exceptions are possible - final int step = 8; - final int remainder = len1 % step; - final int wlen = len1 - remainder; - // suffix first - for (int i = wlen; i < len1; i++) { - if (arg1[start1 + i] != arg2[start2 + i]) { - return false; - } - } - // SIMD loop - for (int i = 0; i < wlen; i += step) { - final int s1 = start1 + i; - final int s2 = start2 + i; - boolean neq = false; - for (int j = 0; j < step; j++) { - neq = (arg1[s1 + j] != arg2[s2 + j]) || neq; - } - if (neq) { - return false; - } - } - } - - return true; + // Strings of different lengths can never be equal, so check the lengths first and only scan bytes when they match. + return len1 == len2 + && FastByteComparisons.compareTo(arg1, start1, len1, arg2, start2, len2) == 0; } public static int characterCount(byte[] bytes) { diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestFastByteComparisons.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestFastByteComparisons.java new file mode 100644 index 0000000..b0af65c --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestFastByteComparisons.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TestFastByteComparisons { + @Test + public void testUnsafeComparer() { /* Checks that UnsafeComparer and PureJavaComparer agree on the sign of the result for several string pairs; every call uses offset 0 and the full array length. */ + FastByteComparisons.Comparer unsafeComparer = + FastByteComparisons.LexicographicalComparerHolder.UnsafeComparer.INSTANCE; + FastByteComparisons.Comparer pureJavaComparer = + FastByteComparisons.LexicographicalComparerHolder.PureJavaComparer.INSTANCE; + + byte[] a; + byte[] b; + + a = "The quick brown fox jumps over the lazy dog".getBytes(); /* reference string, reused as the left operand in every case */ + b = "The quick brown fox jumps over the lazy dog".getBytes(); /* case 1: same content as a, held in a different array */ + assertEquals(Integer.signum(pureJavaComparer.compareTo(a, 0, a.length, b, 0, b.length)), + Integer.signum(unsafeComparer.compareTo(a, 0, a.length, b, 0, b.length))); + + b = "The quick brown fox jumps over the lazy elephant".getBytes(); /* case 2: differs from a only in the last word and is longer */ + assertEquals(Integer.signum(pureJavaComparer.compareTo(a, 0, a.length, b, 0, b.length)), + Integer.signum(unsafeComparer.compareTo(a, 0, a.length, b, 0, b.length))); + + b = "The quick green fox jumps over the lazy dog".getBytes(); /* case 3: same length as a, differs in the middle */ + assertEquals(Integer.signum(pureJavaComparer.compareTo(a, 0, a.length, b, 0, b.length)), 
+ Integer.signum(unsafeComparer.compareTo(a, 0, a.length, b, 0, b.length))); + + b = "Le quick green fox jumps over the lazy dog".getBytes(); /* case 4: differs from a at the very first byte */ + assertEquals(Integer.signum(pureJavaComparer.compareTo(a, 0, a.length, b, 0, b.length)), + Integer.signum(unsafeComparer.compareTo(a, 0, a.length, b, 0, b.length))); + } +} \ No newline at end of file