diff --git itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
index 826bf53..5dbc571 100644
--- itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
+++ itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
@@ -15,6 +15,7 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Date;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
@@ -28,6 +29,7 @@
 import org.apache.hadoop.hive.serde2.lazy.LazyLong;
 import org.apache.hadoop.hive.serde2.lazy.LazyShort;
 import org.apache.hadoop.hive.serde2.lazy.LazyTimestamp;
+import org.apache.hadoop.hive.serde2.lazy.fast.StringToDouble;
 import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
@@ -62,7 +64,7 @@
   @BenchmarkMode(Mode.AverageTime)
   @Fork(1)
   @State(Scope.Thread)
-  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  @OutputTimeUnit(TimeUnit.MILLISECONDS)
   public static abstract class AbstractDeserializer {
 
     public int[] offsets = new int[DEFAULT_DATA_SIZE];
@@ -449,8 +451,39 @@ public void bench() {
 
   @BenchmarkMode(Mode.AverageTime)
   @Fork(1)
+  @OutputTimeUnit(TimeUnit.MILLISECONDS)
+  @Warmup(iterations = 4, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+  @Measurement(iterations = 4, time = 2, timeUnit = TimeUnit.MILLISECONDS)
   @State(Scope.Thread)
-  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public static class ParseDouble {
+    // 15 digits in 16 bytes: short enough for the hand-written path of StringToDouble
+    byte[] bytes = "1234567890.12345".getBytes(StandardCharsets.UTF_8);
+
+    // strtod on the raw bytes; the sum is returned so JMH consumes every parsed value
+    @Benchmark
+    public double floatingDecimalBench() {
+      double sum = 0;
+      for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+        sum += StringToDouble.strtod(bytes, 0, bytes.length);
+      }
+      return sum;
+    }
+
+    // JDK baseline: decode the bytes to a String, then Double.parseDouble
+    @Benchmark
+    public double doubleBench() {
+      double sum = 0;
+      for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+        sum += Double.parseDouble(new String(bytes, 0, bytes.length, StandardCharsets.UTF_8));
+      }
+      return sum;
+    }
+  }
+
+  @BenchmarkMode(Mode.AverageTime)
+  @Fork(1)
+  @State(Scope.Thread)
+  @OutputTimeUnit(TimeUnit.MILLISECONDS)
   public static class GoodLazyDate {
 
     final LazyDate obj = new LazyDate(
@@ -533,7 +566,7 @@ public void bench() {
   @BenchmarkMode(Mode.AverageTime)
   @Fork(1)
   @State(Scope.Thread)
-  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  @OutputTimeUnit(TimeUnit.MILLISECONDS)
   public static class GoodLazyTimestamp {
 
     final LazyTimestamp obj = new LazyTimestamp(
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
index 264335c..05aa84f 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
@@ -446,9 +446,7 @@ public boolean readField(int fieldIndex) throws IOException {
           if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) {
             return false;
           }
-          currentDouble =
-              Double.parseDouble(
-                  new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8));
+          currentDouble = StringToDouble.strtod(bytes, fieldStart, fieldLength);
           return true;
         case STRING:
         case CHAR:
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/StringToDouble.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/StringToDouble.java
new file mode 100644
index 0000000..b9259e0
--- /dev/null
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/StringToDouble.java
@@ -0,0 +1,302 @@
+/*
+ * Source code for the "strtod" library procedure.
+ *
+ * Copyright 1988-1992 Regents of the University of California
+ * Permission to use, copy, modify, and distribute this
+ * software and its documentation for any purpose and without
+ * fee is hereby granted, provided that the above copyright
+ * notice appear in all copies. The University of California
+ * makes no representations about the suitability of this
+ * software for any purpose. It is provided "as is" without
+ * express or implied warranty.
+ */
+package org.apache.hadoop.hive.serde2.lazy.fast;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Decimal text to {@code double} conversion ported from the BSD "strtod" procedure.
+ *
+ * <p>The hand-written path is used only for short plain decimals: an optional leading
+ * sign, at most one decimal point, between 1 and 15 digits and no exponent. Every other
+ * input is delegated to {@link Double#parseDouble(String)}, so malformed text raises
+ * {@link NumberFormatException}.
+ */
+public final class StringToDouble {
+  /**
+   * Largest possible base 10 exponent. Any exponent larger than this will already
+   * produce underflow or overflow, so there's no need to worry about additional digits.
+   */
+  private static final int MAX_EXPONENT = 511;
+
+  /**
+   * Table giving binary powers of 10. Entry i is 10^2^i. Used to convert decimal
+   * exponents into floating-point numbers.
+   */
+  private static final double[] POWERS_OF_10 = {
+    10.,
+    100.,
+    1.0e4,
+    1.0e8,
+    1.0e16,
+    1.0e32,
+    1.0e64,
+    1.0e128,
+    1.0e256
+  };
+
+  private StringToDouble() {
+    // Static utility class; not meant to be instantiated.
+  }
+
+  /**
+   * Parses the decimal text in {@code s}.
+   *
+   * @param s text to parse; it is encoded as UTF-8 and passed to the byte-array overload
+   * @return the parsed value
+   * @throws NumberFormatException if {@link Double#parseDouble(String)} rejects the text
+   */
+  public static double strtod(String s) {
+    final byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
+    return strtod(utf8, 0, utf8.length);
+  }
+
+  /**
+   * Parses the decimal text stored in {@code utf8[offset, offset + length)}.
+   * Leading and trailing whitespace inside that range is ignored.
+   *
+   * @param utf8 buffer holding the UTF-8 encoded text
+   * @param offset index of the first byte of the field
+   * @param length number of bytes in the field
+   * @return the parsed value
+   * @throws NumberFormatException if {@link Double#parseDouble(String)} rejects the text
+   */
+  public static double strtod(byte[] utf8, int offset, int length) {
+    boolean signIsNegative;
+    boolean expSignIsNegative = false;
+    double fraction;
+    int d;
+    int p = offset;
+    int end = offset + length;
+    int c;
+    int exp = 0; /* Exponent read from "EX" field. */
+    int fracExp = 0; /* Exponent that derives from the fractional
+                      * part. Under normal circumstances, it is
+                      * the negative of the number of digits in F.
+                      * However, if I is very long, the last digits
+                      * of I get dropped (otherwise a long I with a
+                      * large negative exponent could cause an
+                      * unnecessary overflow on I alone). In this
+                      * case, fracExp is incremented one for each
+                      * dropped digit. */
+    int mantSize; /* Number of digits in mantissa. */
+    int decPt; /* Number of mantissa digits BEFORE decimal
+                * point. */
+    int pExp; /* Temporarily holds location of exponent
+               * in string. */
+
+    /*
+     * Strip off leading and trailing blanks.
+     */
+
+    while (p < end && Character.isWhitespace(utf8[p])) {
+      p++;
+    }
+    while (end > p && Character.isWhitespace(utf8[end - 1])) {
+      end--;
+    }
+
+    /*
+     * Anything that is not a short plain decimal goes to the JDK parser. Only the
+     * trimmed range [p, end) is inspected: bytes past it are not part of this field
+     * and may not even exist.
+     */
+
+    if (!testSimpleDecimal(utf8, p, end - p)) {
+      return Double.parseDouble(new String(utf8, p, end - p, StandardCharsets.UTF_8));
+    }
+
+    /*
+     * Check for a sign.
+     */
+
+    if (utf8[p] == '-') {
+      signIsNegative = true;
+      p += 1;
+    } else {
+      if (utf8[p] == '+') {
+        p += 1;
+      }
+      signIsNegative = false;
+    }
+
+    /*
+     * Count the number of digits in the mantissa (including the decimal
+     * point), and also locate the decimal point.
+     */
+
+    decPt = -1;
+    int mantEnd = end - p;
+    for (mantSize = 0; mantSize < mantEnd; mantSize += 1) {
+      c = utf8[p];
+      if (!isdigit(c)) {
+        if ((c != '.') || (decPt >= 0)) {
+          break;
+        }
+        decPt = mantSize;
+      }
+      p += 1;
+    }
+
+    /*
+     * Now suck up the digits in the mantissa. Use two integers to
+     * collect 9 digits each (this is faster than using floating-point).
+     * If the mantissa has more than 18 digits, ignore the extras, since
+     * they can't affect the value anyway.
+     */
+
+    pExp = p;
+    p -= mantSize;
+    if (decPt < 0) {
+      decPt = mantSize;
+    } else {
+      mantSize -= 1; /* One of the digits was the point. */
+    }
+    if (mantSize > 18) {
+      fracExp = decPt - 18;
+      mantSize = 18;
+    } else {
+      fracExp = decPt - mantSize;
+    }
+    if (mantSize == 0) {
+      if (signIsNegative) {
+        return -0.0d;
+      }
+      return 0.0d;
+    } else {
+      double frac1, frac2;
+      frac1 = 0;
+      for (; mantSize > 9; mantSize -= 1) {
+        c = utf8[p];
+        p += 1;
+        if (c == '.') {
+          c = utf8[p];
+          p += 1;
+        }
+        frac1 = 10 * frac1 + (c - '0');
+      }
+      frac2 = 0;
+      for (; mantSize > 0; mantSize -= 1) {
+        c = utf8[p];
+        p += 1;
+        if (c == '.') {
+          c = utf8[p];
+          p += 1;
+        }
+        frac2 = 10 * frac2 + (c - '0');
+      }
+      fraction = (1e9d * frac1) + frac2;
+    }
+
+    /*
+     * Skim off the exponent.
+     */
+
+    p = pExp;
+
+    if (p < end) {
+      if ((utf8[p] == 'E') || (utf8[p] == 'e')) {
+        p += 1;
+        if (p < end) {
+          if (utf8[p] == '-') {
+            expSignIsNegative = true;
+            p += 1;
+          } else {
+            if (utf8[p] == '+') {
+              p += 1;
+            }
+            expSignIsNegative = false;
+          }
+          while (p < end && isdigit(utf8[p])) {
+            exp = exp * 10 + (utf8[p] - '0');
+            p += 1;
+          }
+        }
+      }
+    }
+    if (expSignIsNegative) {
+      exp = fracExp - exp;
+    } else {
+      exp = fracExp + exp;
+    }
+
+    /*
+     * Generate a floating-point number that represents the exponent.
+     * Do this by processing the exponent one bit at a time to combine
+     * many powers of 2 of 10. Then combine the exponent with the
+     * fraction.
+     */
+
+    if (exp < 0) {
+      expSignIsNegative = true;
+      exp = -exp;
+    } else {
+      expSignIsNegative = false;
+    }
+    if (exp > MAX_EXPONENT) {
+      exp = MAX_EXPONENT;
+    }
+
+    double dblExp = 1.0;
+    for (d = 0; exp != 0; exp >>= 1, d += 1) {
+      if ((exp & 1) == 1) {
+        dblExp *= POWERS_OF_10[d];
+      }
+    }
+
+    if (expSignIsNegative) {
+      fraction /= dblExp;
+    } else {
+      fraction *= dblExp;
+    }
+
+    if (signIsNegative) {
+      return -fraction;
+    }
+    return fraction;
+  }
+
+  /**
+   * Tells whether {@code utf8[off, off + len)} is a plain decimal the hand-written path
+   * may convert: an optional sign in the first position, at most one decimal point,
+   * between 1 and 15 digits, and nothing else.
+   */
+  private static boolean testSimpleDecimal(byte[] utf8, int off, int len) {
+    if (len <= 0 || len > 18) {
+      return false;
+    }
+    int decimalPts = 0;
+    int nondigits = 0;
+    int digits = 0;
+    for (int i = off; i < (off + len); i++) {
+      final int c = utf8[i];
+      if (c == '.') {
+        decimalPts++;
+      } else if ((c == '-' || c == '+') && i == off) {
+        // a sign is only accepted in front of the mantissa
+        continue;
+      } else if (!isdigit(c)) {
+        // could be exponential notations
+        nondigits++;
+      } else {
+        digits++;
+      }
+    }
+    // Longer mantissas go to the JDK parser: there can be up to 5e-16 error
+    return (decimalPts <= 1 && nondigits == 0 && digits > 0 && digits < 16);
+  }
+
+  private static boolean isdigit(int c) {
+    return '0' <= c && c <= '9';
+  }
+}
diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazy/fast/TestStringToDouble.java serde/src/test/org/apache/hadoop/hive/serde2/lazy/fast/TestStringToDouble.java
new file mode 100644
index 0000000..4de16ef
--- /dev/null
+++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/fast/TestStringToDouble.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.lazy.fast;
+
+import org.junit.Test;
+
+import java.util.Random;
+
+import static org.junit.Assert.*;
+
+public class TestStringToDouble {
+  int iter = 10000;
+
+  @Test
+  public void testFullRandom() throws Exception {
+    Random random = new Random();
+    for (int i = 0; i < iter; i++) {
+      double d = Double.longBitsToDouble(random.nextLong());
+      String s = Double.toString(d);
+      assertEquals(s, d, Double.parseDouble(s), Double.MIN_VALUE);
+      assertEquals(s, d, StringToDouble.strtod(s), Double.MIN_VALUE);
+    }
+  }
+
+  @Test
+  public void testRandomBetween0And1() throws Exception {
+    Random random = new Random();
+    for (int i = 0; i < iter; i++) {
+      for (int j = 1; j < 18; j++) {
+        StringBuilder builder = new StringBuilder("0.1");
+        for (int k = 0; k < j; k++) {
+          builder.append((char) (random.nextInt(10) + '0'));
+        }
+        String s = builder.toString();
+        double d = Double.parseDouble(s);
+        assertEquals(s, d, Double.parseDouble(s), Double.MIN_VALUE);
+        assertEquals(s, d, StringToDouble.strtod(s), Double.MIN_VALUE);
+      }
+    }
+  }
+
+  @Test
+  public void testRandomInteger() throws Exception {
+    Random random = new Random();
+    for (int i = 0; i < iter; i++) {
+      for (int j = 1; j < 18; j++) {
+        StringBuilder builder = new StringBuilder();
+        for (int k = 0; k < j; k++) {
+          builder.append((char) (random.nextInt(10) + '0'));
+        }
+        String s = builder.toString();
+        double d = Double.parseDouble(s);
+        assertEquals(s, d, Double.parseDouble(s), Double.MIN_VALUE);
+        assertEquals(s, d, StringToDouble.strtod(s), Double.MIN_VALUE);
+      }
+    }
+  }
+
+  @Test
+  public void testSurroundingWhitespace() throws Exception {
+    // the padded field ends exactly at the end of the buffer
+    assertEquals(1.5, StringToDouble.strtod(" 1.5"), 0.0);
+    assertEquals(-2.25, StringToDouble.strtod("\t-2.25 "), 0.0);
+  }
+
+  @Test
+  public void testMalformedInputIsRejected() throws Exception {
+    String[] malformed = {"", " ", "-", "+", ".", "1-2", "1.2.3", "--1"};
+    for (String s : malformed) {
+      try {
+        StringToDouble.strtod(s);
+        fail("expected NumberFormatException for \"" + s + "\"");
+      } catch (NumberFormatException expected) {
+        // same outcome as Double.parseDouble
+      }
+    }
+  }
+}