From 6f1b41b272bce939c82e5be8580d9285d52a158c Mon Sep 17 00:00:00 2001 From: Gopal V Date: Sat, 28 May 2016 22:55:38 -0700 Subject: [PATCH] TestLazyTimeStamp fix --- .../hive/benchmark/serde/LazySimpleSerDeBench.java | 173 ++++++++++++++++++++- .../apache/hadoop/hive/serde2/lazy/LazyDate.java | 4 + .../hadoop/hive/serde2/lazy/LazyTimestamp.java | 4 + .../apache/hadoop/hive/serde2/lazy/LazyUtils.java | 12 ++ .../lazy/fast/LazySimpleDeserializeRead.java | 11 +- 5 files changed, 199 insertions(+), 5 deletions(-) diff --git itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java index a1b63d5..826bf53 100644 --- itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java +++ itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java @@ -15,16 +15,19 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.Date; import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef; import org.apache.hadoop.hive.serde2.lazy.LazyByte; +import org.apache.hadoop.hive.serde2.lazy.LazyDate; import org.apache.hadoop.hive.serde2.lazy.LazyDouble; import org.apache.hadoop.hive.serde2.lazy.LazyFloat; import org.apache.hadoop.hive.serde2.lazy.LazyInteger; import org.apache.hadoop.hive.serde2.lazy.LazyLong; import org.apache.hadoop.hive.serde2.lazy.LazyShort; +import org.apache.hadoop.hive.serde2.lazy.LazyTimestamp; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -53,15 +56,14 @@ * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.serde.LazySimpleSerDeBench *

*/ + public static final int DEFAULT_ITER_TIME = 1000000; + public static final int DEFAULT_DATA_SIZE = 4096; @BenchmarkMode(Mode.AverageTime) @Fork(1) @State(Scope.Thread) @OutputTimeUnit(TimeUnit.NANOSECONDS) public static abstract class AbstractDeserializer { - public static final int DEFAULT_ITER_TIME = 1000000; - - public static final int DEFAULT_DATA_SIZE = 4096; public int[] offsets = new int[DEFAULT_DATA_SIZE]; public int[] sizes = new int[DEFAULT_DATA_SIZE]; @@ -445,6 +447,171 @@ public void bench() { } } + @BenchmarkMode(Mode.AverageTime) + @Fork(1) + @State(Scope.Thread) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public static class GoodLazyDate { + + final LazyDate obj = new LazyDate( + LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR); + + public int[] offsets = new int[DEFAULT_DATA_SIZE]; + public int[] sizes = new int[DEFAULT_DATA_SIZE]; + protected final ByteArrayRef ref = new ByteArrayRef(); + + @Setup + public void setup() { + sizes = new int[DEFAULT_DATA_SIZE]; + offsets = new int[sizes.length]; + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + Random r = new Random(); + int len = 0; + final long base = -320000000L*1000L; // 1959 + for (int i = 0; i < DEFAULT_DATA_SIZE; i++) { + // -ve dates are also valid dates - the dates are within 1959 to 2027 + Date dt = new Date(base + (Math.abs(r.nextLong()) % (Integer.MAX_VALUE*1000L))); + byte[] ds = dt.toString().getBytes(); + sizes[i] = ds.length; + offsets[i] = len; + len += ds.length; + try { + bos.write(ds); + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + ref.setData(bos.toByteArray()); + } + + @Benchmark + @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS) + @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS) + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class 
RandomLazyDate extends RandomDataInitializer { + + final LazyDate obj = new LazyDate( + LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR); + + public RandomLazyDate() { + super(4); + } + + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyDate extends RandomDataInitializer { + + final LazyDate obj = new LazyDate( + LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR); + + public WorstLazyDate() { + super(8); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + @BenchmarkMode(Mode.AverageTime) + @Fork(1) + @State(Scope.Thread) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public static class GoodLazyTimestamp { + + final LazyTimestamp obj = new LazyTimestamp( + LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR); + + public int[] offsets = new int[DEFAULT_DATA_SIZE]; + public int[] sizes = new int[DEFAULT_DATA_SIZE]; + protected final ByteArrayRef ref = new ByteArrayRef(); + + @Setup + public void setup() { + sizes = new int[DEFAULT_DATA_SIZE]; + offsets = new int[sizes.length]; + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + Random r = new Random(); + int len = 0; + final long base = -320000000L * 1000L; // 1959 + for (int i = 0; i < DEFAULT_DATA_SIZE; i++) { + // -ve dates are also valid Timestamps - dates are within 1959 to 2027 + Date dt = new Date(base + (Math.abs(r.nextLong()) % (Integer.MAX_VALUE * 1000L))); + byte[] ds = String.format("%s 00:00:01", dt.toString()).getBytes(); + sizes[i] = ds.length; + offsets[i] = len; + len += ds.length; + try { + bos.write(ds); + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + ref.setData(bos.toByteArray()); + } + + @Benchmark + @Warmup(iterations = 2, time = 2, 
timeUnit = TimeUnit.MILLISECONDS) + @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS) + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class RandomLazyTimestamp extends RandomDataInitializer { + + final LazyTimestamp obj = new LazyTimestamp( + LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR); + + public RandomLazyTimestamp() { + super(4); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + + public static class WorstLazyTimestamp extends RandomDataInitializer { + + final LazyTimestamp obj = new LazyTimestamp( + LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR); + + public WorstLazyTimestamp() { + super(8); + } + + @Override + public void bench() { + for (int i = 0; i < DEFAULT_ITER_TIME; i++) { + obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]); + } + } + } + public static void main(String[] args) throws RunnerException { Options opt = new OptionsBuilder().include( ".*" + LazySimpleSerDeBench.class.getSimpleName() + ".*").build(); diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java index 0579ff2..c00faac 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java @@ -59,6 +59,10 @@ public LazyDate(LazyDate copy) { @Override public void init(ByteArrayRef bytes, int start, int length) { String s = null; + if (!LazyUtils.isDateMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { s = Text.decode(bytes.getData(), start, length); data.set(Date.valueOf(s)); diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java 
serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java index 8f0c3d2..56945d1 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java @@ -59,6 +59,10 @@ public LazyTimestamp(LazyTimestamp copy) { @Override public void init(ByteArrayRef bytes, int start, int length) { String s = null; + if (!LazyUtils.isDateMaybe(bytes.getData(), start, length)) { + isNull = true; + return; + } try { s = new String(bytes.getData(), start, length, "US-ASCII"); } catch (UnsupportedEncodingException e) { diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java index 6d7369b..73c72e1 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java @@ -109,6 +109,18 @@ public static boolean isNumberMaybe(byte[] buf, int offset, int len) { } /** + * Returns false when the bytes definitely cannot be parsed into a date/timestamp. + * + * The four-digit year (Y2K) and dash requirements mean the string must be at least + * yyyy-m-d = 8 bytes long; a Timestamp needs to be at least 1 byte longer than that, + * so the Date length check is necessary, but not sufficient, for a Timestamp. + */ + public static boolean isDateMaybe(byte[] buf, int offset, int len) { + // maybe valid - too expensive to check without a parse + return len >= 8; + } + + /** * Returns -1 if the first byte sequence is lexicographically less than the * second; returns +1 if the second byte sequence is lexicographically less * than the first; otherwise return 0.
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index 7e9f94e..765ba7e 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -21,6 +21,7 @@ import java.io.UnsupportedEncodingException; import java.nio.charset.CharacterCodingException; import java.sql.Date; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -381,7 +382,7 @@ public boolean readCheckNull() { break; case DATE: { - if (fieldLength == 0) { + if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) { return true; } String s = null; @@ -396,7 +397,7 @@ public boolean readCheckNull() { break; case TIMESTAMP: { - if (fieldLength == 0) { + if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) { return true; } String s = null; @@ -425,6 +426,9 @@ public boolean readCheckNull() { break; case INTERVAL_YEAR_MONTH: { + if (fieldLength == 0) { + return true; + } String s = null; try { s = Text.decode(bytes, fieldStart, fieldLength); @@ -437,6 +441,9 @@ public boolean readCheckNull() { break; case INTERVAL_DAY_TIME: { + if (fieldLength == 0) { + return true; + } String s = null; try { s = Text.decode(bytes, fieldStart, fieldLength); -- 2.4.0