diff --git ql/pom.xml ql/pom.xml index 3632a5efe4..aa23d5e6e8 100644 --- ql/pom.xml +++ ql/pom.xml @@ -98,6 +98,11 @@ hive-spark-client ${project.version} + + org.apache.hive + hive-storage-api + ${project.version} + com.esotericsoftware
diff --git serde/pom.xml serde/pom.xml index 9f1b146d02..e97551bbeb 100644 --- serde/pom.xml +++ serde/pom.xml @@ -49,6 +49,11 @@ hive-shims ${project.version} + + org.apache.hive + hive-storage-api + ${project.version} + com.google.code.findbugs
diff --git storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java
new file mode 100644
index 0000000000..c73d02eed0
--- /dev/null
+++ storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.common.type;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.TimeZone;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Conversion utilities from the hybrid Julian/Gregorian calendar to/from the
+ * proleptic Gregorian.
+ *
+ * The semantics here are to hold the string representation constant and change
+ * the epoch offset rather than holding the instant in time constant and change
+ * the string representation.
+ *
+ * These utilities will be fast for the common case (> 1582 AD), but slow for
+ * old dates.
+ */
+public final class CalendarUtils {
+
+  /**
+   * Epoch millis of 1582-10-15 00:00:00 UTC. Values at or after this point are
+   * left unchanged by the conversion methods of this class.
+   */
+  public static final long SWITCHOVER_MILLIS;
+  /** {@link #SWITCHOVER_MILLIS} expressed in whole epoch days. */
+  public static final long SWITCHOVER_DAYS;
+
+  /**
+   * Create a UTC formatter that owns its calendar instance.
+   * @param fmt the SimpleDateFormat pattern to use
+   * @param proleptic true for the proleptic Gregorian calendar, false for the
+   *                  default hybrid Julian/Gregorian calendar
+   * @return a new formatter, which must not be shared between threads
+   */
+  private static SimpleDateFormat createFormatter(String fmt, boolean proleptic) {
+    SimpleDateFormat result = new SimpleDateFormat(fmt);
+    GregorianCalendar calendar = new GregorianCalendar(UTC);
+    if (proleptic) {
+      calendar.setGregorianChange(new Date(Long.MIN_VALUE));
+    }
+    result.setCalendar(calendar);
+    return result;
+  }
+
+  private static final String DATE = "yyyy-MM-dd";
+  private static final String TIME = DATE + " HH:mm:ss.SSS";
+  private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+  // SimpleDateFormat is not thread-safe, so each thread gets its own instances.
+  private static final ThreadLocal<SimpleDateFormat> HYBRID_DATE_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(DATE, false));
+  private static final ThreadLocal<SimpleDateFormat> HYBRID_TIME_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(TIME, false));
+
+  private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_DATE_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(DATE, true));
+  private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_TIME_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(TIME, true));
+
+  static {
+    // 1582-10-15 is the first day of the Gregorian calendar in the hybrid
+    // calendar, so both calendars agree from this day onwards.
+    try {
+      SWITCHOVER_MILLIS = HYBRID_DATE_FORMAT.get().parse("1582-10-15").getTime();
+      SWITCHOVER_DAYS = TimeUnit.MILLISECONDS.toDays(SWITCHOVER_MILLIS);
+    } catch (ParseException e) {
+      throw new IllegalArgumentException("Can't parse switch over date", e);
+    }
+  }
+
+  /**
+   * Convert an epoch day from the hybrid Julian/Gregorian calendar to the
+   * proleptic Gregorian.
+   * @param hybrid day of epoch in the hybrid Julian/Gregorian
+   * @return day of epoch in the proleptic Gregorian
+   */
+  public static int convertDateToProleptic(int hybrid) {
+    int proleptic = hybrid;
+    if (hybrid < SWITCHOVER_DAYS) {
+      String dateStr = HYBRID_DATE_FORMAT.get().format(
+          new Date(TimeUnit.DAYS.toMillis(hybrid)));
+      try {
+        proleptic = (int) TimeUnit.MILLISECONDS.toDays(
+            PROLEPTIC_DATE_FORMAT.get().parse(dateStr).getTime());
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return proleptic;
+  }
+
+  /**
+   * Convert an epoch day from the proleptic Gregorian calendar to the hybrid
+   * Julian/Gregorian.
+   * @param proleptic day of epoch in the proleptic Gregorian
+   * @return day of epoch in the hybrid Julian/Gregorian
+   */
+  public static int convertDateToHybrid(int proleptic) {
+    int hybrid = proleptic;
+    if (proleptic < SWITCHOVER_DAYS) {
+      String dateStr = PROLEPTIC_DATE_FORMAT.get().format(
+          new Date(TimeUnit.DAYS.toMillis(proleptic)));
+      try {
+        hybrid = (int) TimeUnit.MILLISECONDS.toDays(
+            HYBRID_DATE_FORMAT.get().parse(dateStr).getTime());
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return hybrid;
+  }
+
+  /**
+   * Convert an epoch day between the two calendars.
+   * @param original day of epoch in the source calendar
+   * @param fromProleptic true if original is in the proleptic Gregorian
+   * @param toProleptic true if the result should be in the proleptic Gregorian
+   * @return the converted day of epoch, or original if both calendars match
+   */
+  public static int convertDate(int original,
+                                boolean fromProleptic,
+                                boolean toProleptic) {
+    if (fromProleptic != toProleptic) {
+      return toProleptic
+          ? convertDateToProleptic(original)
+          : convertDateToHybrid(original);
+    } else {
+      return original;
+    }
+  }
+
+  /**
+   * Convert epoch millis between the two calendars.
+   * @param original millis of epoch in the source calendar
+   * @param fromProleptic true if original is in the proleptic Gregorian
+   * @param toProleptic true if the result should be in the proleptic Gregorian
+   * @return the converted millis of epoch, or original if both calendars match
+   */
+  public static long convertTime(long original,
+                                 boolean fromProleptic,
+                                 boolean toProleptic) {
+    if (fromProleptic != toProleptic) {
+      return toProleptic
+          ? convertTimeToProleptic(original)
+          : convertTimeToHybrid(original);
+    } else {
+      return original;
+    }
+  }
+
+  /**
+   * Convert epoch millis from the hybrid Julian/Gregorian calendar to the
+   * proleptic Gregorian.
+   * @param hybrid millis of epoch in the hybrid Julian/Gregorian
+   * @return millis of epoch in the proleptic Gregorian
+   */
+  public static long convertTimeToProleptic(long hybrid) {
+    long proleptic = hybrid;
+    if (hybrid < SWITCHOVER_MILLIS) {
+      String dateStr = HYBRID_TIME_FORMAT.get().format(new Date(hybrid));
+      try {
+        proleptic = PROLEPTIC_TIME_FORMAT.get().parse(dateStr).getTime();
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return proleptic;
+  }
+
+  /**
+   * Convert epoch millis from the proleptic Gregorian calendar to the hybrid
+   * Julian/Gregorian.
+   * @param proleptic millis of epoch in the proleptic Gregorian
+   * @return millis of epoch in the hybrid Julian/Gregorian
+   */
+  public static long convertTimeToHybrid(long proleptic) {
+    long hybrid = proleptic;
+    if (proleptic < SWITCHOVER_MILLIS) {
+      String dateStr = PROLEPTIC_TIME_FORMAT.get().format(new Date(proleptic));
+      try {
+        hybrid = HYBRID_TIME_FORMAT.get().parse(dateStr).getTime();
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return hybrid;
+  }
+
+  /**
+   * Format an epoch day as a yyyy-MM-dd date string according to the proleptic
+   * or the hybrid calendar.
+   * @param epochDay epoch day
+   * @param useProleptic if true - uses proleptic formatter, else uses hybrid formatter
+   * @return formatted date
+   */
+  public static String formatDate(long epochDay, boolean useProleptic) {
+    long millis = TimeUnit.DAYS.toMillis(epochDay);
+    return useProleptic ? PROLEPTIC_DATE_FORMAT.get().format(millis)
+        : HYBRID_DATE_FORMAT.get().format(millis);
+  }
+
+  private CalendarUtils() {
+    throw new UnsupportedOperationException();
+  }
+}
diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java
index 3dac667f5d..281860d9db 100644
--- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java
+++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java
@@ -17,10 +17,11 @@
  */
 package org.apache.hadoop.hive.ql.exec.vector;
 
+import org.apache.hadoop.hive.common.type.CalendarUtils;
+
 import java.text.SimpleDateFormat;
 import java.util.GregorianCalendar;
 import java.util.TimeZone;
-import java.util.concurrent.TimeUnit;
 
 /**
  * This class extends LongColumnVector in order to introduce some date-specific semantics. In
@@ -79,21 +80,13 @@ private void updateDataAccordingProlepticSetting() throws Exception {
       if (vector[i] >= CUTOVER_DAY_EPOCH) { // no need for conversion
         continue;
       }
-      long millis = TimeUnit.DAYS.toMillis(vector[i]);
-      String originalFormatted = usingProlepticCalendar ? GREGORIAN_DATE_FORMATTER.format(millis)
-        : PROLEPTIC_GREGORIAN_DATE_FORMATTER.format(millis);
-
-      millis = (usingProlepticCalendar ? PROLEPTIC_GREGORIAN_DATE_FORMATTER.parse(originalFormatted)
-        : GREGORIAN_DATE_FORMATTER.parse(originalFormatted)).getTime();
-
-      vector[i] = TimeUnit.MILLISECONDS.toDays(millis);
+      vector[i] = usingProlepticCalendar ? CalendarUtils.convertDateToProleptic((int) vector[i])
+          : CalendarUtils.convertDateToHybrid((int) vector[i]);
     }
   }
 
   public String formatDate(int i) {
-    long millis = TimeUnit.DAYS.toMillis(vector[i]);
-    return usingProlepticCalendar ?
PROLEPTIC_GREGORIAN_DATE_FORMATTER.format(millis) - : GREGORIAN_DATE_FORMATTER.format(millis); + return CalendarUtils.formatDate(vector[i], usingProlepticCalendar); } public DateColumnVector setUsingProlepticCalendar(boolean usingProlepticCalendar) { diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java index d5dfc9295a..7807e69ffe 100644 --- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java @@ -18,14 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector; import java.sql.Timestamp; -import java.text.SimpleDateFormat; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneOffset; import java.util.Arrays; -import java.util.GregorianCalendar; -import java.util.TimeZone; +import org.apache.hadoop.hive.common.type.CalendarUtils; import org.apache.hadoop.io.Writable; /** @@ -41,26 +39,6 @@ * using the scratch timestamp, and then perhaps update the column vector row with a result. 
*/ public class TimestampColumnVector extends ColumnVector { - private static final TimeZone UTC = TimeZone.getTimeZone("UTC"); - private static final GregorianCalendar PROLEPTIC_GREGORIAN_CALENDAR_UTC = - new GregorianCalendar(UTC); - private static final GregorianCalendar GREGORIAN_CALENDAR_UTC = - new GregorianCalendar(UTC); - - private static final SimpleDateFormat PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC = - new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - private static final SimpleDateFormat GREGORIAN_TIMESTAMP_FORMATTER_UTC = - new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - - static { - PROLEPTIC_GREGORIAN_CALENDAR_UTC.setGregorianChange(new java.util.Date(Long.MIN_VALUE)); - - PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.setCalendar(PROLEPTIC_GREGORIAN_CALENDAR_UTC); - GREGORIAN_TIMESTAMP_FORMATTER_UTC.setCalendar(GREGORIAN_CALENDAR_UTC); - } - - // it's 1582-10-15 in both calendars - private static final int CUTOVER_MILLIS_EPOCH = -141427 * 24 * 60 * 60 * 1000; /* * The storage arrays for this column vector corresponds to the storage of a Timestamp: @@ -594,18 +572,14 @@ public void changeCalendar(boolean useProleptic, boolean updateData) { private void updateDataAccordingProlepticSetting() throws Exception { for (int i = 0; i < nanos.length; i++) { - if (time[i] >= CUTOVER_MILLIS_EPOCH) { // no need for conversion + if (time[i] >= CalendarUtils.SWITCHOVER_MILLIS) { // no need for conversion continue; } asScratchTimestamp(i); long offset = 0; - String formatted = - usingProlepticCalendar ? GREGORIAN_TIMESTAMP_FORMATTER_UTC.format(scratchTimestamp) - : PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.format(scratchTimestamp); - long millis = usingProlepticCalendar - ? PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.parse(formatted).getTime() - : GREGORIAN_TIMESTAMP_FORMATTER_UTC.parse(formatted).getTime(); + long millis = usingProlepticCalendar ? 
CalendarUtils.convertTimeToProleptic(scratchTimestamp.getTime()) + : CalendarUtils.convertTimeToHybrid(scratchTimestamp.getTime()); Timestamp newTimeStamp = Timestamp.from(Instant.ofEpochMilli(millis));
diff --git storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java
index 0d4dc5dc38..d45822d172 100644
--- storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java
+++ storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java
@@ -20,6 +20,10 @@
 import org.junit.Assert;
 import org.junit.Test;
 
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
 public class TestDateColumnVector {
   /**
    * Test case for DateColumnVector's changeCalendar
@@ -77,4 +81,45 @@ private void setDateAndVerifyProlepticUpdate(long longDay, String expectedDateSt
         " new = " + newUseProleptic, expectedDateString,
         dateColumnVector.formatDate(0));
   }
+
+  @Test(timeout = 300_000)
+  public void testMultiThreaded() throws Exception {
+    // When the SimpleDateFormat/GregorianCalendar race was not handled, the conversion used to
+    // throw exceptions like:
+    // 1) java.lang.NumberFormatException: For input string: "" OR
+    //    java.lang.NumberFormatException: For input string: ".821582E.821582E44"
+    // 2) Caused by: java.lang.ArrayIndexOutOfBoundsException: -5325980
+    //    at sun.util.calendar.BaseCalendar.getCalendarDateFromFixedDate(BaseCalendar.java:453)
+    //    at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2397)
+    // Create 5 threads and start manipulating vectors; none of them should throw now.
+
+    // An exception on a worker thread does not fail the test by itself, so the failures
+    // are collected and checked once every thread has finished.
+    List<Throwable> failures = Collections.synchronizedList(new ArrayList<>());
+    List<Thread> threads = new ArrayList<>();
+    for (int epochDay : new int[] {-141428, -141430, -16768, -499952, -499955}) {
+      threads.add(startVectorManipulationThread(50000, epochDay, failures));
+    }
+
+    for (Thread thread : threads) {
+      thread.join();
+    }
+    Assert.assertTrue("Worker threads failed: " + failures, failures.isEmpty());
+  }
+
+  private Thread startVectorManipulationThread(final int vectorLength, final int epochDay,
+      final List<Throwable> failures) {
+    Thread thread = new Thread(() -> {
+      DateColumnVector columnVector =
+          new DateColumnVector(vectorLength).setUsingProlepticCalendar(true);
+      for (int i = 0; i < vectorLength; i++) {
+        columnVector.vector[i] = epochDay;
+      }
+      columnVector.changeCalendar(false, true);
+    });
+    thread.setUncaughtExceptionHandler((t, e) -> failures.add(e));
+    thread.start();
+    return thread;
+  }
+
 }
diff --git storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java
index 333a5b57ad..2d85b115d2 100644
--- storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java
+++ storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java
@@ -24,7 +24,10 @@
 import java.text.DateFormat;
 import java.text.SimpleDateFormat;
 import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.GregorianCalendar;
+import java.util.List;
 import java.util.Random;
 import java.util.TimeZone;
 import java.util.concurrent.TimeUnit;
@@ -210,4 +213,40 @@ private DateFormat getTestFormatter(boolean useProleptic) {
 
     return testFormatter;
   }
+
+  @Test(timeout = 300_000)
+  public void testMultiThreaded() throws Exception {
+    // Similar to TestDateColumnVector#testMultiThreaded: exceptions raised on the worker
+    // threads are collected and checked once every thread has finished.
+    List<Throwable> failures = Collections.synchronizedList(new ArrayList<>());
+    List<Thread> threads = new ArrayList<>();
+    for (int epochDay : new int[] {-141428, -141430, -16768, -499952, -499955}) {
+      threads.add(startVectorManipulationThread(50000, epochDay, failures));
+    }
+
+    for (Thread thread : threads) {
+      thread.join();
+    }
+    org.junit.Assert.assertTrue("Worker threads failed: " + failures, failures.isEmpty());
+  }
+
+  private Thread startVectorManipulationThread(final int vectorLength, final int epochDay,
+      final List<Throwable> failures) {
+    // The column stores epoch millis, so the epoch day has to be converted; the raw day
+    // count read as millis is a 1969 instant, which changeCalendar leaves unconverted.
+    final long millis = TimeUnit.DAYS.toMillis(epochDay);
+    Thread thread = new Thread(() -> {
+      TimestampColumnVector columnVector =
+          new TimestampColumnVector(vectorLength).setUsingProlepticCalendar(true);
+      for (int i = 0; i < vectorLength; i++) {
+        columnVector.time[i] = millis;
+        columnVector.nanos[i] = 1;
+      }
+      columnVector.changeCalendar(false, true);
+    });
+    thread.setUncaughtExceptionHandler((t, e) -> failures.add(e));
+    thread.start();
+    return thread;
+  }
+
 }