diff --git ql/pom.xml ql/pom.xml index 8b0c02b9b1..d5c83b67ff 100644 --- ql/pom.xml +++ ql/pom.xml @@ -98,6 +98,10 @@ hive-spark-client ${project.version} + + org.apache.hive + hive-storage-api + com.esotericsoftware diff --git serde/pom.xml serde/pom.xml index 10fa1b7c1e..ca4fee27a5 100644 --- serde/pom.xml +++ serde/pom.xml @@ -49,6 +49,10 @@ hive-shims ${project.version} + + org.apache.hive + hive-storage-api + com.google.code.findbugs diff --git common/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java similarity index 84% rename from common/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java rename to storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java index 9b491d0683..7555d16478 100644 --- common/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java +++ storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java @@ -37,9 +37,15 @@ */ public class CalendarUtils { - private static SimpleDateFormat createFormatter(String fmt, - GregorianCalendar calendar) { + public static final long SWITCHOVER_MILLIS; + public static final long SWITCHOVER_DAYS; + + private static SimpleDateFormat createFormatter(String fmt, boolean proleptic) { SimpleDateFormat result = new SimpleDateFormat(fmt); + GregorianCalendar calendar = new GregorianCalendar(UTC); + if (proleptic) { + calendar.setGregorianChange(new Date(Long.MIN_VALUE)); + } result.setCalendar(calendar); return result; } @@ -47,24 +53,17 @@ private static SimpleDateFormat createFormatter(String fmt, private static final String DATE = "yyyy-MM-dd"; private static final String TIME = DATE + " HH:mm:ss.SSS"; private static final TimeZone UTC = TimeZone.getTimeZone("UTC"); - private static final GregorianCalendar HYBRID = new GregorianCalendar(); private static final ThreadLocal HYBRID_DATE_FORMAT = - ThreadLocal.withInitial(() -> createFormatter(DATE, HYBRID)); + ThreadLocal.withInitial(() -> createFormatter(DATE, false)); private static final ThreadLocal HYBRID_TIME_FORMAT = - ThreadLocal.withInitial(() -> createFormatter(TIME, HYBRID)); - private static final long SWITCHOVER_MILLIS; - private static final long SWITCHOVER_DAYS; - private static final GregorianCalendar PROLEPTIC = new GregorianCalendar(); + ThreadLocal.withInitial(() -> createFormatter(TIME, false)); + private static final ThreadLocal PROLEPTIC_DATE_FORMAT = - ThreadLocal.withInitial(() -> createFormatter(DATE, PROLEPTIC)); + ThreadLocal.withInitial(() -> createFormatter(DATE, true)); private static final ThreadLocal PROLEPTIC_TIME_FORMAT = - ThreadLocal.withInitial(() -> createFormatter(TIME, PROLEPTIC)); + ThreadLocal.withInitial(() -> createFormatter(TIME, true)); static { - HYBRID.setTimeZone(UTC); - PROLEPTIC.setTimeZone(UTC); - PROLEPTIC.setGregorianChange(new Date(Long.MIN_VALUE)); - // Get the last day where the two calendars agree with each other. try { SWITCHOVER_MILLIS = HYBRID_DATE_FORMAT.get().parse("1582-10-15").getTime(); @@ -177,6 +176,20 @@ public static long convertTimeToHybrid(long proleptic) { return hybrid; } + /** + * + * Formats epoch day to date according to proleptic or hybrid calendar + * + * @param epochDay epoch day + * @param useProleptic if true - uses proleptic formatter, else uses hybrid formatter + * @return formatted date + */ + public static String formatDate(long epochDay, boolean useProleptic) { + long millis = TimeUnit.DAYS.toMillis(epochDay); + return useProleptic ? PROLEPTIC_DATE_FORMAT.get().format(millis) + : HYBRID_DATE_FORMAT.get().format(millis); + } + private CalendarUtils() { throw new UnsupportedOperationException(); } diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java index 3dac667f5d..6608a6eac3 100644 --- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java @@ -17,37 +17,13 @@ */ package org.apache.hadoop.hive.ql.exec.vector; -import java.text.SimpleDateFormat; -import java.util.GregorianCalendar; -import java.util.TimeZone; -import java.util.concurrent.TimeUnit; +import org.apache.hadoop.hive.common.type.CalendarUtils; /** * This class extends LongColumnVector in order to introduce some date-specific semantics. In * DateColumnVector, the elements of vector[] represent the days since 1970-01-01 */ public class DateColumnVector extends LongColumnVector { - private static final TimeZone UTC = TimeZone.getTimeZone("UTC"); - private static final GregorianCalendar PROLEPTIC_GREGORIAN_CALENDAR = new GregorianCalendar(UTC); - private static final GregorianCalendar GREGORIAN_CALENDAR = new GregorianCalendar(UTC); - - private static final SimpleDateFormat PROLEPTIC_GREGORIAN_DATE_FORMATTER = - new SimpleDateFormat("yyyy-MM-dd"); - private static final SimpleDateFormat GREGORIAN_DATE_FORMATTER = - new SimpleDateFormat("yyyy-MM-dd"); - - /** - * -141427: hybrid: 1582-10-15 proleptic: 1582-10-15 - * -141428: hybrid: 1582-10-04 proleptic: 1582-10-14 - */ - private static final int CUTOVER_DAY_EPOCH = -141427; // it's 1582-10-15 in both calendars - - static { - PROLEPTIC_GREGORIAN_CALENDAR.setGregorianChange(new java.util.Date(Long.MIN_VALUE)); - - PROLEPTIC_GREGORIAN_DATE_FORMATTER.setCalendar(PROLEPTIC_GREGORIAN_CALENDAR); - GREGORIAN_DATE_FORMATTER.setCalendar(GREGORIAN_CALENDAR); - } private boolean usingProlepticCalendar = false; @@ -76,24 +52,16 @@ public void changeCalendar(boolean useProleptic, boolean updateData) { private void updateDataAccordingProlepticSetting() throws Exception { for (int i = 0; i < vector.length; i++) { - if (vector[i] >= CUTOVER_DAY_EPOCH) { // no need for conversion + if (vector[i] >= CalendarUtils.SWITCHOVER_DAYS) { // no need for conversion continue; } - long millis = TimeUnit.DAYS.toMillis(vector[i]); - String originalFormatted = usingProlepticCalendar ? GREGORIAN_DATE_FORMATTER.format(millis) - : PROLEPTIC_GREGORIAN_DATE_FORMATTER.format(millis); - - millis = (usingProlepticCalendar ? PROLEPTIC_GREGORIAN_DATE_FORMATTER.parse(originalFormatted) - : GREGORIAN_DATE_FORMATTER.parse(originalFormatted)).getTime(); - - vector[i] = TimeUnit.MILLISECONDS.toDays(millis); + vector[i] = usingProlepticCalendar ? CalendarUtils.convertDateToProleptic((int) vector[i]) : CalendarUtils + .convertDateToHybrid((int) vector[i]); } } public String formatDate(int i) { - long millis = TimeUnit.DAYS.toMillis(vector[i]); - return usingProlepticCalendar ? PROLEPTIC_GREGORIAN_DATE_FORMATTER.format(millis) - : GREGORIAN_DATE_FORMATTER.format(millis); + return CalendarUtils.formatDate(vector[i], usingProlepticCalendar); } public DateColumnVector setUsingProlepticCalendar(boolean usingProlepticCalendar) { diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java index d5dfc9295a..7807e69ffe 100644 --- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java @@ -18,14 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector; import java.sql.Timestamp; -import java.text.SimpleDateFormat; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneOffset; import java.util.Arrays; -import java.util.GregorianCalendar; -import java.util.TimeZone; +import org.apache.hadoop.hive.common.type.CalendarUtils; import org.apache.hadoop.io.Writable; /** @@ -41,26 +39,6 @@ * using the scratch timestamp, and then perhaps update the column vector row with a result. */ public class TimestampColumnVector extends ColumnVector { - private static final TimeZone UTC = TimeZone.getTimeZone("UTC"); - private static final GregorianCalendar PROLEPTIC_GREGORIAN_CALENDAR_UTC = - new GregorianCalendar(UTC); - private static final GregorianCalendar GREGORIAN_CALENDAR_UTC = - new GregorianCalendar(UTC); - - private static final SimpleDateFormat PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC = - new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - private static final SimpleDateFormat GREGORIAN_TIMESTAMP_FORMATTER_UTC = - new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - - static { - PROLEPTIC_GREGORIAN_CALENDAR_UTC.setGregorianChange(new java.util.Date(Long.MIN_VALUE)); - - PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.setCalendar(PROLEPTIC_GREGORIAN_CALENDAR_UTC); - GREGORIAN_TIMESTAMP_FORMATTER_UTC.setCalendar(GREGORIAN_CALENDAR_UTC); - } - - // it's 1582-10-15 in both calendars - private static final int CUTOVER_MILLIS_EPOCH = -141427 * 24 * 60 * 60 * 1000; /* * The storage arrays for this column vector corresponds to the storage of a Timestamp: @@ -594,18 +572,14 @@ public void changeCalendar(boolean useProleptic, boolean updateData) { private void updateDataAccordingProlepticSetting() throws Exception { for (int i = 0; i < nanos.length; i++) { - if (time[i] >= CUTOVER_MILLIS_EPOCH) { // no need for conversion + if (time[i] >= CalendarUtils.SWITCHOVER_MILLIS) { // no need for conversion continue; } asScratchTimestamp(i); long offset = 0; - String formatted = - usingProlepticCalendar ? GREGORIAN_TIMESTAMP_FORMATTER_UTC.format(scratchTimestamp) - : PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.format(scratchTimestamp); - long millis = usingProlepticCalendar - ? PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.parse(formatted).getTime() - : GREGORIAN_TIMESTAMP_FORMATTER_UTC.parse(formatted).getTime(); + long millis = usingProlepticCalendar ? CalendarUtils.convertTimeToProleptic(scratchTimestamp.getTime()) + : CalendarUtils.convertTimeToHybrid(scratchTimestamp.getTime()); Timestamp newTimeStamp = Timestamp.from(Instant.ofEpochMilli(millis)); diff --git storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java index 0d4dc5dc38..9f53245df2 100644 --- storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java +++ storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java @@ -17,6 +17,9 @@ */ package org.apache.hadoop.hive.ql.exec.vector; +import java.util.ArrayList; +import java.util.List; + import org.junit.Assert; import org.junit.Test; @@ -77,4 +80,44 @@ private void setDateAndVerifyProlepticUpdate(long longDay, String expectedDateSt " new = " + newUseProleptic, expectedDateString, dateColumnVector.formatDate(0)); } + + @Test(timeout = 300_000) + public void testMultiThreaded() throws Exception { + + //when java DateTimeFormatter/GregorianCalendar race was not handled, used to throw exceptions like - + + // 1) java.lang.NumberFormatException: For input string: "" OR java.lang.NumberFormatException: For input string: ".821582E.821582E44" + + // 2) Caused by: java.lang.ArrayIndexOutOfBoundsException: -5325980 + // at sun.util.calendar.BaseCalendar.getCalendarDateFromFixedDate(BaseCalendar.java:453) + // at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2397) + + // create 5 threads and start manipulating vectors, should not throw any exceptions now. + + List threads = new ArrayList<>(); + + threads.add(startVectorManipulationThread(50000, -141428)); + threads.add(startVectorManipulationThread(50000, -141430)); + threads.add(startVectorManipulationThread(50000, -16768)); + threads.add(startVectorManipulationThread(50000, -499952)); + threads.add(startVectorManipulationThread(50000, -499955)); + + for (Thread thread : threads) { + thread.join(); + } + + } + + private Thread startVectorManipulationThread(final int vectorLength, final int epochDay) { + Thread thread = new Thread(() -> { + DateColumnVector columnVector = new DateColumnVector(vectorLength).setUsingProlepticCalendar(true); + for (int i = 0; i < vectorLength; i++) { + columnVector.vector[i] = epochDay; + } + columnVector.changeCalendar(false, true); + }); + thread.start(); + return thread; + } + } diff --git storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java index 333a5b57ad..2d85b115d2 100644 --- storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java +++ storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java @@ -24,7 +24,9 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.time.Instant; +import java.util.ArrayList; import java.util.GregorianCalendar; +import java.util.List; import java.util.Random; import java.util.TimeZone; import java.util.concurrent.TimeUnit; @@ -210,4 +212,38 @@ private DateFormat getTestFormatter(boolean useProleptic) { return testFormatter; } + + + @Test(timeout = 300_000) + public void testMultiThreaded() throws Exception { + + // similar to TestDateColumnVector#testMultiThreaded + + List threads = new ArrayList<>(); + + threads.add(startVectorManipulationThread(50000, -141428)); + threads.add(startVectorManipulationThread(50000, -141430)); + threads.add(startVectorManipulationThread(50000, -16768)); + threads.add(startVectorManipulationThread(50000, -499952)); + threads.add(startVectorManipulationThread(50000, -499955)); + + for (Thread thread : threads) { + thread.join(); + } + + } + + private Thread startVectorManipulationThread(final int vectorLength, final long millis) { + Thread thread = new Thread(() -> { + TimestampColumnVector columnVector = new TimestampColumnVector(vectorLength).setUsingProlepticCalendar(true); + for (int i = 0; i < vectorLength; i++) { + columnVector.time[i] = millis; + columnVector.nanos[i] = 1; + } + columnVector.changeCalendar(false, true); + }); + thread.start(); + return thread; + } + }