diff --git common/src/java/org/apache/hive/common/util/DateUtils.java common/src/java/org/apache/hive/common/util/DateUtils.java
index a1068ecce94e9ff1ae78008a0d8c6d67ca4f2690..0315ec67f0780e8e5a22009f032fa4054ec00770 100644
--- common/src/java/org/apache/hive/common/util/DateUtils.java
+++ common/src/java/org/apache/hive/common/util/DateUtils.java
@@ -20,6 +20,7 @@
 
 import java.math.BigDecimal;
 import java.text.SimpleDateFormat;
+import java.util.TimeZone;
 
 /**
  * DateUtils. Thread-safe class
@@ -74,4 +75,23 @@ public static int parseNumericValueWithRange(String fieldName,
   public static String getFieldName(int field) {
     return FIELD_NAME[field];
   }
+
+  /**
+   * Checks that the given string is a valid java.util.TimeZone id.
+   * TimeZone#getTimeZone silently falls back to "GMT" when it cannot understand the id,
+   * so any id other than "GMT" that resolves to "GMT" is treated as invalid.
+   *
+   * @param timeZoneID the timezone id to validate; a null id is rejected as invalid
+   * @throws IllegalStateException if the id is null or is not understood by TimeZone
+   */
+  public static void validateTimeZone(String timeZoneID) {
+    // TimeZone#getTimeZone throws NullPointerException on a null id; report it as an
+    // invalid timezone id instead so that callers see a single failure type.
+    if (timeZoneID == null
+        || (TimeZone.getTimeZone(timeZoneID).getID().equals("GMT")
+            && !"GMT".equals(timeZoneID))) {
+      throw new IllegalStateException(
+          "Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
+    }
+  }
 }
diff --git common/src/test/org/apache/hive/common/util/TestDateUtils.java common/src/test/org/apache/hive/common/util/TestDateUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..03a2d57431b0f15d06cd679e388fc625cf64cc92
--- /dev/null
+++ common/src/test/org/apache/hive/common/util/TestDateUtils.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.common.util;
+
+import org.junit.Test;
+
+public class TestDateUtils {
+
+  @Test
+  public void testTimeZoneValidationWithCorrectZoneId() {
+    DateUtils.validateTimeZone("GMT");
+    DateUtils.validateTimeZone("UTC");
+    DateUtils.validateTimeZone("GMT+10");
+    DateUtils.validateTimeZone("Europe/Budapest");
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testTimeZoneValidationWithIncorrectZoneId() {
+    DateUtils.validateTimeZone("UCC");
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testTimeZoneValidationWithNullZoneId() {
+    DateUtils.validateTimeZone(null);
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java
index 26f1e75c7d659a634cd4eef3a0cb8e886b22722f..b2c2a7a1300fbd442a4647aef9c6f0b5c72e3d1c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java
@@ -21,6 +21,7 @@
 import java.util.TimeZone;
 
 import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
+import org.apache.hive.common.util.DateUtils;
 import org.apache.parquet.Strings;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -139,14 +140,12 @@ private TimeZone getParquetWriterTimeZone(Properties tableProperties) {
     String timeZoneID =
         tableProperties.getProperty(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY);
     if (!Strings.isNullOrEmpty(timeZoneID)) {
-      if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
-        throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
-      }
+      DateUtils.validateTimeZone(timeZoneID);
       return TimeZone.getTimeZone(timeZoneID);
     }
 
-    // If no timezone is defined in table properties, then adjust timestamps using
-    // PARQUET_INT96_NO_ADJUSTMENT_ZONE timezone
-    return TimeZone.getTimeZone(ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
+    // No write zone is defined in the table properties: fall back to the JVM
+    // default timezone
+    return TimeZone.getDefault();
   }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index 8e33b7d437894b33b35f32913a3bc02f2a849ce3..b278ad06f842998fe6dabb4e8216e126f84d51de 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -25,6 +25,7 @@
 import org.apache.hadoop.hive.serde2.SerDeStats;
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hive.common.util.DateUtils;
 import org.apache.parquet.filter2.compat.FilterCompat;
 import org.apache.parquet.filter2.compat.RowGroupFilter;
 import org.apache.parquet.filter2.predicate.FilterPredicate;
@@ -44,7 +45,6 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import java.util.TimeZone;
 
@@ -170,7 +170,7 @@ protected void setTimeZoneConversion(Configuration configuration, Path finalPath
     boolean skipConversion = HiveConf.getBoolVar(configuration,
         HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION);
     FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
-    if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") ||
+    if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") &&
         skipConversion) {
       // Impala writes timestamp values using GMT only. We should not try to convert Impala
       // files to other type of timezones.
@@ -179,16 +179,12 @@ protected void setTimeZoneConversion(Configuration configuration, Path finalPath
       // TABLE_PARQUET_INT96_TIMEZONE is a table property used to detect what timezone conversion
       // to use when reading Parquet timestamps.
       timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
-          ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
-
-      if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
-        throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
-      }
+          TimeZone.getDefault().getID());
+      DateUtils.validateTimeZone(timeZoneID);
     }
 
     // 'timeZoneID' should be valid, since we did not throw exception above
-    configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
-        TimeZone.getTimeZone(timeZoneID).getID());
+    configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, timeZoneID);
   }
 
   public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
index 5dc808800290f3274afbdff12134ac34387a746b..f2f9035dfeb9ea492aea1b7088d54dcd5f293e0f 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
@@ -152,7 +152,7 @@ public static Timestamp getTimestamp(NanoTime nt, Calendar calendar) {
 
     calendar.setTimeInMillis(utcCalendar.getTimeInMillis());
 
-    Calendar adjusterCalendar = copyToCalendarWithTZ(calendar, Calendar.getInstance());
+    Calendar adjusterCalendar = copyToCalendarWithTZ(calendar, getLocalCalendar());
 
     Timestamp ts = new Timestamp(adjusterCalendar.getTimeInMillis());
     ts.setNanos((int) nanos);
diff --git ql/src/test/queries/clientpositive/parquet_int96_timestamp.q ql/src/test/queries/clientpositive/parquet_int96_timestamp.q
index 5de2c3f1244b8340b97eb0547fe66e52d80fb065..6eadd1b0a3313cbba7a798890b802baae302749e 100644
--- ql/src/test/queries/clientpositive/parquet_int96_timestamp.q
+++ ql/src/test/queries/clientpositive/parquet_int96_timestamp.q
@@ -2,7 +2,7 @@ create table dummy (id int);
 insert into table dummy values (1);
 
 set hive.parquet.mr.int96.enable.utc.write.zone=true;
-set hive.parquet.timestamp.skip.conversion=false;
+set hive.parquet.timestamp.skip.conversion=true;
 
 -- read/write timestamps using UTC as default write zone
 create table timestamps (ts timestamp) stored as parquet;