diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index d0a552a90c..c51b367ddc 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2032,6 +2032,12 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal HIVE_INT_TIMESTAMP_CONVERSION_IN_SECONDS("hive.int.timestamp.conversion.in.seconds", false, "Boolean/tinyint/smallint/int/bigint value is interpreted as milliseconds during the timestamp conversion.\n" + "Set this flag to true to interpret the value as seconds to be consistent with float/double." ), + HIVE_PARQUET_WRITE_INT64_TIMESTAMP("hive.parquet.write.int64.timestamp", false, + "Write parquet timestamps as int64/LogicalTypes instead of int96/OriginalTypes. Note: " + + "Timestamps will be time zone agnostic (NEVER converted to a different time zone)."), + HIVE_PARQUET_TIMESTAMP_TIME_UNIT("hive.parquet.timestamp.time.unit", "millis", + new StringSet("nanos", "micros", "millis"), + "Store parquet int64/LogicalTypes timestamps in this time unit."), HIVE_ORC_BASE_DELTA_RATIO("hive.exec.orc.base.delta.ratio", 8, "The ratio of base writer and\n" + "delta writer in terms of STRIPE_SIZE and BUFFER_SIZE."), diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java index ba235f761d..4786305a41 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java @@ -118,7 +118,7 @@ public void checkOutputSpecs(final FileSystem ignored, final JobConf job) throws columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); } - DataWritableWriteSupport.setSchema(HiveSchemaConverter.convert(columnNames, columnTypes), jobConf); + DataWritableWriteSupport.setSchema(HiveSchemaConverter.convert(columnNames, columnTypes, jobConf), jobConf); return getParquerRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress,tableProperties); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java index 21bfb2e1a2..a59c15ee5d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java @@ -15,6 +15,8 @@ import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -35,28 +37,36 @@ public class HiveSchemaConverter { - public static MessageType convert(final List columnNames, final List columnTypes) { - final MessageType schema = new MessageType("hive_schema", convertTypes(columnNames, columnTypes)); + public static MessageType convert(final List columnNames, + final List columnTypes) { + return convert(columnNames, columnTypes, null); + } + + public static MessageType convert(final List columnNames, + final List columnTypes, Configuration conf) { + final MessageType schema = + new MessageType("hive_schema", convertTypes(columnNames, columnTypes, conf)); return schema; } - private static Type[]
convertTypes(final List columnNames, final List columnTypes) { + private static Type[] convertTypes(final List columnNames, + final List columnTypes, Configuration conf) { if (columnNames.size() != columnTypes.size()) { throw new IllegalStateException("Mismatched Hive columns and types. Hive columns names" + " found : " + columnNames + " . And Hive types found : " + columnTypes); } final Type[] types = new Type[columnNames.size()]; for (int i = 0; i < columnNames.size(); ++i) { - types[i] = convertType(columnNames.get(i), columnTypes.get(i)); + types[i] = convertType(columnNames.get(i), columnTypes.get(i), conf); } return types; } - private static Type convertType(final String name, final TypeInfo typeInfo) { - return convertType(name, typeInfo, Repetition.OPTIONAL); + private static Type convertType(final String name, final TypeInfo typeInfo, Configuration conf) { + return convertType(name, typeInfo, conf, Repetition.OPTIONAL); } - private static Type convertType(final String name, final TypeInfo typeInfo, + private static Type convertType(final String name, final TypeInfo typeInfo, Configuration conf, final Repetition repetition) { if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { @@ -81,7 +91,23 @@ private static Type convertType(final String name, final TypeInfo typeInfo, } else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { - return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); + boolean useInt64; + String timeUnitVal; + if (conf != null) { + useInt64 = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP); + timeUnitVal = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_TIME_UNIT); + } else { //use defaults + useInt64 = HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP.defaultBoolVal; + timeUnitVal = HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_TIME_UNIT.defaultStrVal; + } + if (useInt64) { + LogicalTypeAnnotation.TimeUnit timeUnit = + LogicalTypeAnnotation.TimeUnit.valueOf(timeUnitVal.toUpperCase()); + return Types.primitive(PrimitiveTypeName.INT64, repetition) + .as(LogicalTypeAnnotation.timestampType(false, timeUnit)).named(name); + } else { + return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); + } } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { throw new UnsupportedOperationException("Void type not implemented"); } else if (typeInfo.getTypeName().toLowerCase().startsWith( @@ -108,11 +134,11 @@ private static Type convertType(final String name, final TypeInfo typeInfo, throw new IllegalArgumentException("Unknown type: " + typeInfo); } } else if (typeInfo.getCategory().equals(Category.LIST)) { - return convertArrayType(name, (ListTypeInfo) typeInfo); + return convertArrayType(name, (ListTypeInfo) typeInfo, conf); } else if (typeInfo.getCategory().equals(Category.STRUCT)) { - return convertStructType(name, (StructTypeInfo) typeInfo); + return convertStructType(name, (StructTypeInfo) typeInfo, conf); } else if (typeInfo.getCategory().equals(Category.MAP)) { - return convertMapType(name, (MapTypeInfo) typeInfo); + return convertMapType(name, (MapTypeInfo) typeInfo, conf); } else if (typeInfo.getCategory().equals(Category.UNION)) { throw new UnsupportedOperationException("Union type not implemented"); } else { @@ -122,30 +148,32 @@ private static Type convertType(final String name, final 
TypeInfo typeInfo, // An optional group containing a repeated anonymous group "bag", containing // 1 anonymous element "array_element" - private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) { + private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo, final Configuration conf) { final TypeInfo subType = typeInfo.getListElementTypeInfo(); GroupType groupType = Types.optionalGroup().as(LogicalTypeAnnotation.listType()) - .addField(Types.repeatedGroup().addField(convertType("array_element", subType)) + .addField(Types.repeatedGroup().addField(convertType("array_element", subType, conf)) .named(ParquetHiveSerDe.ARRAY.toString())) .named(name); return groupType; } // An optional group containing multiple elements - private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo) { + private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo, + final Configuration conf) { final List columnNames = typeInfo.getAllStructFieldNames(); final List columnTypes = typeInfo.getAllStructFieldTypeInfos(); - GroupType groupType = Types.optionalGroup().addFields(convertTypes(columnNames, columnTypes)).named(name); + GroupType groupType = Types.optionalGroup().addFields(convertTypes(columnNames, columnTypes, conf)).named(name); return groupType; } // An optional group containing a repeated anonymous group "map", containing // 2 elements: "key", "value" - private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo) { + private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo, + final Configuration conf) { final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), - typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED); + typeInfo.getMapKeyTypeInfo(), conf, Repetition.REQUIRED); final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), - typeInfo.getMapValueTypeInfo()); + typeInfo.getMapValueTypeInfo(), conf); return ConversionPatterns.mapType(Repetition.OPTIONAL, name, keyType, valueType); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/ParquetTimestampUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/ParquetTimestampUtils.java index 9ce07e74f3..53684767b1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/ParquetTimestampUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/ParquetTimestampUtils.java @@ -13,6 +13,7 @@ */ package org.apache.hadoop.hive.ql.io.parquet.timestamp; +import java.math.BigInteger; import java.time.ZoneId; import java.time.ZoneOffset; @@ -53,4 +54,30 @@ public static Timestamp getTimestamp(long value, TimeUnit timeUnit, boolean isAd } return Timestamp.ofEpochSecond(seconds, nanoseconds, zone); } + + /** + * Converts a Timestamp to a Long in the given TimeUnit. + * Returns null if, at nanosecond precision, the value falls outside the range 1677-09-21 00:12:43.145224192 - + * 2262-04-11 23:47:16.854775807, as it would overflow a long.
+ */ + public static Long getInt64(Timestamp timestamp, TimeUnit timeUnit) { + switch (timeUnit) { + case NANOS: + try { + BigInteger nanos = BigInteger.valueOf(timestamp.toEpochSecond()) + .multiply(BigInteger.valueOf(NANO)) + .add(BigInteger.valueOf(timestamp.getNanos())); + return nanos.longValueExact(); + } catch (ArithmeticException e) { + return null; + } + case MICROS: + long secondsInMicros = timestamp.toEpochSecond() * MICRO; + return secondsInMicros + timestamp.getNanos() / MILLI; + case MILLIS: + return timestamp.toEpochMilli(); + default: + throw new IllegalArgumentException("Time unit not recognized"); + } + } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java index f4212f4481..d3b19825f8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java @@ -41,6 +41,7 @@ private DataWritableWriter writer; private MessageType schema; private boolean defaultDateProleptic; + private Configuration conf; public static void setSchema(final MessageType schema, final Configuration configuration) { configuration.set(PARQUET_HIVE_SCHEMA, schema.toString()); @@ -52,6 +53,7 @@ public static MessageType getSchema(final Configuration configuration) { @Override public WriteContext init(final Configuration configuration) { + conf = configuration; schema = getSchema(configuration); Map metaData = new HashMap<>(); metaData.put(WRITER_TIMEZONE, TimeZone.getDefault().toZoneId().toString()); @@ -63,7 +65,7 @@ public WriteContext init(final Configuration configuration) { @Override public void prepareForWrite(final RecordConsumer recordConsumer) { - writer = new DataWritableWriter(recordConsumer, schema, defaultDateProleptic); + writer = new DataWritableWriter(recordConsumer, schema, defaultDateProleptic, conf); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java index 1834008c67..41406aaf34 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java @@ -13,12 +13,15 @@ */ package org.apache.hadoop.hive.ql.io.parquet.write; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; import org.apache.hadoop.hive.common.type.CalendarUtils; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.ParquetTimestampUtils; import org.apache.hadoop.hive.serde2.io.DateWritableV2; import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; @@ -66,6 +69,7 @@ protected final RecordConsumer recordConsumer; private final GroupType schema; private final boolean defaultDateProleptic; + private Configuration conf; /* This writer will be created when writing the first row in order to get information about how to inspect the record data.
*/ @@ -78,6 +82,14 @@ public DataWritableWriter(final RecordConsumer recordConsumer, final GroupType s this.defaultDateProleptic = defaultDateProleptic; } + public DataWritableWriter(final RecordConsumer recordConsumer, final GroupType schema, + final boolean defaultDateProleptic, final Configuration conf) { + this.recordConsumer = recordConsumer; + this.schema = schema; + this.defaultDateProleptic = defaultDateProleptic; + this.conf = conf; + } + /** * It writes a record to Parquet. * @param record Contains the record that is going to be written. @@ -201,10 +213,10 @@ public void write(Object value) { for (int i = 0; i < structFields.size(); i++) { StructField field = structFields.get(i); Object fieldValue = inspector.getStructFieldData(value, field); + DataWriter writer = structWriters[i]; - if (fieldValue != null) { + if (fieldValue != null && isValidValue(fieldValue, writer)) { String fieldName = field.getFieldName(); - DataWriter writer = structWriters[i]; recordConsumer.startField(fieldName, i); writer.write(fieldValue); @@ -212,6 +224,13 @@ public void write(Object value) { } } } + + private boolean isValidValue(Object fieldValue, DataWriter writer) { + if (writer instanceof TimestampDataWriter) { + return ((TimestampDataWriter) writer).isValidTimestamp(fieldValue); + } + return true; + } } private class MessageDataWriter extends GroupDataWriter implements DataWriter { @@ -494,15 +513,40 @@ public void write(Object value) { private class TimestampDataWriter implements DataWriter { private TimestampObjectInspector inspector; + boolean useInt64; + LogicalTypeAnnotation.TimeUnit timeUnit; public TimestampDataWriter(TimestampObjectInspector inspector) { this.inspector = inspector; + String timeUnitVal; + if (conf != null) { + useInt64 = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP); + timeUnitVal = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_TIME_UNIT); + } else { //use defaults + useInt64 = HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP.defaultBoolVal; + timeUnitVal = HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_TIME_UNIT.defaultStrVal; + } + timeUnit = LogicalTypeAnnotation.TimeUnit.valueOf(timeUnitVal.toUpperCase()); } @Override public void write(Object value) { Timestamp ts = inspector.getPrimitiveJavaObject(value); - recordConsumer.addBinary(NanoTimeUtils.getNanoTime(ts, false).toBinary()); + if (useInt64) { + Long int64value = ParquetTimestampUtils.getInt64(ts, timeUnit); + recordConsumer.addLong(int64value); + } else { + recordConsumer.addBinary(NanoTimeUtils.getNanoTime(ts, false).toBinary()); + } + } + + boolean isValidTimestamp(Object fieldValue) { + // only check if int64 time unit is nanos + if (useInt64 && timeUnit == LogicalTypeAnnotation.TimeUnit.NANOS) { + Timestamp ts = inspector.getPrimitiveJavaObject(fieldValue); + return ParquetTimestampUtils.getInt64(ts, timeUnit) != null; + } + return true; } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java index 181894f106..0d23d82f33 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java @@ -13,6 +13,7 @@ */ package org.apache.hadoop.hive.ql.io.parquet; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import 
org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; @@ -56,13 +57,25 @@ return columnTypes; } + /** + * Only use if Configuration/HiveConf not needed for converting schema. + */ public static void testConversion( - final String columnNamesStr, - final String columnsTypeStr, - final String actualSchema) throws Exception { + final String columnNamesStr, + final String columnsTypeStr, + final String actualSchema) throws Exception { + testConversion(columnNamesStr, columnsTypeStr, actualSchema, null); + } + + public static void testConversion( + final String columnNamesStr, + final String columnsTypeStr, + final String actualSchema, + final Configuration conf) throws Exception { final List columnNames = createHiveColumnsFrom(columnNamesStr); final List columnTypes = createHiveTypeInfoFrom(columnsTypeStr); - final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); + final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes, + conf); final MessageType expectedMT = MessageTypeParser.parseMessageType(actualSchema); assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema, expectedMT, messageTypeFound); @@ -79,18 +92,19 @@ public static void testConversion( } public static void testLogicalTypeAnnotation(String hiveColumnType, String hiveColumnName, - LogicalTypeAnnotation expectedLogicalType) throws Exception { + LogicalTypeAnnotation expectedLogicalType, Configuration conf) throws Exception { Map expectedLogicalTypeForColumn = new HashMap<>(); expectedLogicalTypeForColumn.put(hiveColumnName, expectedLogicalType); - testLogicalTypeAnnotations(hiveColumnName, hiveColumnType, expectedLogicalTypeForColumn); + testLogicalTypeAnnotations(hiveColumnName, hiveColumnType, expectedLogicalTypeForColumn, conf); } public static void testLogicalTypeAnnotations(final String hiveColumnNames, - final String hiveColumnTypes, final Map expectedLogicalTypes) - throws Exception { + final String hiveColumnTypes, final Map expectedLogicalTypes, + Configuration conf) throws Exception { final List columnNames = createHiveColumnsFrom(hiveColumnNames); final List columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); - final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); + final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes, + conf); List actualFields = messageTypeFound.getFields(); for (Type actualField : actualFields) { LogicalTypeAnnotation expectedLogicalType = expectedLogicalTypes.get(actualField.getName()); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestDataWritableWriter.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestDataWritableWriter.java index 01d9d2e27a..1c255ed1a9 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestDataWritableWriter.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestDataWritableWriter.java @@ -14,15 +14,18 @@ package org.apache.hadoop.hive.ql.io.parquet; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter; -import org.apache.hadoop.hive.serde2.io.ByteWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import 
org.apache.hadoop.hive.serde2.io.ParquetHiveRecord; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; @@ -56,11 +59,13 @@ public class TestDataWritableWriter { @Mock private RecordConsumer mockRecordConsumer; private InOrder inOrder; + private Configuration conf; @Before public void initMocks() { MockitoAnnotations.initMocks(this); inOrder = inOrder(mockRecordConsumer); + conf = new Configuration(); } private void startMessage() { @@ -84,7 +89,7 @@ private void addInteger(int value) { inOrder.verify(mockRecordConsumer).addInteger(value); } - private void addLong(int value) { + private void addLong(long value) { inOrder.verify(mockRecordConsumer).addLong(value); } @@ -140,6 +145,10 @@ private BytesWritable createString(String value) { return new BytesWritable(value.getBytes(StandardCharsets.UTF_8)); } + private TimestampWritableV2 createTimestamp(String s) { + return new TimestampWritableV2(Timestamp.valueOf(s)); + } + private ArrayWritable createGroup(Writable...values) { return new ArrayWritable(Writable.class, values); } @@ -185,14 +194,14 @@ private ParquetHiveRecord getParquetWritable(String columnNames, String columnTy recordProperties.setProperty("columns.types", columnTypes); ParquetHiveSerDe serDe = new ParquetHiveSerDe(); - SerDeUtils.initializeSerDe(serDe, new Configuration(), recordProperties, null); + SerDeUtils.initializeSerDe(serDe, conf, recordProperties, null); return new ParquetHiveRecord(serDe.deserialize(record), getObjectInspector(columnNames, columnTypes)); } private void writeParquetRecord(String schema, ParquetHiveRecord record) throws SerDeException { MessageType fileSchema = MessageTypeParser.parseMessageType(schema); - DataWritableWriter hiveParquetWriter = new DataWritableWriter(mockRecordConsumer, fileSchema, false); + DataWritableWriter hiveParquetWriter = new DataWritableWriter(mockRecordConsumer, fileSchema, false, conf); hiveParquetWriter.write(record); } @@ -255,6 +264,38 @@ public void testSimpleType() throws Exception { endMessage(); } + @Test + public void testInt64Timestamp() throws Exception { + conf.setBoolean(HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP.varname, true); + verifyInt64Timestamp("nan", "1970-01-01 00:00:00.000000001", "nanos"); + verifyInt64Timestamp("mic", "1970-01-01 00:00:00.000001", "micros"); + verifyInt64Timestamp("mil", "1970-01-01 00:00:00.001", "millis"); + } + + private void verifyInt64Timestamp(String columnName, String timestampString, String timeUnit) + throws Exception { + conf.set(HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_TIME_UNIT.varname, timeUnit); + String columnTypes = "timestamp"; + + String fileSchema = "message hive_schema {\n" + + " optional int64 " + columnName + " (TIMESTAMP(" + timeUnit.toUpperCase() + ",false));\n" + + "}\n"; + + ArrayWritable hiveRecord = createGroup( + createTimestamp(timestampString) + ); + + // Write record to Parquet format + writeParquetRecord(fileSchema, getParquetWritable(columnName, columnTypes, hiveRecord)); + + // Verify record was written correctly to Parquet + 
startMessage(); + startField(columnName, 0); + addLong(1L); + endField(columnName, 0); + endMessage(); + } + @Test public void testStructType() throws Exception { String columnNames = "structCol"; diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java index dc80af1b76..0f3ca6e547 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java @@ -23,6 +23,8 @@ import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -37,6 +39,8 @@ public class TestHiveSchemaConverter { + Configuration conf = new Configuration(); + @Test public void testSimpleType() throws Exception { testConversion( @@ -117,12 +121,14 @@ public void testDateType() throws Exception { @Test public void testTimestampType() throws Exception { + conf.setBoolean(HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP.varname, false); testConversion( "a", "timestamp", "message hive_schema {\n" + " optional int96 a;\n" - + "}\n"); + + "}\n", + conf); } @Test @@ -192,6 +198,7 @@ public void testArrayString() throws Exception { @Test public void testArrayTimestamp() throws Exception { + conf.setBoolean(HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP.varname, false); testConversion("arrayCol", "array", "message hive_schema {\n" @@ -200,7 +207,8 @@ public void testArrayTimestamp() throws Exception { + " optional int96 array_element;\n" + " }\n" + " }\n" - + "}\n"); + + "}\n", + conf); } @Test @@ -287,13 +295,15 @@ public void testStructStrings() throws Exception { @Test public void testStructTimestamp() throws Exception { + conf.setBoolean(HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP.varname, false); testConversion("structCol", "struct", "message hive_schema {\n" + " optional group structCol {\n" + " optional int96 a;\n" + " }\n" - + "}\n"); + + "}\n", + conf); } @Test @@ -392,19 +402,21 @@ public void testMapList() throws Exception { @Test public void testLogicalTypes() throws Exception { - testLogicalTypeAnnotation("string", "a", LogicalTypeAnnotation.stringType()); - testLogicalTypeAnnotation("int", "a", null); - testLogicalTypeAnnotation("smallint", "a", LogicalTypeAnnotation.intType(16, true)); - testLogicalTypeAnnotation("tinyint", "a", LogicalTypeAnnotation.intType(8, true)); - testLogicalTypeAnnotation("bigint", "a", null); - testLogicalTypeAnnotation("double", "a", null); - testLogicalTypeAnnotation("float", "a", null); - testLogicalTypeAnnotation("boolean", "a", null); - testLogicalTypeAnnotation("binary", "a", null); - testLogicalTypeAnnotation("timestamp", "a", null); - testLogicalTypeAnnotation("char(3)", "a", LogicalTypeAnnotation.stringType()); - testLogicalTypeAnnotation("varchar(30)", "a", LogicalTypeAnnotation.stringType()); - testLogicalTypeAnnotation("decimal(7,2)", "a", LogicalTypeAnnotation.decimalType(2, 7)); + conf.setBoolean(HiveConf.ConfVars.HIVE_PARQUET_WRITE_INT64_TIMESTAMP.varname, true); + testLogicalTypeAnnotation("string", "a", LogicalTypeAnnotation.stringType(), conf); + testLogicalTypeAnnotation("int", "a", null, conf); + testLogicalTypeAnnotation("smallint", "a", LogicalTypeAnnotation.intType(16, true), 
conf); + testLogicalTypeAnnotation("tinyint", "a", LogicalTypeAnnotation.intType(8, true), conf); + testLogicalTypeAnnotation("bigint", "a", null, conf); + testLogicalTypeAnnotation("double", "a", null, conf); + testLogicalTypeAnnotation("float", "a", null, conf); + testLogicalTypeAnnotation("boolean", "a", null, conf); + testLogicalTypeAnnotation("binary", "a", null, conf); + testLogicalTypeAnnotation("timestamp", "a", + LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), conf); + testLogicalTypeAnnotation("char(3)", "a", LogicalTypeAnnotation.stringType(), conf); + testLogicalTypeAnnotation("varchar(30)", "a", LogicalTypeAnnotation.stringType(), conf); + testLogicalTypeAnnotation("decimal(7,2)", "a", LogicalTypeAnnotation.decimalType(2, 7), conf); } @Test diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java index bc5e560629..fd1b076c78 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java @@ -25,8 +25,9 @@ import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.ParquetTimestampUtils; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.junit.Assert; -import junit.framework.TestCase; import org.junit.Test; @@ -291,4 +292,130 @@ public void testConvertTimestampToZone() { Timestamp ts2 = TimestampTZUtil.convertTimestampToZone(ts, US_PACIFIC, NEW_YORK); Assert.assertTrue(Timestamp.valueOf("2018-01-01 03:00:00").equals(ts2)); } + + ///////////// INT64/LogicalTypes Timestamp tests ///////////// + + @Test + public void testInt64ExactValue() { + + // basic value + Timestamp ts1 = Timestamp.valueOf("2011-01-01 00:30:00.000000001"); + long millis = ParquetTimestampUtils.getInt64(ts1, LogicalTypeAnnotation.TimeUnit.MILLIS); + Assert.assertEquals(1293841800_000L, millis); + long micros = ParquetTimestampUtils.getInt64(ts1, LogicalTypeAnnotation.TimeUnit.MICROS); + Assert.assertEquals(1293841800000_000L, micros); + long nanos = ParquetTimestampUtils.getInt64(ts1, LogicalTypeAnnotation.TimeUnit.NANOS); + Assert.assertEquals(1293841800000_000_001L, nanos); + + // test correct conversion to time units + Timestamp ts2 = Timestamp.valueOf("2011-01-01 00:30:00.55555"); + millis = ParquetTimestampUtils.getInt64(ts2, LogicalTypeAnnotation.TimeUnit.MILLIS); + Assert.assertEquals(1293841800_555L, millis); + micros = ParquetTimestampUtils.getInt64(ts2, LogicalTypeAnnotation.TimeUnit.MICROS); + Assert.assertEquals(1293841800_555_550L, micros); + nanos = ParquetTimestampUtils.getInt64(ts2, LogicalTypeAnnotation.TimeUnit.NANOS); + Assert.assertEquals(1293841800_555_550_000L, nanos); + + // test max nanos + Timestamp ts3 = Timestamp.valueOf("2018-12-31 23:59:59.999999999"); + millis = ParquetTimestampUtils.getInt64(ts3, LogicalTypeAnnotation.TimeUnit.MILLIS); + Assert.assertEquals(1546300799_999L, millis); + micros = ParquetTimestampUtils.getInt64(ts3, LogicalTypeAnnotation.TimeUnit.MICROS); + Assert.assertEquals(1546300799_999_999L, micros); + nanos = ParquetTimestampUtils.getInt64(ts3, LogicalTypeAnnotation.TimeUnit.NANOS); + Assert.assertEquals(1546300799_999_999_999L, nanos); + + // test pre-epoch date + Timestamp ts4 = Timestamp.valueOf("1968-01-31 00:30:00.000000001"); + 
millis = ParquetTimestampUtils.getInt64(ts4, LogicalTypeAnnotation.TimeUnit.MILLIS); + Assert.assertEquals(-60564600_000L, millis); + micros = ParquetTimestampUtils.getInt64(ts4, LogicalTypeAnnotation.TimeUnit.MICROS); + Assert.assertEquals(-60564600000_000L, micros); + nanos = ParquetTimestampUtils.getInt64(ts4, LogicalTypeAnnotation.TimeUnit.NANOS); + Assert.assertEquals(-60564599999_999_999L, nanos); + } + + @Test + public void testLegalInt64TimestampStrings() { + //exercise a broad range of timestamps close to the present. + verifyInt64TimestampValue("2011-01-01 01:01:01.111111111", true); + verifyInt64TimestampValue("2012-02-02 02:02:02.222222222", true); + verifyInt64TimestampValue("2013-03-03 03:03:03.333333333", true); + verifyInt64TimestampValue("2014-04-04 04:04:04.444444444", true); + verifyInt64TimestampValue("2015-05-05 05:05:05.555555555", true); + verifyInt64TimestampValue("2016-06-06 06:06:06.666666666", true); + verifyInt64TimestampValue("2017-07-07 07:07:07.777777777", true); + verifyInt64TimestampValue("2018-08-08 08:08:08.888888888", true); + verifyInt64TimestampValue("2019-09-09 09:09:09.999999999", true); + verifyInt64TimestampValue("2020-10-10 10:10:10.101010101", true); + verifyInt64TimestampValue("2021-11-11 11:11:11.111111111", true); + verifyInt64TimestampValue("2022-12-12 12:12:12.121212121", true); + verifyInt64TimestampValue("2023-01-02 13:13:13.131313131", true); + verifyInt64TimestampValue("2024-02-02 14:14:14.141414141", true); + verifyInt64TimestampValue("2025-03-03 15:15:15.151515151", true); + verifyInt64TimestampValue("2026-04-04 16:16:16.161616161", true); + verifyInt64TimestampValue("2027-05-05 17:17:17.171717171", true); + verifyInt64TimestampValue("2028-06-06 18:18:18.181818181", true); + verifyInt64TimestampValue("2029-07-07 19:19:19.191919191", true); + verifyInt64TimestampValue("2030-08-08 20:20:20.202020202", true); + verifyInt64TimestampValue("2031-09-09 21:21:21.212121212", true); + + //test values around epoch + verifyInt64TimestampValue("1969-12-31 23:59:58.123456789", true); + verifyInt64TimestampValue("1969-12-31 23:59:59.999999999", true); + verifyInt64TimestampValue("1970-01-01 00:00:00.0", true); + verifyInt64TimestampValue("1970-01-01 00:00:00.000000001", true); + + //test min and max values for nano + verifyInt64TimestampValue("1677-09-21 00:12:43.145224192", LogicalTypeAnnotation.TimeUnit.NANOS, true); + verifyInt64TimestampValue("2262-04-11 23:47:16.854775807", LogicalTypeAnnotation.TimeUnit.NANOS, true); + + //test some extreme cases. + verifyInt64TimestampValue("0001-01-01 00:00:00.001001001", LogicalTypeAnnotation.TimeUnit.MILLIS, true); + verifyInt64TimestampValue("0001-01-01 00:00:00.001001001", LogicalTypeAnnotation.TimeUnit.MICROS, true); + verifyInt64TimestampValue("9999-09-09 09:09:09.999999999", LogicalTypeAnnotation.TimeUnit.MILLIS, true); + verifyInt64TimestampValue("9999-09-09 09:09:09.999999999", LogicalTypeAnnotation.TimeUnit.MICROS, true); + } + + /** + * Timestamps 1 nanosecond out of nano range should return null. 
+ */ + @Test + public void testIllegalInt64TimestampStrings() { + verifyInt64TimestampValue("1677-09-21 00:12:43.145224191", LogicalTypeAnnotation.TimeUnit.NANOS, false); + verifyInt64TimestampValue("2262-04-11 23:47:16.854775808", LogicalTypeAnnotation.TimeUnit.NANOS, false); + } + + private void verifyInt64TimestampValue(String tsString, boolean legal) { + verifyInt64TimestampValue(tsString, LogicalTypeAnnotation.TimeUnit.MILLIS, legal); + verifyInt64TimestampValue(tsString, LogicalTypeAnnotation.TimeUnit.MICROS, legal); + verifyInt64TimestampValue(tsString, LogicalTypeAnnotation.TimeUnit.NANOS, legal); + } + + private void verifyInt64TimestampValue(String tsString, LogicalTypeAnnotation.TimeUnit timeUnit, boolean legal) { + Timestamp ts = truncateTimestampString(tsString, timeUnit); + String truncatedTsString = ts.toString(); + Long int64Value = ParquetTimestampUtils.getInt64(ts, timeUnit); + if (legal) { + Timestamp tsFetched = ParquetTimestampUtils.getTimestamp(int64Value, timeUnit, false); + Assert.assertEquals(truncatedTsString, tsFetched.toString()); + } else { + Assert.assertEquals(null, int64Value); + } + } + + private Timestamp truncateTimestampString(String tsString, LogicalTypeAnnotation.TimeUnit timeUnit) { + Timestamp ts = Timestamp.valueOf(tsString); + switch (timeUnit) { + case MILLIS: + ts.setNanos(ts.getNanos() / 1_000_000 * 1_000_000); + break; + case MICROS: + ts.setNanos(ts.getNanos() / 1_000 * 1_000); + break; + default: + break; + } + return ts; + } } diff --git ql/src/test/queries/clientpositive/parquet_int64_timestamp.q ql/src/test/queries/clientpositive/parquet_int64_timestamp.q new file mode 100644 index 0000000000..4de315c93f --- /dev/null +++ ql/src/test/queries/clientpositive/parquet_int64_timestamp.q @@ -0,0 +1,50 @@ +set hive.parquet.write.int64.timestamp=true; +set time zone Asia/Singapore; + +--store timestamps as strings for copying into different schemas +create table parquet_timestamp_staging (i int, s string); + +insert into parquet_timestamp_staging +values +(0, '0001-01-01'), +(1, '1677-09-21 00:12:43.145224192'), +(2, '1969-12-31 23:59:59.99999999999999999'), +(3, '1970-01-01 00:00:00'), +(4, '2013-09-27 01:36:18.000000001'), +(5, '2018-01-02 13:14:15.678999'), +(6, '2262-04-11 23:47:16.854775807'), +(7, '9999-12-31 23:59:59.999999999999'); + +create table parquet_int64_timestamp (i int, ts timestamp) stored as parquet; + +--test nanoseconds read/write +set hive.parquet.timestamp.time.unit=nanos; +insert into parquet_int64_timestamp select i, cast (s as timestamp) from parquet_timestamp_staging; +select * from parquet_int64_timestamp order by i; + +--test microseconds read/write +set hive.parquet.timestamp.time.unit=micros; +insert into parquet_int64_timestamp select i + 10, cast (s as timestamp) from parquet_timestamp_staging; +select * from parquet_int64_timestamp order by i; + +--test milliseconds read/write +set hive.parquet.timestamp.time.unit=millis; +insert into parquet_int64_timestamp select i + 20, cast (s as timestamp) from parquet_timestamp_staging; +select * from parquet_int64_timestamp order by i; + + +--time zone should not affect values, since timestamp is time zone agnostic +set time zone America/Buenos_Aires; + +--test filters +select * from parquet_int64_timestamp where ts > '1969-12-31 23:59:59.9' + and + ts < '1970-01-01 00:00:00.0' order by i; +select * from parquet_int64_timestamp where ts <= '1970-01-01 00:00:00.0' + and + ts >= '1970-01-01 00:00:00.0' order by i; +select * from parquet_int64_timestamp where ts = '1970-01-01 
00:00:00.0' order by i; + +select * from parquet_int64_timestamp where ts between cast ('1969-12-31 23:59:59.9' as timestamp) and + cast ('1970-01-01 00:00:00' as timestamp) + order by i; diff --git ql/src/test/queries/clientpositive/parquet_int64_timestamp_int96_compatibility.q ql/src/test/queries/clientpositive/parquet_int64_timestamp_int96_compatibility.q new file mode 100644 index 0000000000..8499529b96 --- /dev/null +++ ql/src/test/queries/clientpositive/parquet_int64_timestamp_int96_compatibility.q @@ -0,0 +1,50 @@ +--store timestamps as strings for copying into different schemas +create table parquet_timestamp_staging_2 (i int, s string); +insert into parquet_timestamp_staging_2 +values +(0, '0001-01-01'), +(1, '1677-09-21 00:12:43.145224192'), +(2, '1969-12-31 23:59:59.99999999999999999'), +(3, '1970-01-01 00:00:00'), +(4, '2013-09-27 01:36:18.000000001'), +(5, '2018-01-02 13:14:15.678999'), +(6, '2262-04-11 23:47:16.854775807'), +(7, '9999-12-31 23:59:59.999999999999'); + + +--make int64 table with microsecond granularity +set hive.parquet.write.int64.timestamp=true; +set hive.parquet.timestamp.time.unit=micros; +create table parquet_int64 (i int, ts timestamp) stored as parquet; +insert into parquet_int64 select i, cast (s as timestamp) from parquet_timestamp_staging_2; + +--make int96 table +set hive.parquet.write.int64.timestamp=false; +create table parquet_int96 (i int, ts timestamp) stored as parquet; +insert into parquet_int96 select i + 10, cast (s as timestamp) from parquet_timestamp_staging_2; + +--join int64 and int96 tables +select parquet_int64.i, parquet_int64.ts, parquet_int96.ts + from parquet_int64 + join parquet_int96 + on parquet_int64.ts = parquet_int96.ts + order by parquet_int64.i; + + +--create table with mixed int64/milli and int96 values +set hive.parquet.write.int64.timestamp=true; +set hive.parquet.timestamp.time.unit=millis; +create table parquet_mixed_timestamp as select * from parquet_int64; +set hive.parquet.write.int64.timestamp=false; +insert into parquet_mixed_timestamp select i + 10, cast (s as timestamp) from parquet_timestamp_staging_2; + +select * from parquet_mixed_timestamp order by i; +select * from parquet_mixed_timestamp where ts > cast ('2200-01-01 00:00:00.00' as timestamp) order by i; +select * from parquet_mixed_timestamp where ts < cast ('1900-12-31 23:59:59.9999999999' as timestamp) order by i; +select count(*) from parquet_mixed_timestamp where ts = cast ('1970-01-01 00:00:00.00' as timestamp); +--join mixed table and int64 table +select * + from parquet_mixed_timestamp + join parquet_int64 + on parquet_mixed_timestamp.ts = parquet_int64.ts + order by parquet_mixed_timestamp.i; diff --git ql/src/test/results/clientpositive/parquet_int64_timestamp.q.out ql/src/test/results/clientpositive/parquet_int64_timestamp.q.out new file mode 100644 index 0000000000..c91be1179e --- /dev/null +++ ql/src/test/results/clientpositive/parquet_int64_timestamp.q.out @@ -0,0 +1,205 @@ +PREHOOK: query: create table parquet_timestamp_staging (i int, s string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_timestamp_staging +POSTHOOK: query: create table parquet_timestamp_staging (i int, s string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_timestamp_staging +PREHOOK: query: insert into parquet_timestamp_staging +values +(0, '0001-01-01'), +(1, '1677-09-21 00:12:43.145224192'), +(2, '1969-12-31 23:59:59.99999999999999999'), +(3, '1970-01-01 00:00:00'), 
+(4, '2013-09-27 01:36:18.000000001'), +(5, '2018-01-02 13:14:15.678999'), +(6, '2262-04-11 23:47:16.854775807'), +(7, '9999-12-31 23:59:59.999999999999') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@parquet_timestamp_staging +POSTHOOK: query: insert into parquet_timestamp_staging +values +(0, '0001-01-01'), +(1, '1677-09-21 00:12:43.145224192'), +(2, '1969-12-31 23:59:59.99999999999999999'), +(3, '1970-01-01 00:00:00'), +(4, '2013-09-27 01:36:18.000000001'), +(5, '2018-01-02 13:14:15.678999'), +(6, '2262-04-11 23:47:16.854775807'), +(7, '9999-12-31 23:59:59.999999999999') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@parquet_timestamp_staging +POSTHOOK: Lineage: parquet_timestamp_staging.i SCRIPT [] +POSTHOOK: Lineage: parquet_timestamp_staging.s SCRIPT [] +PREHOOK: query: create table parquet_int64_timestamp (i int, ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_int64_timestamp +POSTHOOK: query: create table parquet_int64_timestamp (i int, ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_int64_timestamp +PREHOOK: query: insert into parquet_int64_timestamp select i, cast (s as timestamp) from parquet_timestamp_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_timestamp_staging +PREHOOK: Output: default@parquet_int64_timestamp +POSTHOOK: query: insert into parquet_int64_timestamp select i, cast (s as timestamp) from parquet_timestamp_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_timestamp_staging +POSTHOOK: Output: default@parquet_int64_timestamp +POSTHOOK: Lineage: parquet_int64_timestamp.i SIMPLE [(parquet_timestamp_staging)parquet_timestamp_staging.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_int64_timestamp.ts EXPRESSION [(parquet_timestamp_staging)parquet_timestamp_staging.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: select * from parquet_int64_timestamp order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_int64_timestamp order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +0 NULL +1 1677-09-21 00:12:43.145224192 +2 1969-12-31 23:59:59.999999999 +3 1970-01-01 00:00:00 +4 2013-09-27 01:36:18.000000001 +5 2018-01-02 13:14:15.678999 +6 2262-04-11 23:47:16.854775807 +7 NULL +PREHOOK: query: insert into parquet_int64_timestamp select i + 10, cast (s as timestamp) from parquet_timestamp_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_timestamp_staging +PREHOOK: Output: default@parquet_int64_timestamp +POSTHOOK: query: insert into parquet_int64_timestamp select i + 10, cast (s as timestamp) from parquet_timestamp_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_timestamp_staging +POSTHOOK: Output: default@parquet_int64_timestamp +POSTHOOK: Lineage: parquet_int64_timestamp.i EXPRESSION [(parquet_timestamp_staging)parquet_timestamp_staging.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_int64_timestamp.ts EXPRESSION [(parquet_timestamp_staging)parquet_timestamp_staging.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: select * from parquet_int64_timestamp order by i +PREHOOK: type: QUERY +PREHOOK: Input: 
default@parquet_int64_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_int64_timestamp order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +0 NULL +1 1677-09-21 00:12:43.145224192 +2 1969-12-31 23:59:59.999999999 +3 1970-01-01 00:00:00 +4 2013-09-27 01:36:18.000000001 +5 2018-01-02 13:14:15.678999 +6 2262-04-11 23:47:16.854775807 +7 NULL +10 0001-01-01 00:00:00 +11 1677-09-21 00:12:43.145224 +12 1969-12-31 23:59:59.999999 +13 1970-01-01 00:00:00 +14 2013-09-27 01:36:18 +15 2018-01-02 13:14:15.678999 +16 2262-04-11 23:47:16.854775 +17 9999-12-31 23:59:59.999999 +PREHOOK: query: insert into parquet_int64_timestamp select i + 20, cast (s as timestamp) from parquet_timestamp_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_timestamp_staging +PREHOOK: Output: default@parquet_int64_timestamp +POSTHOOK: query: insert into parquet_int64_timestamp select i + 20, cast (s as timestamp) from parquet_timestamp_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_timestamp_staging +POSTHOOK: Output: default@parquet_int64_timestamp +POSTHOOK: Lineage: parquet_int64_timestamp.i EXPRESSION [(parquet_timestamp_staging)parquet_timestamp_staging.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_int64_timestamp.ts EXPRESSION [(parquet_timestamp_staging)parquet_timestamp_staging.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: select * from parquet_int64_timestamp order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_int64_timestamp order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +0 NULL +1 1677-09-21 00:12:43.145224192 +2 1969-12-31 23:59:59.999999999 +3 1970-01-01 00:00:00 +4 2013-09-27 01:36:18.000000001 +5 2018-01-02 13:14:15.678999 +6 2262-04-11 23:47:16.854775807 +7 NULL +10 0001-01-01 00:00:00 +11 1677-09-21 00:12:43.145224 +12 1969-12-31 23:59:59.999999 +13 1970-01-01 00:00:00 +14 2013-09-27 01:36:18 +15 2018-01-02 13:14:15.678999 +16 2262-04-11 23:47:16.854775 +17 9999-12-31 23:59:59.999999 +20 0001-01-01 00:00:00 +21 1677-09-21 00:12:43.145 +22 1969-12-31 23:59:59.999 +23 1970-01-01 00:00:00 +24 2013-09-27 01:36:18 +25 2018-01-02 13:14:15.678 +26 2262-04-11 23:47:16.854 +27 9999-12-31 23:59:59.999 +PREHOOK: query: select * from parquet_int64_timestamp where ts > '1969-12-31 23:59:59.9' + and + ts < '1970-01-01 00:00:00.0' order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_int64_timestamp where ts > '1969-12-31 23:59:59.9' + and + ts < '1970-01-01 00:00:00.0' order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +2 1969-12-31 23:59:59.999999999 +12 1969-12-31 23:59:59.999999 +22 1969-12-31 23:59:59.999 +PREHOOK: query: select * from parquet_int64_timestamp where ts <= '1970-01-01 00:00:00.0' + and + ts >= '1970-01-01 00:00:00.0' order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_int64_timestamp where ts <= '1970-01-01 00:00:00.0' + and + ts >= '1970-01-01 00:00:00.0' order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was 
here #### +3 1970-01-01 00:00:00 +13 1970-01-01 00:00:00 +23 1970-01-01 00:00:00 +PREHOOK: query: select * from parquet_int64_timestamp where ts = '1970-01-01 00:00:00.0' order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_int64_timestamp where ts = '1970-01-01 00:00:00.0' order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +3 1970-01-01 00:00:00 +13 1970-01-01 00:00:00 +23 1970-01-01 00:00:00 +PREHOOK: query: select * from parquet_int64_timestamp where ts between cast ('1969-12-31 23:59:59.9' as timestamp) and + cast ('1970-01-01 00:00:00' as timestamp) + order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_int64_timestamp where ts between cast ('1969-12-31 23:59:59.9' as timestamp) and + cast ('1970-01-01 00:00:00' as timestamp) + order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64_timestamp +#### A masked pattern was here #### +2 1969-12-31 23:59:59.999999999 +3 1970-01-01 00:00:00 +12 1969-12-31 23:59:59.999999 +13 1970-01-01 00:00:00 +22 1969-12-31 23:59:59.999 +23 1970-01-01 00:00:00 diff --git ql/src/test/results/clientpositive/parquet_int64_timestamp_int96_compatibility.q.out ql/src/test/results/clientpositive/parquet_int64_timestamp_int96_compatibility.q.out new file mode 100644 index 0000000000..72fc69a1c8 --- /dev/null +++ ql/src/test/results/clientpositive/parquet_int64_timestamp_int96_compatibility.q.out @@ -0,0 +1,201 @@ +PREHOOK: query: create table parquet_timestamp_staging_2 (i int, s string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_timestamp_staging_2 +POSTHOOK: query: create table parquet_timestamp_staging_2 (i int, s string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_timestamp_staging_2 +PREHOOK: query: insert into parquet_timestamp_staging_2 +values +(0, '0001-01-01'), +(1, '1677-09-21 00:12:43.145224192'), +(2, '1969-12-31 23:59:59.99999999999999999'), +(3, '1970-01-01 00:00:00'), +(4, '2013-09-27 01:36:18.000000001'), +(5, '2018-01-02 13:14:15.678999'), +(6, '2262-04-11 23:47:16.854775807'), +(7, '9999-12-31 23:59:59.999999999999') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@parquet_timestamp_staging_2 +POSTHOOK: query: insert into parquet_timestamp_staging_2 +values +(0, '0001-01-01'), +(1, '1677-09-21 00:12:43.145224192'), +(2, '1969-12-31 23:59:59.99999999999999999'), +(3, '1970-01-01 00:00:00'), +(4, '2013-09-27 01:36:18.000000001'), +(5, '2018-01-02 13:14:15.678999'), +(6, '2262-04-11 23:47:16.854775807'), +(7, '9999-12-31 23:59:59.999999999999') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@parquet_timestamp_staging_2 +POSTHOOK: Lineage: parquet_timestamp_staging_2.i SCRIPT [] +POSTHOOK: Lineage: parquet_timestamp_staging_2.s SCRIPT [] +PREHOOK: query: create table parquet_int64 (i int, ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_int64 +POSTHOOK: query: create table parquet_int64 (i int, ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_int64 +PREHOOK: query: insert into parquet_int64 
select i, cast (s as timestamp) from parquet_timestamp_staging_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_timestamp_staging_2 +PREHOOK: Output: default@parquet_int64 +POSTHOOK: query: insert into parquet_int64 select i, cast (s as timestamp) from parquet_timestamp_staging_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_timestamp_staging_2 +POSTHOOK: Output: default@parquet_int64 +POSTHOOK: Lineage: parquet_int64.i SIMPLE [(parquet_timestamp_staging_2)parquet_timestamp_staging_2.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_int64.ts EXPRESSION [(parquet_timestamp_staging_2)parquet_timestamp_staging_2.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: create table parquet_int96 (i int, ts timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_int96 +POSTHOOK: query: create table parquet_int96 (i int, ts timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_int96 +PREHOOK: query: insert into parquet_int96 select i + 10, cast (s as timestamp) from parquet_timestamp_staging_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_timestamp_staging_2 +PREHOOK: Output: default@parquet_int96 +POSTHOOK: query: insert into parquet_int96 select i + 10, cast (s as timestamp) from parquet_timestamp_staging_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_timestamp_staging_2 +POSTHOOK: Output: default@parquet_int96 +POSTHOOK: Lineage: parquet_int96.i EXPRESSION [(parquet_timestamp_staging_2)parquet_timestamp_staging_2.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_int96.ts EXPRESSION [(parquet_timestamp_staging_2)parquet_timestamp_staging_2.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: select parquet_int64.i, parquet_int64.ts, parquet_int96.ts + from parquet_int64 + join parquet_int96 + on parquet_int64.ts = parquet_int96.ts + order by parquet_int64.i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64 +PREHOOK: Input: default@parquet_int96 +#### A masked pattern was here #### +POSTHOOK: query: select parquet_int64.i, parquet_int64.ts, parquet_int96.ts + from parquet_int64 + join parquet_int96 + on parquet_int64.ts = parquet_int96.ts + order by parquet_int64.i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64 +POSTHOOK: Input: default@parquet_int96 +#### A masked pattern was here #### +0 0001-01-01 00:00:00 0001-01-01 00:00:00 +3 1970-01-01 00:00:00 1970-01-01 00:00:00 +5 2018-01-02 13:14:15.678999 2018-01-02 13:14:15.678999 +PREHOOK: query: create table parquet_mixed_timestamp as select * from parquet_int64 +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@parquet_int64 +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_mixed_timestamp +POSTHOOK: query: create table parquet_mixed_timestamp as select * from parquet_int64 +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@parquet_int64 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_mixed_timestamp +POSTHOOK: Lineage: parquet_mixed_timestamp.i SIMPLE [(parquet_int64)parquet_int64.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_mixed_timestamp.ts SIMPLE [(parquet_int64)parquet_int64.FieldSchema(name:ts, type:timestamp, comment:null), ] +PREHOOK: query: insert into parquet_mixed_timestamp select i + 10, cast (s as timestamp) from parquet_timestamp_staging_2 +PREHOOK: 
type: QUERY +PREHOOK: Input: default@parquet_timestamp_staging_2 +PREHOOK: Output: default@parquet_mixed_timestamp +POSTHOOK: query: insert into parquet_mixed_timestamp select i + 10, cast (s as timestamp) from parquet_timestamp_staging_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_timestamp_staging_2 +POSTHOOK: Output: default@parquet_mixed_timestamp +POSTHOOK: Lineage: parquet_mixed_timestamp.i EXPRESSION [(parquet_timestamp_staging_2)parquet_timestamp_staging_2.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_mixed_timestamp.ts EXPRESSION [(parquet_timestamp_staging_2)parquet_timestamp_staging_2.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: select * from parquet_mixed_timestamp order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_mixed_timestamp order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +0 0001-01-01 00:00:00 +1 1677-09-21 00:12:43.145224 +2 1969-12-31 23:59:59.999999 +3 1970-01-01 00:00:00 +4 2013-09-27 01:36:18 +5 2018-01-02 13:14:15.678999 +6 2262-04-11 23:47:16.854775 +7 9999-12-31 23:59:59.999999 +10 0001-01-01 00:00:00 +11 1677-09-21 00:12:43.145224192 +12 1969-12-31 23:59:59.999999999 +13 1970-01-01 00:00:00 +14 2013-09-27 01:36:18.000000001 +15 2018-01-02 13:14:15.678999 +16 2262-04-11 23:47:16.854775807 +17 9999-12-31 23:59:59.999999999 +PREHOOK: query: select * from parquet_mixed_timestamp where ts > cast ('2200-01-01 00:00:00.00' as timestamp) order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_mixed_timestamp where ts > cast ('2200-01-01 00:00:00.00' as timestamp) order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +6 2262-04-11 23:47:16.854775 +7 9999-12-31 23:59:59.999999 +16 2262-04-11 23:47:16.854775807 +17 9999-12-31 23:59:59.999999999 +PREHOOK: query: select * from parquet_mixed_timestamp where ts < cast ('1900-12-31 23:59:59.9999999999' as timestamp) order by i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_mixed_timestamp where ts < cast ('1900-12-31 23:59:59.9999999999' as timestamp) order by i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +0 0001-01-01 00:00:00 +1 1677-09-21 00:12:43.145224 +10 0001-01-01 00:00:00 +11 1677-09-21 00:12:43.145224192 +PREHOOK: query: select count(*) from parquet_mixed_timestamp where ts = cast ('1970-01-01 00:00:00.00' as timestamp) +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_mixed_timestamp where ts = cast ('1970-01-01 00:00:00.00' as timestamp) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +2 +PREHOOK: query: select * + from parquet_mixed_timestamp + join parquet_int64 + on parquet_mixed_timestamp.ts = parquet_int64.ts + order by parquet_mixed_timestamp.i +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_int64 +PREHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +POSTHOOK: query: select * + from parquet_mixed_timestamp + join parquet_int64 + on 
parquet_mixed_timestamp.ts = parquet_int64.ts + order by parquet_mixed_timestamp.i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_int64 +POSTHOOK: Input: default@parquet_mixed_timestamp +#### A masked pattern was here #### +0 0001-01-01 00:00:00 0 0001-01-01 00:00:00 +1 1677-09-21 00:12:43.145224 1 1677-09-21 00:12:43.145224 +2 1969-12-31 23:59:59.999999 2 1969-12-31 23:59:59.999999 +3 1970-01-01 00:00:00 3 1970-01-01 00:00:00 +4 2013-09-27 01:36:18 4 2013-09-27 01:36:18 +5 2018-01-02 13:14:15.678999 5 2018-01-02 13:14:15.678999 +6 2262-04-11 23:47:16.854775 6 2262-04-11 23:47:16.854775 +7 9999-12-31 23:59:59.999999 7 9999-12-31 23:59:59.999999 +10 0001-01-01 00:00:00 0 0001-01-01 00:00:00 +13 1970-01-01 00:00:00 3 1970-01-01 00:00:00 +15 2018-01-02 13:14:15.678999 5 2018-01-02 13:14:15.678999
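For illustration only (not part of the patch): a minimal sketch of how the int64 conversion helpers added above behave, assuming the Hive org.apache.hadoop.hive.common.type.Timestamp and Parquet LogicalTypeAnnotation.TimeUnit classes used in the patch; the class name Int64TimestampRoundTrip and the sample values are hypothetical, and the third argument to getTimestamp (false) simply mirrors how the tests above read values back.

```java
import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.ParquetTimestampUtils;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit;

public class Int64TimestampRoundTrip {
  public static void main(String[] args) {
    // Hive timestamps are time zone agnostic local values, interpreted against UTC for epoch math.
    Timestamp ts = Timestamp.valueOf("2013-09-27 01:36:18.000000001");

    // MILLIS/MICROS truncate the extra precision; NANOS keeps the full value.
    Long millis = ParquetTimestampUtils.getInt64(ts, TimeUnit.MILLIS); // 1380245778000
    Long micros = ParquetTimestampUtils.getInt64(ts, TimeUnit.MICROS); // 1380245778000000
    Long nanos = ParquetTimestampUtils.getInt64(ts, TimeUnit.NANOS);   // 1380245778000000001
    System.out.println(millis + " " + micros + " " + nanos);

    // Reading the int64 value back restores the same (possibly truncated) local value.
    Timestamp readBack = ParquetTimestampUtils.getTimestamp(nanos, TimeUnit.NANOS, false);
    System.out.println(readBack); // 2013-09-27 01:36:18.000000001

    // One nanosecond outside the representable nano range overflows a long, so getInt64
    // returns null; the writer's isValidTimestamp check then skips the field (stored as NULL).
    Timestamp tooEarly = Timestamp.valueOf("1677-09-21 00:12:43.145224191");
    System.out.println(ParquetTimestampUtils.getInt64(tooEarly, TimeUnit.NANOS)); // null
  }
}
```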