diff --git pom.xml pom.xml index 240472a30e..c7547decb0 100644 --- pom.xml +++ pom.xml @@ -192,7 +192,7 @@ 2.0.0-M5 4.1.17.Final 3.10.5.Final - 1.10.0 + 1.11.0 0.16.0 1.5.6 2.5.0 @@ -234,6 +234,12 @@ false + + + parquet + parquet staging repository + http://repository.apache.org/content/groups/staging + glassfish-repository http://maven.glassfish.org/content/groups/glassfish diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java index 9010ac36cd..197a63de97 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java @@ -16,6 +16,7 @@ import java.math.BigDecimal; import java.util.ArrayList; import java.util.Map; +import java.util.Optional; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.conf.HiveConf; @@ -41,7 +42,11 @@ import org.apache.parquet.column.Dictionary; import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; +import org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveType; /** @@ -337,10 +342,7 @@ public void addInt(final int value) { return new PrimitiveConverter() { @Override public void addInt(final int value) { - if (value >= ((OriginalType.UINT_8 == type.getOriginalType() || - OriginalType.UINT_16 == type.getOriginalType() || - OriginalType.UINT_32 == type.getOriginalType() || - OriginalType.UINT_64 == type.getOriginalType()) ? 0 : + if (value >= ((ETypeConverter.isUnsignedInteger(type)) ? 0 : Integer.MIN_VALUE)) { parent.set(index, new IntWritable(value)); } else { @@ -444,10 +446,7 @@ public void addLong(final long value) { return new PrimitiveConverter() { @Override public void addLong(final long value) { - if (value >= ((OriginalType.UINT_8 == type.getOriginalType() || - OriginalType.UINT_16 == type.getOriginalType() || - OriginalType.UINT_32 == type.getOriginalType() || - OriginalType.UINT_64 == type.getOriginalType()) ? 0 : Long.MIN_VALUE)) { + if (value >= ((ETypeConverter.isUnsignedInteger(type)) ? 
0 : Long.MIN_VALUE)) { parent.set(index, new LongWritable(value)); } else { parent.set(index, null); @@ -586,7 +585,9 @@ protected HiveDecimalWritable convert(Binary binary) { return new BinaryConverter(type, parent, index) { @Override protected HiveDecimalWritable convert(Binary binary) { - return new HiveDecimalWritable(binary.getBytes(), type.getDecimalMetadata().getScale()); + DecimalLogicalTypeAnnotation logicalType = + (DecimalLogicalTypeAnnotation) type.getLogicalTypeAnnotation(); + return new HiveDecimalWritable(binary.getBytes(), logicalType.getScale()); } }; } @@ -634,17 +635,33 @@ private ETypeConverter(final Class<?> type) { abstract PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, TypeInfo hiveTypeInfo); public static PrimitiveConverter getNewConverter(final PrimitiveType type, final int index, - final ConverterParent parent, TypeInfo hiveTypeInfo) { + final ConverterParent parent, final TypeInfo hiveTypeInfo) { if (type.isPrimitive() && (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96))) { //TODO- cleanup once parquet support Timestamp type annotation. return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo); } - if (OriginalType.DECIMAL == type.getOriginalType()) { - return EDECIMAL_CONVERTER.getConverter(type, index, parent, hiveTypeInfo); - } else if (OriginalType.UTF8 == type.getOriginalType()) { - return ESTRING_CONVERTER.getConverter(type, index, parent, hiveTypeInfo); - } else if (OriginalType.DATE == type.getOriginalType()) { - return EDATE_CONVERTER.getConverter(type, index, parent, hiveTypeInfo); + if (type.getLogicalTypeAnnotation() != null) { + Optional<PrimitiveConverter> converter = type.getLogicalTypeAnnotation() + .accept(new LogicalTypeAnnotationVisitor<PrimitiveConverter>() { + @Override + public Optional<PrimitiveConverter> visit(DecimalLogicalTypeAnnotation logicalTypeAnnotation) { + return Optional.of(EDECIMAL_CONVERTER.getConverter(type, index, parent, hiveTypeInfo)); + } + + @Override + public Optional<PrimitiveConverter> visit(StringLogicalTypeAnnotation logicalTypeAnnotation) { + return Optional.of(ESTRING_CONVERTER.getConverter(type, index, parent, hiveTypeInfo)); + } + + @Override + public Optional<PrimitiveConverter> visit(DateLogicalTypeAnnotation logicalTypeAnnotation) { + return Optional.of(EDATE_CONVERTER.getConverter(type, index, parent, hiveTypeInfo)); + } + }); + + if (converter.isPresent()) { + return converter.get(); + } } Class<?> javaType = type.getPrimitiveTypeName().javaType; @@ -657,11 +674,24 @@ public static PrimitiveConverter getNewConverter(final PrimitiveType type, final throw new IllegalArgumentException("Converter not found ... 
for type : " + type); } + public static boolean isUnsignedInteger(final PrimitiveType type) { + if (type.getLogicalTypeAnnotation() != null) { + Optional<Boolean> isUnsignedInteger = type.getLogicalTypeAnnotation() + .accept(new LogicalTypeAnnotationVisitor<Boolean>() { + @Override public Optional<Boolean> visit( + LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + return Optional.of(!intLogicalType.isSigned()); + } + }); + if (isUnsignedInteger.isPresent()) { + return isUnsignedInteger.get(); + } + } + return false; + } + private static long getMinValue(final PrimitiveType type, String typeName, long defaultValue) { - if (OriginalType.UINT_8 == type.getOriginalType() || - OriginalType.UINT_16 == type.getOriginalType() || - OriginalType.UINT_32 == type.getOriginalType() || - OriginalType.UINT_64 == type.getOriginalType()) { + if (isUnsignedInteger(type)) { return 0; } else { switch (typeName) { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java index c0124575ea..a13a549e12 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java @@ -19,11 +19,15 @@ import org.apache.parquet.io.api.GroupConverter; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; +import org.apache.parquet.schema.LogicalTypeAnnotation.MapKeyValueTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.MapLogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; import java.util.Map; +import java.util.Optional; public abstract class HiveGroupConverter extends GroupConverter implements ConverterParent { @@ -46,17 +50,34 @@ protected static PrimitiveConverter getConverterFromDescription(PrimitiveType ty return ETypeConverter.getNewConverter(type, index, parent, hiveTypeInfo); } - protected static HiveGroupConverter getConverterFromDescription(GroupType type, int index, ConverterParent parent, - TypeInfo hiveTypeInfo) { + protected static HiveGroupConverter getConverterFromDescription(final GroupType type, + final int index, final ConverterParent parent, final TypeInfo hiveTypeInfo) { if (type == null) { return null; } - OriginalType annotation = type.getOriginalType(); - if (annotation == OriginalType.LIST) { - return HiveCollectionConverter.forList(type, parent, index, hiveTypeInfo); - } else if (annotation == OriginalType.MAP || annotation == OriginalType.MAP_KEY_VALUE) { - return HiveCollectionConverter.forMap(type, parent, index, hiveTypeInfo); + if (type.getLogicalTypeAnnotation() != null) { + Optional<HiveGroupConverter> converter = + type.getLogicalTypeAnnotation().accept(new LogicalTypeAnnotationVisitor<HiveGroupConverter>(){ + @Override + public Optional<HiveGroupConverter> visit(ListLogicalTypeAnnotation logicalTypeAnnotation) { + return Optional.of(HiveCollectionConverter.forList(type, parent, index, hiveTypeInfo)); + } + + @Override + public Optional<HiveGroupConverter> visit(MapLogicalTypeAnnotation logicalTypeAnnotation) { + return Optional.of(HiveCollectionConverter.forMap(type, parent, index, hiveTypeInfo)); + } + + @Override + public Optional<HiveGroupConverter> visit(MapKeyValueTypeAnnotation logicalTypeAnnotation) { + return
Optional.of(HiveCollectionConverter.forMap(type, parent, index, hiveTypeInfo)); + } + }); + + if (converter.isPresent()) { + return converter.get(); + } } return new HiveStructConverter(type, parent, index, hiveTypeInfo); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java index 302321c7f8..21bfb2e1a2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java @@ -26,8 +26,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.parquet.schema.ConversionPatterns; import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Type.Repetition; @@ -60,16 +60,16 @@ private static Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) { if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { - return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) - .named(name); + return Types.primitive(PrimitiveTypeName.BINARY, repetition) + .as(LogicalTypeAnnotation.stringType()).named(name); } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.shortTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition) - .as(OriginalType.INT_16).named(name); + .as(LogicalTypeAnnotation.intType(16, true)).named(name); } else if (typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition) - .as(OriginalType.INT_8).named(name); + .as(LogicalTypeAnnotation.intType(8, true)).named(name); } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { @@ -86,22 +86,22 @@ private static Type convertType(final String name, final TypeInfo typeInfo, throw new UnsupportedOperationException("Void type not implemented"); } else if (typeInfo.getTypeName().toLowerCase().startsWith( serdeConstants.CHAR_TYPE_NAME)) { - return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8) + return Types.optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()) .named(name); } else if (typeInfo.getTypeName().toLowerCase().startsWith( serdeConstants.VARCHAR_TYPE_NAME)) { - return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8) + return Types.optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()) .named(name); } else if (typeInfo instanceof DecimalTypeInfo) { DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; int prec = decimalTypeInfo.precision(); int scale = decimalTypeInfo.scale(); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; - return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL). 
- scale(scale).precision(prec).named(name); + return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes) + .as(LogicalTypeAnnotation.decimalType(scale, prec)).named(name); } else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) { - return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named - (name); + return Types.primitive(PrimitiveTypeName.INT32, repetition) + .as(LogicalTypeAnnotation.dateType()).named(name); } else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) { throw new UnsupportedOperationException("Unknown type not implemented"); } else { @@ -122,19 +122,21 @@ private static Type convertType(final String name, final TypeInfo typeInfo, // An optional group containing a repeated anonymous group "bag", containing // 1 anonymous element "array_element" - @SuppressWarnings("deprecation") private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) { final TypeInfo subType = typeInfo.getListElementTypeInfo(); - return new GroupType(Repetition.OPTIONAL, name, OriginalType.LIST, new GroupType(Repetition.REPEATED, - ParquetHiveSerDe.ARRAY.toString(), convertType("array_element", subType))); + GroupType groupType = Types.optionalGroup().as(LogicalTypeAnnotation.listType()) + .addField(Types.repeatedGroup().addField(convertType("array_element", subType)) + .named(ParquetHiveSerDe.ARRAY.toString())) + .named(name); + return groupType; } // An optional group containing multiple elements private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo) { final List columnNames = typeInfo.getAllStructFieldNames(); final List columnTypes = typeInfo.getAllStructFieldTypeInfos(); - return new GroupType(Repetition.OPTIONAL, name, convertTypes(columnNames, columnTypes)); - + GroupType groupType = Types.optionalGroup().addFields(convertTypes(columnNames, columnTypes)).named(name); + return groupType; } // An optional group containing a repeated anonymous group "map", containing diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java index 7f2a684d28..9c7fd57aec 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java @@ -43,8 +43,8 @@ import org.apache.parquet.hadoop.api.ReadSupport; import org.apache.parquet.io.api.RecordMaterializer; import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Type.Repetition; @@ -157,8 +157,8 @@ private static Type getProjectedType(TypeInfo colType, Type fieldType) { } else { subFieldType = getProjectedType(elemType, subFieldType); } - return Types.buildGroup(Repetition.OPTIONAL).as(OriginalType.LIST).addFields( - subFieldType).named(fieldType.getName()); + return Types.buildGroup(Repetition.OPTIONAL).as(LogicalTypeAnnotation.listType()) + .addFields(subFieldType).named(fieldType.getName()); } } break; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java index c1d71337d8..de973d4bc2 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java @@ -21,6 +21,7 @@ import org.apache.hadoop.hive.common.type.HiveBaseChar; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.io.parquet.convert.ETypeConverter; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; import org.apache.hadoop.hive.serde.serdeConstants; @@ -37,7 +38,10 @@ import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; +import org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveType; import java.io.IOException; @@ -45,6 +49,7 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; +import java.util.Optional; /** * Parquet file has self-describing schema which may differ from the user required schema (e.g. @@ -1498,10 +1503,7 @@ private static ParquetDataColumnReader getDataColumnReaderByTypeHelper(boolean i switch (parquetType.getPrimitiveTypeName()) { case INT32: - if (OriginalType.UINT_8 == parquetType.getOriginalType() || - OriginalType.UINT_16 == parquetType.getOriginalType() || - OriginalType.UINT_32 == parquetType.getOriginalType() || - OriginalType.UINT_64 == parquetType.getOriginalType()) { + if (ETypeConverter.isUnsignedInteger(parquetType)) { return isDictionary ? new TypesFromUInt32PageReader(dictionary, length, hivePrecision, hiveScale) : new TypesFromUInt32PageReader(valuesReader, length, hivePrecision, hiveScale); @@ -1511,10 +1513,7 @@ private static ParquetDataColumnReader getDataColumnReaderByTypeHelper(boolean i hiveScale); } case INT64: - if (OriginalType.UINT_8 == parquetType.getOriginalType() || - OriginalType.UINT_16 == parquetType.getOriginalType() || - OriginalType.UINT_32 == parquetType.getOriginalType() || - OriginalType.UINT_64 == parquetType.getOriginalType()) { + if (ETypeConverter.isUnsignedInteger(parquetType)) { return isDictionary ? new TypesFromUInt64PageReader(dictionary, length, hivePrecision, hiveScale) : new TypesFromUInt64PageReader(valuesReader, length, hivePrecision, hiveScale); @@ -1552,7 +1551,7 @@ private static ParquetDataColumnReader getConvertorFromBinary(boolean isDict, TypeInfo hiveType, ValuesReader valuesReader, Dictionary dictionary) { - OriginalType originalType = parquetType.getOriginalType(); + LogicalTypeAnnotation logicalType = parquetType.getLogicalTypeAnnotation(); // max length for varchar and char cases int length = getVarcharLength(hiveType); @@ -1568,22 +1567,37 @@ private static ParquetDataColumnReader getConvertorFromBinary(boolean isDict, int hiveScale = (typeName.equalsIgnoreCase(serdeConstants.DECIMAL_TYPE_NAME)) ? ((DecimalTypeInfo) realHiveType).getScale() : 0; - if (originalType == null) { - return isDict ? 
new DefaultParquetDataColumnReader(dictionary, length) : new - DefaultParquetDataColumnReader(valuesReader, length); - } - switch (originalType) { - case DECIMAL: - final short scale = (short) parquetType.asPrimitiveType().getDecimalMetadata().getScale(); - return isDict ? new TypesFromDecimalPageReader(dictionary, length, scale, hivePrecision, hiveScale) : new - TypesFromDecimalPageReader(valuesReader, length, scale, hivePrecision, hiveScale); - case UTF8: - return isDict ? new TypesFromStringPageReader(dictionary, length) : new - TypesFromStringPageReader(valuesReader, length); - default: + if (logicalType == null) { return isDict ? new DefaultParquetDataColumnReader(dictionary, length) : new DefaultParquetDataColumnReader(valuesReader, length); } + + Optional reader = parquetType.getLogicalTypeAnnotation() + .accept(new LogicalTypeAnnotationVisitor() { + @Override public Optional visit( + DecimalLogicalTypeAnnotation logicalTypeAnnotation) { + final short scale = (short) logicalTypeAnnotation.getScale(); + return isDict ? Optional + .of(new TypesFromDecimalPageReader(dictionary, length, scale, hivePrecision, + hiveScale)) : Optional + .of(new TypesFromDecimalPageReader(valuesReader, length, scale, hivePrecision, + hiveScale)); + } + + @Override public Optional visit( + StringLogicalTypeAnnotation logicalTypeAnnotation) { + return isDict ? Optional + .of(new TypesFromStringPageReader(dictionary, length)) : Optional + .of(new TypesFromStringPageReader(valuesReader, length)); + } + }); + + if (reader.isPresent()) { + return reader.get(); + } + + return isDict ? new DefaultParquetDataColumnReader(dictionary, length) : new + DefaultParquetDataColumnReader(valuesReader, length); } public static ParquetDataColumnReader getDataColumnReaderByTypeOnDictionary( diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java index 3d61c33afd..bd519eb66e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java @@ -44,7 +44,9 @@ import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.RecordConsumer; import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.MapLogicalTypeAnnotation; import org.apache.parquet.schema.Type; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -141,12 +143,12 @@ private DataWriter createWriter(ObjectInspector inspector, Type type) { } } else { GroupType groupType = type.asGroupType(); - OriginalType originalType = type.getOriginalType(); + LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation(); - if (originalType != null && originalType.equals(OriginalType.LIST)) { + if (logicalType != null && logicalType instanceof ListLogicalTypeAnnotation) { checkInspectorCategory(inspector, ObjectInspector.Category.LIST); return new ListDataWriter((ListObjectInspector)inspector, groupType); - } else if (originalType != null && originalType.equals(OriginalType.MAP)) { + } else if (logicalType != null && logicalType instanceof MapLogicalTypeAnnotation) { checkInspectorCategory(inspector, ObjectInspector.Category.MAP); return new MapDataWriter((MapObjectInspector)inspector, groupType); } else { diff --git 
ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java index 17eca38111..181894f106 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/HiveParquetSchemaTestUtils.java @@ -18,14 +18,18 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.Type; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; public class HiveParquetSchemaTestUtils { @@ -67,9 +71,37 @@ public static void testConversion( List<Type> expectedFields = expectedMT.getFields(); List<Type> actualFields = messageTypeFound.getFields(); for (int i = 0, n = expectedFields.size(); i < n; ++i) { - OriginalType exp = expectedFields.get(i).getOriginalType(); - OriginalType act = actualFields.get(i).getOriginalType(); - assertEquals("Original types of the field do not match", exp, act); + + LogicalTypeAnnotation expectedLogicalType = expectedFields.get(i).getLogicalTypeAnnotation(); + LogicalTypeAnnotation actualLogicalType = actualFields.get(i).getLogicalTypeAnnotation(); + assertEquals("Logical type annotations of the field do not match", expectedLogicalType, actualLogicalType); + } + } + + public static void testLogicalTypeAnnotation(String hiveColumnType, String hiveColumnName, + LogicalTypeAnnotation expectedLogicalType) throws Exception { + Map<String, LogicalTypeAnnotation> expectedLogicalTypeForColumn = new HashMap<>(); + expectedLogicalTypeForColumn.put(hiveColumnName, expectedLogicalType); + testLogicalTypeAnnotations(hiveColumnName, hiveColumnType, expectedLogicalTypeForColumn); + } + + public static void testLogicalTypeAnnotations(final String hiveColumnNames, + final String hiveColumnTypes, final Map<String, LogicalTypeAnnotation> expectedLogicalTypes) + throws Exception { + final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames); + final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); + final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); + List<Type> actualFields = messageTypeFound.getFields(); + for (Type actualField : actualFields) { + LogicalTypeAnnotation expectedLogicalType = expectedLogicalTypes.get(actualField.getName()); + LogicalTypeAnnotation actualLogicalType = actualField.getLogicalTypeAnnotation(); + if (expectedLogicalType != null) { + assertNotNull("The logical type annotation cannot be null.", actualLogicalType); + assertEquals("Logical type annotations of the field do not match", expectedLogicalType, + actualLogicalType); + } else { + assertNull("The logical type annotation must be null.", actualLogicalType); + } } } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestArrayCompatibility.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestArrayCompatibility.java index f2814f6943..aea0bf9492 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestArrayCompatibility.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestArrayCompatibility.java @@ -28,10 +28,10 @@ import org.junit.Assert; import org.junit.Test; import
org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Types; -import static org.apache.parquet.schema.OriginalType.LIST; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; @@ -123,7 +123,7 @@ public void write(RecordConsumer rc) { public void testThriftPrimitiveInList() throws Exception { Path test = writeDirect("ThriftPrimitiveInList", Types.buildMessage() - .requiredGroup().as(LIST) + .requiredGroup().as(LogicalTypeAnnotation.listType()) .repeated(INT32).named("list_of_ints_tuple") .named("list_of_ints") .named("ThriftPrimitiveInList"), @@ -163,7 +163,7 @@ public void testThriftSingleFieldGroupInList() throws Exception { Path test = writeDirect("ThriftSingleFieldGroupInList", Types.buildMessage() - .optionalGroup().as(LIST) + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .required(INT64).named("count") .named("single_element_groups_tuple") @@ -212,7 +212,7 @@ public void write(RecordConsumer rc) { public void testAvroPrimitiveInList() throws Exception { Path test = writeDirect("AvroPrimitiveInList", Types.buildMessage() - .requiredGroup().as(LIST) + .requiredGroup().as(LogicalTypeAnnotation.listType()) .repeated(INT32).named("array") .named("list_of_ints") .named("AvroPrimitiveInList"), @@ -252,7 +252,7 @@ public void testAvroSingleFieldGroupInList() throws Exception { Path test = writeDirect("AvroSingleFieldGroupInList", Types.buildMessage() - .optionalGroup().as(LIST) + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .required(INT64).named("count") .named("array") @@ -304,7 +304,7 @@ public void testAmbiguousSingleFieldGroupInList() throws Exception { Path test = writeDirect("SingleFieldGroupInList", Types.buildMessage() - .optionalGroup().as(LIST) + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .required(INT64).named("count") .named("single_element_group") @@ -355,7 +355,7 @@ public void testMultiFieldGroupInList() throws Exception { Path test = writeDirect("MultiFieldGroupInList", Types.buildMessage() - .optionalGroup().as(LIST) + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .required(DOUBLE).named("latitude") .required(DOUBLE).named("longitude") @@ -411,7 +411,7 @@ public void write(RecordConsumer rc) { public void testNewOptionalGroupInList() throws Exception { Path test = writeDirect("NewOptionalGroupInList", Types.buildMessage() - .optionalGroup().as(LIST) + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .optionalGroup() .required(DOUBLE).named("latitude") @@ -488,7 +488,7 @@ public void write(RecordConsumer rc) { public void testNewRequiredGroupInList() throws Exception { Path test = writeDirect("NewRequiredGroupInList", Types.buildMessage() - .optionalGroup().as(LIST) + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .requiredGroup() .required(DOUBLE).named("latitude") @@ -561,7 +561,7 @@ public void testHiveRequiredGroupInList() throws Exception { // this matches the list structure that Hive writes Path test = writeDirect("HiveRequiredGroupInList", Types.buildMessage() - .optionalGroup().as(LIST) + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .requiredGroup() .required(DOUBLE).named("latitude") diff 
--git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java index e1b2ba1021..dc80af1b76 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java @@ -18,12 +18,19 @@ import static org.apache.hadoop.hive.ql.io.parquet.HiveParquetSchemaTestUtils.testConversion; import static org.junit.Assert.assertEquals; +import static org.apache.hadoop.hive.ql.io.parquet.HiveParquetSchemaTestUtils.testLogicalTypeAnnotation; + +import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageTypeParser; +import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Type.Repetition; import org.junit.Test; @@ -33,13 +40,16 @@ @Test public void testSimpleType() throws Exception { testConversion( - "a,b,c,d", - "int,bigint,double,boolean", + "a,b,c,d,e,f,g", + "int,bigint,double,boolean,string,float,binary", "message hive_schema {\n" + " optional int32 a;\n" + " optional int64 b;\n" + " optional double c;\n" + " optional boolean d;\n" + + " optional binary e (UTF8);\n" + + " optional float f;\n" + + " optional binary g;\n" + "}\n"); } @@ -54,6 +64,17 @@ public void testSpecialIntType() throws Exception { + "}\n"); } + @Test + public void testSpecialIntTypeWithLogicalTypeAnnotations() throws Exception { + testConversion( + "a,b", + "tinyint,smallint", + "message hive_schema {\n" + + " optional int32 a (INTEGER(8,true));\n" + + " optional int32 b (INTEGER(16,true));\n" + + "}\n"); + } + @Test public void testDecimalType() throws Exception { testConversion( @@ -94,6 +115,16 @@ public void testDateType() throws Exception { + "}\n"); } + @Test + public void testTimestampType() throws Exception { + testConversion( + "a", + "timestamp", + "message hive_schema {\n" + + " optional int96 a;\n" + + "}\n"); + } + @Test public void testArray() throws Exception { testConversion("arrayCol", @@ -120,6 +151,99 @@ public void testArrayDecimal() throws Exception { + "}\n"); } + @Test + public void testArrayTinyInt() throws Exception { + testConversion("arrayCol", + "array<tinyint>", + "message hive_schema {\n" + + " optional group arrayCol (LIST) {\n" + + " repeated group bag {\n" + + " optional int32 array_element (INT_8);\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testArraySmallInt() throws Exception { + testConversion("arrayCol", + "array<smallint>", + "message hive_schema {\n" + + " optional group arrayCol (LIST) {\n" + + " repeated group bag {\n" + + " optional int32 array_element (INT_16);\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testArrayString() throws Exception { + testConversion("arrayCol", + "array<string>", + "message hive_schema {\n" + + " optional group arrayCol (LIST) {\n" + + " repeated group bag {\n" + + " optional binary array_element (UTF8);\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testArrayTimestamp() throws Exception { + testConversion("arrayCol", + "array<timestamp>", + "message hive_schema {\n" + + " optional group arrayCol (LIST) 
{\n" + + " repeated group bag {\n" + + " optional int96 array_element;\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testArrayStruct() throws Exception { + testConversion("structCol", + "array>", + "message hive_schema {\n" + + " optional group structCol (LIST) {\n" + + " repeated group bag {\n" + + " optional group array_element {\n" + + " optional binary a (UTF8);\n" + + " optional int32 b;\n" + + " }\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testArrayInArray() throws Exception { + final List columnNames = createHiveColumnsFrom("arrayCol"); + ListTypeInfo listTypeInfo = new ListTypeInfo(); + listTypeInfo.setListElementTypeInfo(TypeInfoUtils.getTypeInfosFromTypeString("int").get(0)); + List typeInfos = new ArrayList<>(); + ListTypeInfo listTypeInfo2 = new ListTypeInfo(); + listTypeInfo2.setListElementTypeInfo(listTypeInfo); + typeInfos.add(listTypeInfo2); + final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, typeInfos); + final MessageType expectedMT = MessageTypeParser.parseMessageType( + "message hive_schema {\n" + + " optional group arrayCol (LIST) {\n" + + " repeated group bag {\n" + + " optional group array_element (LIST) {\n" + + " repeated group bag {\n" + + " optional int32 array_element;\n" + + " }\n" + + " }\n" + + " }\n" + + " }\n" + + "}\n"); + assertEquals(expectedMT, messageTypeFound); + } + @Test public void testStruct() throws Exception { testConversion("structCol", @@ -134,6 +258,61 @@ public void testStruct() throws Exception { + "}\n"); } + @Test + public void testStructInts() throws Exception { + testConversion("structCol", + "struct", + "message hive_schema {\n" + + " optional group structCol {\n" + + " optional int32 a (INT_8);\n" + + " optional int32 b (INT_16);\n" + + " optional int32 c;\n" + + " optional int64 d;\n" + + " }\n" + + "}\n"); + } + + @Test + public void testStructStrings() throws Exception { + testConversion("structCol", + "struct", + "message hive_schema {\n" + + " optional group structCol {\n" + + " optional binary a (UTF8);\n" + + " optional binary b (UTF8);\n" + + " optional binary c (UTF8);\n" + + " }\n" + + "}\n"); + } + + @Test + public void testStructTimestamp() throws Exception { + testConversion("structCol", + "struct", + "message hive_schema {\n" + + " optional group structCol {\n" + + " optional int96 a;\n" + + " }\n" + + "}\n"); + } + + @Test + public void testStructList() throws Exception { + testConversion("structCol", + "struct,b:int,c:string>", + "message hive_schema {\n" + + " optional group structCol {\n" + + " optional group a (LIST) {\n" + + " repeated group bag {\n" + + " optional binary array_element (UTF8);\n" + + " }\n" + + " }\n" + + " optional int32 b;\n" + + " optional binary c (UTF8);" + + " }\n" + + "}\n"); + } + @Test public void testMap() throws Exception { testConversion("mapCol", @@ -162,25 +341,131 @@ public void testMapDecimal() throws Exception { + "}\n"); } + @Test + public void testMapInts() throws Exception { + testConversion("mapCol", + "map", + "message hive_schema {\n" + + " optional group mapCol (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required int32 key (INT_16);\n" + + " optional int32 value (INT_8);\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testMapStruct() throws Exception { + testConversion("mapCol", + "map>", + "message hive_schema {\n" + + " optional group mapCol (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required binary key (UTF8);\n" + + " optional group value {\n" + + 
" optional int32 a (INT_16);\n" + + " optional int32 b;\n" + + " }\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testMapList() throws Exception { + testConversion("mapCol", + "map>", + "message hive_schema {\n" + + " optional group mapCol (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required binary key (UTF8);\n" + + " optional group value (LIST) {\n" + + " repeated group bag {\n" + + " optional binary array_element (UTF8);\n" + + " }\n" + + " }\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testLogicalTypes() throws Exception { + testLogicalTypeAnnotation("string", "a", LogicalTypeAnnotation.stringType()); + testLogicalTypeAnnotation("int", "a", null); + testLogicalTypeAnnotation("smallint", "a", LogicalTypeAnnotation.intType(16, true)); + testLogicalTypeAnnotation("tinyint", "a", LogicalTypeAnnotation.intType(8, true)); + testLogicalTypeAnnotation("bigint", "a", null); + testLogicalTypeAnnotation("double", "a", null); + testLogicalTypeAnnotation("float", "a", null); + testLogicalTypeAnnotation("boolean", "a", null); + testLogicalTypeAnnotation("binary", "a", null); + testLogicalTypeAnnotation("timestamp", "a", null); + testLogicalTypeAnnotation("char(3)", "a", LogicalTypeAnnotation.stringType()); + testLogicalTypeAnnotation("varchar(30)", "a", LogicalTypeAnnotation.stringType()); + testLogicalTypeAnnotation("decimal(7,2)", "a", LogicalTypeAnnotation.decimalType(2, 7)); + } + @Test public void testMapOriginalType() throws Exception { - final String hiveColumnTypes = "map"; - final String hiveColumnNames = "mapCol"; - final List columnNames = createHiveColumnsFrom(hiveColumnNames); - final List columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); - final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); + final MessageType messageTypeFound = createSchema("map", "mapCol"); // this messageType only has one optional field, whose name is mapCol, original Type is MAP assertEquals(1, messageTypeFound.getFieldCount()); - org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0); - assertEquals("mapCol",topLevel.getName()); - assertEquals(OriginalType.MAP, topLevel.getOriginalType()); - assertEquals(Repetition.OPTIONAL, topLevel.getRepetition()); + Type topLevel = messageTypeFound.getFields().get(0); + checkField(topLevel, "mapCol", Repetition.OPTIONAL, LogicalTypeAnnotation.mapType()); assertEquals(1, topLevel.asGroupType().getFieldCount()); - org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0); - //there is one repeated field for mapCol, the field name is "map" and its original Type is MAP_KEY_VALUE; - assertEquals("map", secondLevel.getName()); - assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType()); - assertEquals(Repetition.REPEATED, secondLevel.getRepetition()); + Type secondLevel = topLevel.asGroupType().getFields().get(0); + // there is one repeated field for mapCol, the field name is "map" and its original Type is + // MAP_KEY_VALUE; + checkField(secondLevel, "map", Repetition.REPEATED, LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()); + } + + @Test + public void testListOriginalType() throws Exception { + + final MessageType messageTypeFound = createSchema("array", "arrayCol"); + + assertEquals(1, messageTypeFound.getFieldCount()); + Type topLevel = messageTypeFound.getFields().get(0); + checkField(topLevel, "arrayCol", Repetition.OPTIONAL, LogicalTypeAnnotation.listType()); + + assertEquals(1, 
topLevel.asGroupType().getFieldCount()); + Type secondLevel = topLevel.asGroupType().getFields().get(0); + checkField(secondLevel, "bag", Repetition.REPEATED, null); + + assertEquals(1, secondLevel.asGroupType().getFieldCount()); + Type thirdLevel = secondLevel.asGroupType().getFields().get(0); + checkField(thirdLevel, "array_element", Repetition.OPTIONAL, LogicalTypeAnnotation.intType(8, true)); + } + + @Test + public void testStructOriginalType() throws Exception { + + final MessageType messageTypeFound = createSchema("struct", "structCol"); + + assertEquals(1, messageTypeFound.getFieldCount()); + Type topLevel = messageTypeFound.getFields().get(0); + checkField(topLevel, "structCol", Repetition.OPTIONAL, null); + + assertEquals(2, topLevel.asGroupType().getFieldCount()); + Type a = topLevel.asGroupType().getFields().get(0); + checkField(a, "a", Repetition.OPTIONAL, LogicalTypeAnnotation.intType(16, true)); + Type b = topLevel.asGroupType().getFields().get(1); + checkField(b, "b", Repetition.OPTIONAL, LogicalTypeAnnotation.stringType()); + } + + private MessageType createSchema(String hiveColumnTypes, String hiveColumnNames) { + List columnNames = createHiveColumnsFrom(hiveColumnNames); + List columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); + return HiveSchemaConverter.convert(columnNames, columnTypes); + } + + private void checkField(Type field, String expectedName, Repetition expectedRepetition, + LogicalTypeAnnotation expectedLogicalType) { + assertEquals(expectedName, field.getName()); + assertEquals(expectedLogicalType, field.getLogicalTypeAnnotation()); + assertEquals(expectedRepetition, field.getRepetition()); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapStructures.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapStructures.java index 7717f3c418..7de25ddb91 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapStructures.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapStructures.java @@ -29,9 +29,9 @@ import org.junit.Test; import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.Types; -import static org.apache.parquet.schema.OriginalType.*; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; public class TestMapStructures extends AbstractTestParquetDirect { @@ -40,9 +40,9 @@ public void testStringMapRequiredPrimitive() throws Exception { Path test = writeDirect("StringMapRequiredPrimitive", Types.buildMessage() - .optionalGroup().as(MAP) + .optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() - .required(BINARY).as(UTF8).named("key") + .required(BINARY).as(LogicalTypeAnnotation.stringType()).named("key") .required(INT32).named("value") .named("key_value") .named("votes") @@ -100,9 +100,9 @@ public void write(RecordConsumer rc) { public void testStringMapOptionalPrimitive() throws Exception { Path test = writeDirect("StringMapOptionalPrimitive", Types.buildMessage() - .optionalGroup().as(MAP) + .optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() - .required(BINARY).as(UTF8).named("key") + .required(BINARY).as(LogicalTypeAnnotation.stringType()).named("key") .optional(INT32).named("value") .named("key_value") .named("votes") @@ -170,12 +170,12 @@ public void testStringMapOfOptionalArray() throws Exception { Path test = writeDirect("StringMapOfOptionalArray", Types.buildMessage() - .optionalGroup().as(MAP) + 
.optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() - .required(BINARY).as(UTF8).named("key") - .optionalGroup().as(LIST) + .required(BINARY).as(LogicalTypeAnnotation.stringType()).named("key") + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() - .optional(BINARY).as(UTF8).named("element") + .optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("element") .named("list") .named("value") .named("key_value") @@ -250,10 +250,10 @@ public void testStringMapOfOptionalIntArray() throws Exception { Path test = writeDirect("StringMapOfOptionalIntArray", Types.buildMessage() - .optionalGroup().as(MAP) + .optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() - .required(BINARY).as(UTF8).named("key") - .optionalGroup().as(LIST) + .required(BINARY).as(LogicalTypeAnnotation.stringType()).named("key") + .optionalGroup().as(LogicalTypeAnnotation.listType()) .repeatedGroup() .optional(INT32).named("element") .named("list") @@ -343,7 +343,7 @@ public void write(RecordConsumer rc) { public void testMapWithComplexKey() throws Exception { Path test = writeDirect("MapWithComplexKey", Types.buildMessage() - .optionalGroup().as(MAP) + .optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() .requiredGroup() .required(INT32).named("x") @@ -404,7 +404,7 @@ public void write(RecordConsumer rc) { public void testDoubleMapWithStructValue() throws Exception { Path test = writeDirect("DoubleMapWithStructValue", Types.buildMessage() - .optionalGroup().as(MAP) + .optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() .optional(DOUBLE).named("key") .optionalGroup() @@ -465,12 +465,12 @@ public void write(RecordConsumer rc) { public void testNestedMap() throws Exception { Path test = writeDirect("DoubleMapWithStructValue", Types.buildMessage() - .optionalGroup().as(MAP) + .optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() - .optional(BINARY).as(UTF8).named("key") - .optionalGroup().as(MAP) + .optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("key") + .optionalGroup().as(LogicalTypeAnnotation.mapType()) .repeatedGroup() - .optional(BINARY).as(UTF8).named("key") + .optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("key") .required(INT32).named("value") .named("key_value") .named("value") diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/MyConverterParent.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/MyConverterParent.java new file mode 100644 index 0000000000..2229b3ab76 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/MyConverterParent.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.io.Writable; + +/** + * Helper class for TestETypeConverter. + */ +public class MyConverterParent implements ConverterParent { + + private Writable value; + + public Writable getValue() { + return value; + } + + @Override + public void set(int index, Writable value) { + this.value = value; + } + + @Override + public Map getMetadata() { + Map metadata = new HashMap<>(); + metadata.put(HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION.varname, "false"); + return metadata; + } + +} diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java new file mode 100644 index 0000000000..80f43f0c03 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.ql.io.parquet.convert.ETypeConverter.BinaryConverter; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.PrimitiveConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.schema.Types; +import org.junit.Test; + +/** + * Tests for class ETypeConverter. 
+ */
+public class TestETypeConverter {
+
+  @Test
+  public void testGetDecimalConverter() throws Exception {
+    TypeInfo hiveTypeInfo = new DecimalTypeInfo(7, 2);
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.BINARY)
+        .as(LogicalTypeAnnotation.decimalType(2, 7)).named("value");
+    Writable writable = getWritableFromBinaryConverter(hiveTypeInfo, primitiveType, Binary.fromString("155"));
+    HiveDecimalWritable decimalWritable = (HiveDecimalWritable) writable;
+    assertEquals(2, decimalWritable.getScale());
+  }
+
+  @Test
+  public void testGetTimestampConverter() throws Exception {
+    Timestamp timestamp = Timestamp.valueOf("2018-06-15 15:12:20.0");
+    NanoTime nanoTime = NanoTimeUtils.getNanoTime(timestamp, true);
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT96).named("value");
+    Writable writable = getWritableFromBinaryConverter(null, primitiveType, nanoTime.toBinary());
+    TimestampWritableV2 timestampWritable = (TimestampWritableV2) writable;
+    assertEquals(timestamp.getNanos(), timestampWritable.getNanos());
+  }
+
+  @Test
+  public void testGetTextConverter() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.BINARY)
+        .as(LogicalTypeAnnotation.stringType()).named("value");
+    Writable writable = getWritableFromBinaryConverter(new VarcharTypeInfo(), primitiveType,
+        Binary.fromString("this_is_a_value"));
+    Text textWritable = (Text) writable;
+    assertEquals("this_is_a_value", textWritable.toString());
+  }
+
+  @Test
+  public void testGetTextConverterNoHiveTypeInfo() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.BINARY)
+        .as(LogicalTypeAnnotation.stringType()).named("value");
+    Writable writable =
+        getWritableFromBinaryConverter(null, primitiveType, Binary.fromString("this_is_a_value"));
+    Text textWritable = (Text) writable;
+    assertEquals("this_is_a_value", textWritable.toString());
+  }
+
+  @Test
+  public void testGetIntConverterForTinyInt() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT32)
+        .as(LogicalTypeAnnotation.intType(8, false)).named("value");
+    Writable writable =
+        getWritableFromPrimitiveConverter(createHiveTypeInfo("tinyint"), primitiveType, 125);
+    IntWritable intWritable = (IntWritable) writable;
+    assertEquals(125, intWritable.get());
+  }
+
+  @Test
+  public void testGetIntConverterForFloat() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT32).named("value");
+    Writable writable = getWritableFromPrimitiveConverter(createHiveTypeInfo("float"), primitiveType, 22225);
+    FloatWritable floatWritable = (FloatWritable) writable;
+    assertEquals((float) 22225, (float) floatWritable.get(), 0);
+  }
+
+  @Test
+  public void testGetIntConverterForBigint() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT32).named("value");
+    Writable writable = getWritableFromPrimitiveConverter(createHiveTypeInfo("bigint"), primitiveType, 22225);
+    LongWritable longWritable = (LongWritable) writable;
+    assertEquals(22225, longWritable.get());
+  }
+
+  @Test
+  public void testGetIntConverterForDouble() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT32).named("value");
+    Writable writable = getWritableFromPrimitiveConverter(createHiveTypeInfo("double"), primitiveType, 22225);
+    DoubleWritable doubleWritable = (DoubleWritable) writable;
+    assertEquals((double) 22225, (double) doubleWritable.get(), 0);
+  }
+
+  @Test
+  public void testGetIntConverterForSmallint() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT32)
+        .as(LogicalTypeAnnotation.intType(16, false)).named("value");
+    Writable writable =
+        getWritableFromPrimitiveConverter(createHiveTypeInfo("smallint"), primitiveType, 32766);
+    IntWritable intWritable = (IntWritable) writable;
+    assertEquals(32766, intWritable.get());
+  }
+
+  @Test
+  public void testGetIntConverterNoHiveTypeInfo() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT32).named("value");
+    Writable writable = getWritableFromPrimitiveConverter(null, primitiveType, 12225);
+    IntWritable intWritable = (IntWritable) writable;
+    assertEquals(12225, intWritable.get());
+  }
+
+  @Test
+  public void testGetDoubleConverter() throws Exception {
+    MyConverterParent converterParent = new MyConverterParent();
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.DOUBLE).named("value");
+    PrimitiveConverter converter = ETypeConverter.getNewConverter(primitiveType, 1, converterParent, null);
+    ((PrimitiveConverter) converter).addDouble(3276);
+    Writable writable = converterParent.getValue();
+    DoubleWritable doubleWritable = (DoubleWritable) writable;
+    assertEquals(3276, doubleWritable.get(), 0);
+  }
+
+  @Test
+  public void testGetBooleanConverter() throws Exception {
+    MyConverterParent converterParent = new MyConverterParent();
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.BOOLEAN).named("value");
+    PrimitiveConverter converter = ETypeConverter.getNewConverter(primitiveType, 1, converterParent, null);
+    ((PrimitiveConverter) converter).addBoolean(true);
+    Writable writable = converterParent.getValue();
+    BooleanWritable booleanWritable = (BooleanWritable) writable;
+    assertEquals(true, booleanWritable.get());
+  }
+
+  @Test
+  public void testGetFloatConverter() throws Exception {
+    MyConverterParent converterParent = new MyConverterParent();
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.FLOAT).named("value");
+    PrimitiveConverter converter = ETypeConverter.getNewConverter(primitiveType, 1, converterParent, null);
+    ((PrimitiveConverter) converter).addFloat(3276f);
+    Writable writable = converterParent.getValue();
+    FloatWritable floatWritable = (FloatWritable) writable;
+    assertEquals(3276f, floatWritable.get(), 0);
+  }
+
+  @Test
+  public void testGetFloatConverterForDouble() throws Exception {
+    MyConverterParent converterParent = new MyConverterParent();
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.FLOAT).named("value");
+    PrimitiveConverter converter =
+        ETypeConverter.getNewConverter(primitiveType, 1, converterParent, createHiveTypeInfo("double"));
+    ((PrimitiveConverter) converter).addFloat(3276f);
+    Writable writable = converterParent.getValue();
+    DoubleWritable doubleWritable = (DoubleWritable) writable;
+    assertEquals(3276d, doubleWritable.get(), 0);
+  }
+
+  @Test
+  public void testGetBinaryConverter() throws Exception {
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.BINARY).named("value");
+    Writable writable = getWritableFromBinaryConverter(null, primitiveType, Binary.fromString("this_is_a_value"));
+    BytesWritable byteWritable = (BytesWritable) writable;
+    assertEquals("this_is_a_value", new String(byteWritable.getBytes()));
+  }
+
+  @Test
+  public void testGetLongConverter() throws Exception {
+    MyConverterParent converterParent = new MyConverterParent();
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT64).named("value");
+    PrimitiveConverter converter = ETypeConverter.getNewConverter(primitiveType, 1, converterParent, null);
+    ((PrimitiveConverter) converter).addLong(12225);
+    Writable writable = converterParent.getValue();
+    LongWritable longWritable = (LongWritable) writable;
+    assertEquals(12225L, longWritable.get());
+  }
+
+  @Test
+  public void testGetConverterForList() {
+    MyConverterParent converterParent = new MyConverterParent();
+    GroupType type =
+        Types.optionalList().element(Types.optional(PrimitiveTypeName.INT64).named("value")).named("array");
+    HiveGroupConverter f = HiveGroupConverter.getConverterFromDescription(type, 1, converterParent, null);
+    assertTrue(f instanceof HiveCollectionConverter);
+  }
+
+  @Test
+  public void testGetConverterForMap() {
+    MyConverterParent converterParent = new MyConverterParent();
+    GroupType type = Types.optionalMap().key(Types.optional(PrimitiveTypeName.INT64).named("key"))
+        .value(Types.optional(PrimitiveTypeName.INT64).named("value")).named("map");
+    HiveGroupConverter f = HiveGroupConverter.getConverterFromDescription(type, 1, converterParent, null);
+    assertTrue(f instanceof HiveCollectionConverter);
+  }
+
+  @Test
+  public void testGetConverterForStruct() {
+    MyConverterParent converterParent = new MyConverterParent();
+    GroupType type = Types.buildGroup(Repetition.OPTIONAL).named("struct");
+    HiveGroupConverter f = HiveGroupConverter.getConverterFromDescription(type, 1, converterParent, null);
+    assertTrue(f instanceof HiveStructConverter);
+  }
+
+  private Writable getWritableFromBinaryConverter(TypeInfo hiveTypeInfo, PrimitiveType primitiveType,
+      Binary valueToAdd) {
+    MyConverterParent converterParent = new MyConverterParent();
+    PrimitiveConverter converter = ETypeConverter.getNewConverter(primitiveType, 1, converterParent, hiveTypeInfo);
+    ((BinaryConverter) converter).addBinary(valueToAdd);
+    return converterParent.getValue();
+  }
+
+  private Writable getWritableFromPrimitiveConverter(TypeInfo hiveTypeInfo, PrimitiveType primitiveType,
+      Integer valueToAdd) {
+    MyConverterParent converterParent = new MyConverterParent();
+    PrimitiveConverter converter = ETypeConverter.getNewConverter(primitiveType, 1, converterParent, hiveTypeInfo);
+    ((PrimitiveConverter) converter).addInt(valueToAdd);
+    return converterParent.getValue();
+  }
+
+  private PrimitiveTypeInfo createHiveTypeInfo(String typeName) {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName(typeName);
+    return hiveTypeInfo;
+  }
+}
diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestGetDataColumnReaderByType.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestGetDataColumnReaderByType.java
new file mode 100644
index 0000000000..af26068720
--- /dev/null
+++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestGetDataColumnReaderByType.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io.parquet.convert;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.DefaultParquetDataColumnReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromBooleanPageReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromDecimalPageReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromDoublePageReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromFloatPageReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromInt32PageReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromInt64PageReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromInt96PageReader;
+import org.apache.hadoop.hive.ql.io.parquet.vector.ParquetDataColumnReaderFactory.TypesFromStringPageReader;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Types;
+import org.junit.Test;
+
+/**
+ * Tests for ParquetDataColumnReaderFactory#getDataColumnReaderByType
+ */
+public class TestGetDataColumnReaderByType {
+
+  @Test
+  public void testGetDecimalReader() throws Exception {
+    TypeInfo hiveTypeInfo = new DecimalTypeInfo(7, 2);
+    ParquetDataColumnReader reader =
+        ParquetDataColumnReaderFactory.getDataColumnReaderByType(
+            Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(20)
+                .as(LogicalTypeAnnotation.decimalType(2, 5)).named("value"),
+            hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromDecimalPageReader);
+  }
+
+  @Test
+  public void testGetStringReader() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("string");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory.getDataColumnReaderByType(Types
+        .optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("value"),
+        hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromStringPageReader);
+  }
+
+  @Test
+  public void testGetDecimalReaderFromBinaryPrimitive() throws Exception {
+    TypeInfo hiveTypeInfo = new DecimalTypeInfo(7, 2);
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.BINARY)
+            .as(LogicalTypeAnnotation.decimalType(2, 5)).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromDecimalPageReader);
+  }
+
+  @Test
+  public void testGetBinaryReaderNoOriginalType() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("string");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.BINARY).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof DefaultParquetDataColumnReader);
+  }
+
+  @Test
+  public void testGetBinaryReaderJsonOriginalType() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("binary");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory.getDataColumnReaderByType(Types
+        .optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.jsonType()).named("value"),
+        hiveTypeInfo, null, true);
+    assertTrue(reader instanceof DefaultParquetDataColumnReader);
+  }
+
+  @Test
+  public void testGetIntReader() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("int");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT32)
+            .as(LogicalTypeAnnotation.intType(32, false)).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromInt32PageReader);
+  }
+
+  @Test
+  public void testGetIntReaderNoOriginalType() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("int");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT32).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromInt32PageReader);
+  }
+
+  @Test
+  public void testGetInt64ReaderNoOriginalType() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("bigint");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory.getDataColumnReaderByType(
+        Types.optional(PrimitiveTypeName.INT64).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromInt64PageReader);
+  }
+
+  @Test
+  public void testGetInt64Reader() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("bigint");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT64)
+            .as(LogicalTypeAnnotation.intType(64, false)).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromInt64PageReader);
+  }
+
+  @Test
+  public void testGetFloatReader() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("float");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.FLOAT).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromFloatPageReader);
+  }
+
+  @Test
+  public void testGetDoubleReader() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("double");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.DOUBLE).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromDoublePageReader);
+  }
+
+  @Test
+  public void testGetInt96Reader() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("timestamp");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT96).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromInt96PageReader);
+  }
+
+  @Test
+  public void testGetBooleanReader() throws Exception {
+    PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo();
+    hiveTypeInfo.setTypeName("boolean");
+    ParquetDataColumnReader reader = ParquetDataColumnReaderFactory
+        .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.BOOLEAN).named("value"), hiveTypeInfo, null, true);
+    assertTrue(reader instanceof TypesFromBooleanPageReader);
+  }
+}
diff --git ql/src/test/results/clientpositive/parquet_analyze.q.out ql/src/test/results/clientpositive/parquet_analyze.q.out
index 16c836dd40..af3469109e 100644
--- ql/src/test/results/clientpositive/parquet_analyze.q.out
+++ ql/src/test/results/clientpositive/parquet_analyze.q.out
@@ -94,7 +94,7 @@ Table Parameters:
   numFiles              1
   numRows               100
   rawDataSize           700
-  totalSize             6730
+  totalSize             6927
 #### A masked pattern was here ####
 # Storage Information
@@ -141,8 +141,8 @@ Table Parameters:
   bucketing_version     2
   numFiles              1
   numRows               100
-  rawDataSize           5936
-  totalSize             6730
+  rawDataSize           5774
+  totalSize             6927
 #### A masked pattern was here ####
 # Storage Information
diff --git ql/src/test/results/clientpositive/parquet_vectorization_0.q.out ql/src/test/results/clientpositive/parquet_vectorization_0.q.out
index 3698e9d95c..3d58e801f2 100644
--- ql/src/test/results/clientpositive/parquet_vectorization_0.q.out
+++ ql/src/test/results/clientpositive/parquet_vectorization_0.q.out
@@ -1697,7 +1697,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -1719,7 +1719,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30522,7 +30522,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30544,7 +30544,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30639,7 +30639,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30661,7 +30661,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30760,7 +30760,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30782,7 +30782,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30866,7 +30866,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30888,7 +30888,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
diff --git ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out
index 1232957610..4b60aa6a8d 100644
--- ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out
+++ ql/src/test/results/clientpositive/spark/parquet_vectorization_0.q.out
@@ -1713,7 +1713,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -1735,7 +1735,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30543,7 +30543,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30565,7 +30565,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30663,7 +30663,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30685,7 +30685,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30787,7 +30787,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30809,7 +30809,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
@@ -30898,7 +30898,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
@@ -30920,7 +30920,7 @@ STAGE PLANS:
              serialization.ddl struct alltypesparquet { byte ctinyint, i16 csmallint, i32 cint, i64 cbigint, float cfloat, double cdouble, string cstring1, string cstring2, timestamp ctimestamp1, timestamp ctimestamp2, bool cboolean1, bool cboolean2}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
-             totalSize 595141
+             totalSize 595418
 #### A masked pattern was here ####
            serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
            name: default.alltypesparquet
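
Reviewer note (not part of the patch): the new tests above build annotated Parquet schemas with Types.optional(...).as(LogicalTypeAnnotation...) and then assert on what the converters and column readers produce. The standalone sketch below, assuming parquet-mr 1.11.x on the classpath, isolates just those schema-building and annotation-reading calls; the class and variable names are illustrative only.

  // Illustrative sketch only: build a BINARY column annotated as decimal(scale = 2, precision = 7),
  // the same shape testGetDecimalConverter uses, and read the scale back from the annotation.
  import org.apache.parquet.schema.LogicalTypeAnnotation;
  import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation;
  import org.apache.parquet.schema.PrimitiveType;
  import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
  import org.apache.parquet.schema.Types;

  public class LogicalTypeAnnotationSketch {
    public static void main(String[] args) {
      PrimitiveType decimalType = Types.optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.decimalType(2, 7)).named("value");

      // The scale recovered here (2) is what the decimal converter test asserts on the HiveDecimalWritable.
      DecimalLogicalTypeAnnotation annotation =
          (DecimalLogicalTypeAnnotation) decimalType.getLogicalTypeAnnotation();
      System.out.println("scale=" + annotation.getScale() + ", precision=" + annotation.getPrecision());
    }
  }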