diff --git data/files/parquet_types.txt data/files/parquet_types.txt index 9d81c3c3130cb94ae2bc308d511b0e24a60d4b8e..750626e1d4e3a010f9d231fb01d754c88a12289a 100644 --- data/files/parquet_types.txt +++ data/files/parquet_types.txt @@ -1,21 +1,21 @@ -100|1|1|1.0|0.0|abc|2011-01-01 01:01:01.111111111 -101|2|2|1.1|0.3|def|2012-02-02 02:02:02.222222222 -102|3|3|1.2|0.6|ghi|2013-03-03 03:03:03.333333333 -103|1|4|1.3|0.9|jkl|2014-04-04 04:04:04.444444444 -104|2|5|1.4|1.2|mno|2015-05-05 05:05:05.555555555 -105|3|1|1.0|1.5|pqr|2016-06-06 06:06:06.666666666 -106|1|2|1.1|1.8|stu|2017-07-07 07:07:07.777777777 -107|2|3|1.2|2.1|vwx|2018-08-08 08:08:08.888888888 -108|3|4|1.3|2.4|yza|2019-09-09 09:09:09.999999999 -109|1|5|1.4|2.7|bcd|2020-10-10 10:10:10.101010101 -110|2|1|1.0|3.0|efg|2021-11-11 11:11:11.111111111 -111|3|2|1.1|3.3|hij|2022-12-12 12:12:12.121212121 -112|1|3|1.2|3.6|klm|2023-01-02 13:13:13.131313131 -113|2|4|1.3|3.9|nop|2024-02-02 14:14:14.141414141 -114|3|5|1.4|4.2|qrs|2025-03-03 15:15:15.151515151 -115|1|1|1.0|4.5|tuv|2026-04-04 16:16:16.161616161 -116|2|2|1.1|4.8|wxy|2027-05-05 17:17:17.171717171 -117|3|3|1.2|5.1|zab|2028-06-06 18:18:18.181818181 -118|1|4|1.3|5.4|cde|2029-07-07 19:19:19.191919191 -119|2|5|1.4|5.7|fgh|2030-08-08 20:20:20.202020202 -120|3|1|1.0|6.0|ijk|2031-09-09 21:21:21.212121212 +100|1|1|1.0|0.0|abc|2011-01-01 01:01:01.111111111|a |a +101|2|2|1.1|0.3|def|2012-02-02 02:02:02.222222222|ab |ab +102|3|3|1.2|0.6|ghi|2013-03-03 03:03:03.333333333|abc|abc +103|1|4|1.3|0.9|jkl|2014-04-04 04:04:04.444444444|abcd|abcd +104|2|5|1.4|1.2|mno|2015-05-05 05:05:05.555555555|abcde|abcde +105|3|1|1.0|1.5|pqr|2016-06-06 06:06:06.666666666|abcdef|abcdef +106|1|2|1.1|1.8|stu|2017-07-07 07:07:07.777777777|abcdefg|abcdefg +107|2|3|1.2|2.1|vwx|2018-08-08 08:08:08.888888888|bcdefg|abcdefgh +108|3|4|1.3|2.4|yza|2019-09-09 09:09:09.999999999|cdefg|abcdefghijklmnop +109|1|5|1.4|2.7|bcd|2020-10-10 10:10:10.101010101|klmno|abcdedef +110|2|1|1.0|3.0|efg|2021-11-11 11:11:11.111111111|pqrst|abcdede +111|3|2|1.1|3.3|hij|2022-12-12 12:12:12.121212121|nopqr|abcded +112|1|3|1.2|3.6|klm|2023-01-02 13:13:13.131313131|opqrs|abcdd +113|2|4|1.3|3.9|nop|2024-02-02 14:14:14.141414141|pqrst|abc +114|3|5|1.4|4.2|qrs|2025-03-03 15:15:15.151515151|qrstu|b +115|1|1|1.0|4.5|tuv|2026-04-04 16:16:16.161616161|rstuv|abcded +116|2|2|1.1|4.8|wxy|2027-05-05 17:17:17.171717171|stuvw|abcded +117|3|3|1.2|5.1|zab|2028-06-06 18:18:18.181818181|tuvwx|abcded +118|1|4|1.3|5.4|cde|2029-07-07 19:19:19.191919191|uvwzy|abcdede +119|2|5|1.4|5.7|fgh|2030-08-08 20:20:20.202020202|vwxyz|abcdede +120|3|1|1.0|6.0|ijk|2031-09-09 21:21:21.212121212|wxyza|abcde diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java index 582a5dfdaccaa25d46bfb515248eeb4bb84bedc5..c5d80f22b82e57c5acf8286d879a248a233aa051 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java @@ -13,6 +13,9 @@ */ package org.apache.hadoop.hive.ql.io.parquet.convert; +import java.util.List; + +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.Writable; @@ -30,7 +33,7 @@ private Writable[] mapPairContainer; public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent, - final int index) { + final int 
index, List hiveSchemaTypeInfos) { this.parent = parent; this.index = index; int count = groupType.getFieldCount(); @@ -40,7 +43,8 @@ public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupCon isMap = count == 2; converters = new Converter[count]; for (int i = 0; i < count; i++) { - converters[i] = getConverterFromDescription(groupType.getType(i), i, this); + converters[i] = getConverterFromDescription(groupType.getType(i), i, this, + hiveSchemaTypeInfos); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java index 0e310fbfb748d5409ff3c0d8cd8327bec9988ecf..48e4a133d1b30ef43a53e1a6c19b68682e86835f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java @@ -16,6 +16,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.Writable; @@ -36,19 +37,21 @@ private final Object[] currentArr; private Writable[] rootMap; - public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema) { - this(requestedSchema, null, 0, tableSchema); + public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema, + final List hiveSchemaTypeInfos) { + this(requestedSchema, null, 0, tableSchema, hiveSchemaTypeInfos); final int fieldCount = tableSchema.getFieldCount(); this.rootMap = new Writable[fieldCount]; } public DataWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent, - final int index) { - this(groupType, parent, index, groupType); + final int index, final List hiveSchemaTypeInfos) { + this(groupType, parent, index, groupType, hiveSchemaTypeInfos); } public DataWritableGroupConverter(final GroupType selectedGroupType, - final HiveGroupConverter parent, final int index, final GroupType containingGroupType) { + final HiveGroupConverter parent, final int index, final GroupType containingGroupType, + final List hiveSchemaTypeInfos) { this.parent = parent; this.index = index; final int totalFieldCount = containingGroupType.getFieldCount(); @@ -62,7 +65,8 @@ public DataWritableGroupConverter(final GroupType selectedGroupType, Type subtype = selectedFields.get(i); if (containingGroupType.getFields().contains(subtype)) { converters[i] = getConverterFromDescription(subtype, - containingGroupType.getFieldIndex(subtype.getName()), this); + containingGroupType.getFieldIndex(subtype.getName()), this, + hiveSchemaTypeInfos); } else { throw new IllegalStateException("Group type [" + containingGroupType + "] does not contain requested field: " + subtype); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java index 7762afea4dda8cb4be4756eef43abec566ea8444..0971a68e151cb1a0469671f119b479719f36fa6a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java @@ -13,6 +13,9 @@ */ package org.apache.hadoop.hive.ql.io.parquet.convert; +import java.util.List; + +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import 
org.apache.hadoop.io.ArrayWritable; import parquet.io.api.GroupConverter; @@ -28,8 +31,10 @@ private final DataWritableGroupConverter root; - public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema) { - this.root = new DataWritableGroupConverter(requestedSchema, tableSchema); + public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema, + final List hiveColumnTypeInfos) { + this.root = new DataWritableGroupConverter(requestedSchema, tableSchema, + hiveColumnTypeInfos); } @Override @@ -41,4 +46,4 @@ public ArrayWritable getCurrentRecord() { public GroupConverter getRootConverter() { return root; } -} +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java index 67ce15187a33d58fda7ff5b629339bd89d0e5e54..e6fb5ae137a1c91953c2458897d98d109586e9d6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java @@ -16,12 +16,19 @@ import java.math.BigDecimal; import java.sql.Timestamp; import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; +import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.FloatWritable; @@ -145,6 +152,32 @@ protected TimestampWritable convert(Binary binary) { } }; } + }, + ECHAR_CONVERTER(HiveCharWritable.class) { + @Override + Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) { + return new BinaryConverter(type, parent, index) { + @Override + protected HiveCharWritable convert(Binary binary) { + HiveChar hiveChar = new HiveChar(); + hiveChar.setValue(binary.toStringUsingUTF8()); + return new HiveCharWritable(hiveChar); + } + }; + } + }, + EVARCHAR_CONVERTER(HiveVarcharWritable.class) { + @Override + Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) { + return new BinaryConverter(type, parent, index) { + @Override + protected HiveVarcharWritable convert(Binary binary) { + HiveVarchar hiveVarchar = new HiveVarchar(); + hiveVarchar.setValue(binary.toStringUsingUTF8()); + return new HiveVarcharWritable(hiveVarchar); + } + }; + } }; final Class _type; @@ -159,7 +192,8 @@ private ETypeConverter(final Class type) { abstract Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent); - public static Converter getNewConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) { + public static Converter getNewConverter(final PrimitiveType type, final int index, + final HiveGroupConverter parent, List hiveSchemaTypeInfos) { if (type.isPrimitive() && 
(type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96))) { //TODO- cleanup once parquet support Timestamp type annotation. return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent); @@ -167,7 +201,15 @@ public static Converter getNewConverter(final PrimitiveType type, final int inde if (OriginalType.DECIMAL == type.getOriginalType()) { return EDECIMAL_CONVERTER.getConverter(type, index, parent); } else if (OriginalType.UTF8 == type.getOriginalType()) { - return ESTRING_CONVERTER.getConverter(type, index, parent); + if (hiveSchemaTypeInfos.get(index).getTypeName() + .startsWith(serdeConstants.CHAR_TYPE_NAME)) { + return ECHAR_CONVERTER.getConverter(type, index, parent); + } else if (hiveSchemaTypeInfos.get(index).getTypeName() + .startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { + return EVARCHAR_CONVERTER.getConverter(type, index, parent); + } else if (type.isPrimitive()) { + return ESTRING_CONVERTER.getConverter(type, index, parent); + } } Class javaType = type.getPrimitiveTypeName().javaType; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java index 524a2937e39a4821a856c8e25b14633ade89ea49..a364729505eaa7b0b0c9b0c326a8a6398b8b3dbe 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java @@ -13,6 +13,9 @@ */ package org.apache.hadoop.hive.ql.io.parquet.convert; +import java.util.List; + +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.Writable; import parquet.io.api.Converter; @@ -23,17 +26,20 @@ public abstract class HiveGroupConverter extends GroupConverter { protected static Converter getConverterFromDescription(final Type type, final int index, - final HiveGroupConverter parent) { + final HiveGroupConverter parent, List hiveSchemaTypeInfos) { if (type == null) { return null; } if (type.isPrimitive()) { - return ETypeConverter.getNewConverter(type.asPrimitiveType(), index, parent); + return ETypeConverter.getNewConverter(type.asPrimitiveType(), index, parent, + hiveSchemaTypeInfos); } else { if (type.asGroupType().getRepetition() == Repetition.REPEATED) { - return new ArrayWritableGroupConverter(type.asGroupType(), parent, index); + return new ArrayWritableGroupConverter(type.asGroupType(), parent, index, + hiveSchemaTypeInfos); } else { - return new DataWritableGroupConverter(type.asGroupType(), parent, index); + return new DataWritableGroupConverter(type.asGroupType(), parent, index, + hiveSchemaTypeInfos); } } } @@ -42,4 +48,4 @@ protected static Converter getConverterFromDescription(final Type type, final in protected abstract void add(int index, Writable value); -} +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java index 99901f0f57328db6fb2a260f7b7d76ded6f39558..3116451edb06d9cc47ec63a482e66ddd1e489ccb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java @@ -16,6 +16,7 @@ import java.util.List; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde.serdeConstants; import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; @@ -25,7 +26,6 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import parquet.schema.ConversionPatterns; -import parquet.schema.DecimalMetadata; import parquet.schema.GroupType; import parquet.schema.MessageType; import parquet.schema.OriginalType; @@ -81,6 +81,14 @@ private static Type convertType(final String name, final TypeInfo typeInfo, fina return new PrimitiveType(repetition, PrimitiveTypeName.INT96, name); } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { throw new UnsupportedOperationException("Void type not implemented"); + } else if (typeInfo.getTypeName().toLowerCase().startsWith( + serdeConstants.CHAR_TYPE_NAME)) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8) + .named(name); + } else if (typeInfo.getTypeName().toLowerCase().startsWith( + serdeConstants.VARCHAR_TYPE_NAME)) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8) + .named(name); } else if (typeInfo instanceof DecimalTypeInfo) { DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; int prec = decimalTypeInfo.precision(); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java index ecf335f3ee4695c4a9cd8e5d82e3a61e99a107bc..3b9bf433175309bac15fbe857b964aac857e06d8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java @@ -14,6 +14,7 @@ package org.apache.hadoop.hive.ql.io.parquet.read; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -23,6 +24,8 @@ import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.util.StringUtils; @@ -60,6 +63,28 @@ return (List) VirtualColumn. removeVirtualColumns(StringUtils.getStringCollection(columns)); } + + private static List getColumnTypes(Configuration configuration) { + + List columnNames; + String columnNamesProperty = configuration.get(IOConstants.COLUMNS); + if (columnNamesProperty.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNamesProperty.split(",")); + } + List columnTypes; + String columnTypesProperty = configuration.get(IOConstants.COLUMNS_TYPES); + if (columnTypesProperty.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypesProperty); + } + + columnTypes = VirtualColumn.removeVirtualColumnTypes(columnNames, columnTypes); + return columnTypes; + } + /** * * It creates the readContext for Parquet side with the requested schema during the init phase. @@ -148,7 +173,8 @@ } final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser. 
parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration); - return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema); + return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema, + getColumnTypes(configuration)); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java index 47bf69ce7cb6f474f9f48dd693a7915475a1d9cb..d5aae3b3e1c83521c8daa431e3a92a96d98b4d46 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java @@ -25,12 +25,14 @@ import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.ArrayWritable; /** @@ -102,12 +104,10 @@ private ObjectInspector getObjectInspector(final TypeInfo typeInfo) { return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; } else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) { throw new UnsupportedOperationException("Parquet does not support date. See HIVE-6384"); - } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.DECIMAL_TYPE_NAME)) { - throw new UnsupportedOperationException("Parquet does not support decimal. See HIVE-6384"); } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.CHAR_TYPE_NAME)) { - throw new UnsupportedOperationException("Parquet does not support char. See HIVE-6384"); + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((CharTypeInfo) typeInfo); } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { - throw new UnsupportedOperationException("Parquet does not support varchar. 
See HIVE-6384"); + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((VarcharTypeInfo) typeInfo); } else { throw new UnsupportedOperationException("Unknown field type: " + typeInfo); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java index e3e327c7b657cdd397dd2b4dddf40187c65ce901..e5c663e8062c9b635bd07bb48568fa5f7e3086fe 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -42,6 +42,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; @@ -60,6 +62,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import parquet.io.api.Binary; /** * @@ -280,6 +283,12 @@ private Writable createPrimitive(final Object obj, final PrimitiveObjectInspecto return new BytesWritable(tgt); case TIMESTAMP: return new TimestampWritable(((TimestampObjectInspector) inspector).getPrimitiveJavaObject(obj)); + case CHAR: + String strippedValue = ((HiveCharObjectInspector) inspector).getPrimitiveJavaObject(obj).getStrippedValue(); + return new BytesWritable(Binary.fromString(strippedValue).getBytes()); + case VARCHAR: + String value = ((HiveVarcharObjectInspector) inspector).getPrimitiveJavaObject(obj).getValue(); + return new BytesWritable(Binary.fromString(value).getBytes()); default: throw new SerDeException("Unknown primitive : " + inspector.getPrimitiveCategory()); } diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java index 27531b477b53ea037676da78a513ad9a15d8bd68..ecc5d92e5012caa6c1354a32a78f81785f4ae5ae 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java @@ -22,7 +22,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.ListIterator; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.conf.HiveConf; @@ -60,8 +63,9 @@ public static final VirtualColumn GROUPINGID = new VirtualColumn("GROUPING__ID", (PrimitiveTypeInfo) TypeInfoFactory.intTypeInfo); - public static final VirtualColumn[] VIRTUAL_COLUMNS = - new VirtualColumn[] {FILENAME, BLOCKOFFSET, ROWOFFSET, RAWDATASIZE, GROUPINGID, ROWID}; + public static ImmutableSet VIRTUAL_COLUMN_NAMES = + ImmutableSet.of(FILENAME.getName(), BLOCKOFFSET.getName(), ROWOFFSET.getName(), + RAWDATASIZE.getName(), GROUPINGID.getName(), ROWID.getName()); private final String name; private final TypeInfo typeInfo; @@ 
-139,12 +143,29 @@ public int hashCode() { return 31 * typeInfo.getTypeName().hashCode() + c; } public static Collection removeVirtualColumns(final Collection columns) { - for(VirtualColumn vcol : VIRTUAL_COLUMNS) { - columns.remove(vcol.getName()); - } + Iterables.removeAll(columns, VIRTUAL_COLUMN_NAMES); return columns; } + public static List removeVirtualColumnTypes(final List columnNames, + final List columnTypes) { + if (columnNames.size() != columnTypes.size()) { + throw new IllegalArgumentException("Number of column names in configuration " + + columnNames.size() + " differs from column types " + columnTypes.size()); + } + + int i = 0; + ListIterator it = columnTypes.listIterator(); + while(it.hasNext()) { + it.next(); + if (VIRTUAL_COLUMN_NAMES.contains(columnNames.get(i))) { + it.remove(); + } + ++i; + } + return columnTypes; + } + public static StructObjectInspector getVCSObjectInspector(List vcs) { List names = new ArrayList(vcs.size()); List inspectors = new ArrayList(vcs.size()); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index ba4ac690ccc361e65f12220997f300067bbd0d6c..a62209557bb7a5af40e4634ae53be15a3b50aeb6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -927,11 +927,9 @@ private boolean validateExprNodeDescRecursive(ExprNodeDesc desc) { if (desc instanceof ExprNodeColumnDesc) { ExprNodeColumnDesc c = (ExprNodeColumnDesc) desc; // Currently, we do not support vectorized virtual columns (see HIVE-5570). - for (VirtualColumn vc : VirtualColumn.VIRTUAL_COLUMNS) { - if (c.getColumn().equals(vc.getName())) { - LOG.info("Cannot vectorize virtual column " + c.getColumn()); - return false; - } + if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(c.getColumn())) { + LOG.info("Cannot vectorize virtual column " + c.getColumn()); + return false; } } String typeName = desc.getTypeInfo().getTypeName(); @@ -1076,10 +1074,8 @@ private boolean isVirtualColumn(ColumnInfo column) { // Not using method column.getIsVirtualCol() because partitioning columns are also // treated as virtual columns in ColumnInfo. 
- for (VirtualColumn vc : VirtualColumn.VIRTUAL_COLUMNS) { - if (column.getInternalName().equals(vc.getName())) { + if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(column.getInternalName())) { return true; - } } return false; } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java index b87cf7449679a9b6da997010056e388fb3de9945..f232c57be23a9e847adba9925c61172c51b2196a 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java @@ -88,6 +88,26 @@ public void testDecimalType() throws Exception { } @Test + public void testCharType() throws Exception { + testConversion( + "a", + "char(5)", + "message hive_schema {\n" + + " optional binary a (UTF8);\n" + + "}\n"); + } + + @Test + public void testVarcharType() throws Exception { + testConversion( + "a", + "varchar(10)", + "message hive_schema {\n" + + " optional binary a (UTF8);\n" + + "}\n"); + } + + @Test public void testArray() throws Exception { testConversion("arrayCol", "array", diff --git ql/src/test/queries/clientnegative/parquet_char.q ql/src/test/queries/clientnegative/parquet_char.q deleted file mode 100644 index 745a7867264e321c079d8146f60d14ae186bbc29..0000000000000000000000000000000000000000 --- ql/src/test/queries/clientnegative/parquet_char.q +++ /dev/null @@ -1,3 +0,0 @@ -drop table if exists parquet_char; - -create table parquet_char (t char(10)) stored as parquet; diff --git ql/src/test/queries/clientnegative/parquet_varchar.q ql/src/test/queries/clientnegative/parquet_varchar.q deleted file mode 100644 index 55825f76dc240c54ef451ceec12adee23f12b36c..0000000000000000000000000000000000000000 --- ql/src/test/queries/clientnegative/parquet_varchar.q +++ /dev/null @@ -1,3 +0,0 @@ -drop table if exists parquet_varchar; - -create table parquet_varchar (t varchar(10)) stored as parquet; diff --git ql/src/test/queries/clientpositive/parquet_types.q ql/src/test/queries/clientpositive/parquet_types.q index cb0dcfdf2d637854a84b165f8565fcb683617696..86af5af40bbb95472d7ef5df6519469cba9a129d 100644 --- ql/src/test/queries/clientpositive/parquet_types.q +++ ql/src/test/queries/clientpositive/parquet_types.q @@ -8,7 +8,9 @@ CREATE TABLE parquet_types_staging ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'; @@ -19,7 +21,9 @@ CREATE TABLE parquet_types ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) STORED AS PARQUET; LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging; diff --git ql/src/test/results/clientnegative/parquet_char.q.out ql/src/test/results/clientnegative/parquet_char.q.out deleted file mode 100644 index 8c9a52c63416eaeaf99cb51b9f386f886483f29c..0000000000000000000000000000000000000000 --- ql/src/test/results/clientnegative/parquet_char.q.out +++ /dev/null @@ -1,9 +0,0 @@ -PREHOOK: query: drop table if exists parquet_char -PREHOOK: type: DROPTABLE -POSTHOOK: query: drop table if exists parquet_char -POSTHOOK: type: DROPTABLE -PREHOOK: query: create table parquet_char (t char(10)) stored as parquet -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@parquet_char -FAILED: Execution Error, return code 1 from 
org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.UnsupportedOperationException: Parquet does not support char. See HIVE-6384 diff --git ql/src/test/results/clientnegative/parquet_timestamp.q.out ql/src/test/results/clientnegative/parquet_timestamp.q.out deleted file mode 100644 index 00973b7e1f6360ce830a8baa4b959491ccc87a9b..0000000000000000000000000000000000000000 --- ql/src/test/results/clientnegative/parquet_timestamp.q.out +++ /dev/null @@ -1,8 +0,0 @@ -PREHOOK: query: drop table if exists parquet_timestamp -PREHOOK: type: DROPTABLE -POSTHOOK: query: drop table if exists parquet_timestamp -POSTHOOK: type: DROPTABLE -PREHOOK: query: create table parquet_timestamp (t timestamp) stored as parquet -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.UnsupportedOperationException: Parquet does not support timestamp. See HIVE-6384 diff --git ql/src/test/results/clientnegative/parquet_varchar.q.out ql/src/test/results/clientnegative/parquet_varchar.q.out deleted file mode 100644 index 90f6db25960825472270532811b8a17d9774d412..0000000000000000000000000000000000000000 --- ql/src/test/results/clientnegative/parquet_varchar.q.out +++ /dev/null @@ -1,9 +0,0 @@ -PREHOOK: query: drop table if exists parquet_varchar -PREHOOK: type: DROPTABLE -POSTHOOK: query: drop table if exists parquet_varchar -POSTHOOK: type: DROPTABLE -PREHOOK: query: create table parquet_varchar (t varchar(10)) stored as parquet -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@parquet_varchar -FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.UnsupportedOperationException: Parquet does not support varchar. 
See HIVE-6384 diff --git ql/src/test/results/clientpositive/parquet_types.q.out ql/src/test/results/clientpositive/parquet_types.q.out index 3acb0520bab023238e19b728ffedc3344c7f1a06..803a826ba0c386af784dd24c0455ac1939af380b 100644 --- ql/src/test/results/clientpositive/parquet_types.q.out +++ ql/src/test/results/clientpositive/parquet_types.q.out @@ -13,7 +13,9 @@ PREHOOK: query: CREATE TABLE parquet_types_staging ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' PREHOOK: type: CREATETABLE @@ -26,7 +28,9 @@ POSTHOOK: query: CREATE TABLE parquet_types_staging ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' POSTHOOK: type: CREATETABLE @@ -39,7 +43,9 @@ PREHOOK: query: CREATE TABLE parquet_types ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) STORED AS PARQUET PREHOOK: type: CREATETABLE PREHOOK: Output: database:default @@ -51,7 +57,9 @@ POSTHOOK: query: CREATE TABLE parquet_types ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) STORED AS PARQUET POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default @@ -72,12 +80,14 @@ POSTHOOK: query: INSERT OVERWRITE TABLE parquet_types SELECT * FROM parquet_type POSTHOOK: type: QUERY POSTHOOK: Input: default@parquet_types_staging POSTHOOK: Output: default@parquet_types +POSTHOOK: Lineage: parquet_types.cchar SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cchar, type:char(5), comment:null), ] POSTHOOK: Lineage: parquet_types.cdouble SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cdouble, type:double, comment:null), ] POSTHOOK: Lineage: parquet_types.cfloat SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cfloat, type:float, comment:null), ] POSTHOOK: Lineage: parquet_types.cint SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cint, type:int, comment:null), ] POSTHOOK: Lineage: parquet_types.csmallint SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:csmallint, type:smallint, comment:null), ] POSTHOOK: Lineage: parquet_types.cstring1 SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cstring1, type:string, comment:null), ] POSTHOOK: Lineage: parquet_types.ctinyint SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:ctinyint, type:tinyint, comment:null), ] +POSTHOOK: Lineage: parquet_types.cvarchar SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ] POSTHOOK: Lineage: parquet_types.t SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:t, type:timestamp, comment:null), ] PREHOOK: query: SELECT * FROM parquet_types PREHOOK: type: QUERY @@ -87,27 +97,27 @@ POSTHOOK: query: SELECT * FROM parquet_types POSTHOOK: type: QUERY POSTHOOK: Input: default@parquet_types #### A masked pattern was here #### -100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 -101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 -102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 -103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 -104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 -105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 -106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 -107 2 3 1.2 2.1 
vwx 2018-08-08 08:08:08.888888888 -108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 -109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 -110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 -111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 -112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 -113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 -114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 -115 1 1 1.0 4.5 tuv 2026-04-04 16:16:16.161616161 -116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 -117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 -118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 -119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 -120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 +100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 a a +101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 ab ab +102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 abc abc +103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 abcd abcd +104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 abcde abcde +105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 abcde abcdef +106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 abcde abcdefg +107 2 3 1.2 2.1 vwx 2018-08-08 08:08:08.888888888 bcdef abcdefgh +108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 cdefg abcdefghij +109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 klmno abcdedef +110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 pqrst abcdede +111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 nopqr abcded +112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 opqrs abcdd +113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 pqrst abc +114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 qrstu b +115 1 1 1.0 4.5 tuv 2026-04-04 16:16:16.161616161 rstuv abcded +116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 stuvw abcded +117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 tuvwx abcded +118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 uvwzy abcdede +119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 vwxyz abcdede +120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 wxyza abcde PREHOOK: query: SELECT ctinyint, MAX(cint), MIN(csmallint), diff --git serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java index ade3b5f081eb71e5cf4e639aff8bff6447d68dfc..df27db2e857ef626dc031c3ee34f89d382a0ebae 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java +++ serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java @@ -52,7 +52,7 @@ * Deserialize an object out of a Writable blob. In most cases, the return * value of this function will be constant since the function will reuse the * returned object. If the client wants to keep a copy of the object, the - * client needs to clone the returnDeserializered value by calling + * client needs to clone the returned deserialized value by calling * ObjectInspectorUtils.getStandardObject(). * * @param blob diff --git serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java index e7f3f4837ab253a825a7210f56f595b2403e7385..70dc18158122d5a81239c4975eaf818094a3c8e8 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java +++ serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java @@ -26,10 +26,11 @@ * Stores information about a type. Always use the TypeInfoFactory to create new * TypeInfo objects. * - * We support 5 categories of types: 1. Primitive objects (String, Number, etc) + * We support 8 categories of types: 1. 
Primitive objects (String, Number, etc) * 2. List objects (a list of objects of a single type) 3. Map objects (a map * from objects of one type to objects of another type) 4. Struct objects (a * list of fields with names and their own types) 5. Union objects + * 6. Decimal objects 7. Char objects 8. Varchar objects */ public abstract class TypeInfo implements Serializable {
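
Taken together with the new TestHiveSchemaConverter cases, the schema side of this change maps both char(n) and varchar(n) to an optional binary (UTF8) field, so the declared length is enforced by Hive rather than recorded in the Parquet schema. On the read path, ETypeConverter.getNewConverter now picks a converter for a UTF8 binary column by inspecting the Hive column type name at that field index. The following is a minimal, self-contained sketch of that dispatch only; the enum and the bare "char"/"varchar" literals are illustrative stand-ins for the real converters and for serdeConstants.CHAR_TYPE_NAME / serdeConstants.VARCHAR_TYPE_NAME.

public class Utf8ConverterDispatchSketch {
  enum ConverterKind { ECHAR_CONVERTER, EVARCHAR_CONVERTER, ESTRING_CONVERTER }

  // "char" / "varchar" stand in for serdeConstants.CHAR_TYPE_NAME / VARCHAR_TYPE_NAME;
  // checking "char" first is safe because "varchar(10)" does not start with "char".
  static ConverterKind forUtf8Column(String hiveTypeName) {
    if (hiveTypeName.startsWith("char")) {
      return ConverterKind.ECHAR_CONVERTER;     // e.g. "char(5)"
    } else if (hiveTypeName.startsWith("varchar")) {
      return ConverterKind.EVARCHAR_CONVERTER;  // e.g. "varchar(10)"
    }
    return ConverterKind.ESTRING_CONVERTER;     // plain "string" keeps the old behaviour
  }

  public static void main(String[] args) {
    System.out.println(forUtf8Column("char(5)"));     // ECHAR_CONVERTER
    System.out.println(forUtf8Column("varchar(10)")); // EVARCHAR_CONVERTER
    System.out.println(forUtf8Column("string"));      // ESTRING_CONVERTER
  }
}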
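
The new DataWritableReadSupport.getColumnTypes / VirtualColumn.removeVirtualColumnTypes pair is needed because the COLUMNS and COLUMNS_TYPES job properties also describe virtual columns, and the type list has to be trimmed in lock step with the name list before it can be indexed by field position in the converters. A small self-contained sketch of that filtering, using String in place of TypeInfo; the virtual column names below are illustrative rather than taken from this patch.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;

public class VirtualColumnTypeFilterSketch {
  // Illustrative stand-in for VirtualColumn.VIRTUAL_COLUMN_NAMES.
  static final Set<String> VIRTUAL_COLUMN_NAMES = new HashSet<String>(Arrays.asList(
      "INPUT__FILE__NAME", "BLOCK__OFFSET__INSIDE__FILE", "ROW__OFFSET__INSIDE__BLOCK",
      "RAW__DATA__SIZE", "GROUPING__ID", "ROW__ID"));

  // Walks names and types in lock step and drops the type entries whose column is
  // virtual, mirroring the new VirtualColumn.removeVirtualColumnTypes.
  static List<String> removeVirtualColumnTypes(List<String> columnNames, List<String> columnTypes) {
    if (columnNames.size() != columnTypes.size()) {
      throw new IllegalArgumentException("Number of column names " + columnNames.size()
          + " differs from column types " + columnTypes.size());
    }
    int i = 0;
    ListIterator<String> it = columnTypes.listIterator();
    while (it.hasNext()) {
      it.next();
      if (VIRTUAL_COLUMN_NAMES.contains(columnNames.get(i))) {
        it.remove();
      }
      ++i;
    }
    return columnTypes;
  }

  public static void main(String[] args) {
    List<String> names = Arrays.asList("cchar", "BLOCK__OFFSET__INSIDE__FILE", "cvarchar");
    List<String> types = new ArrayList<String>(Arrays.asList("char(5)", "bigint", "varchar(10)"));
    System.out.println(removeVirtualColumnTypes(names, types)); // prints [char(5), varchar(10)]
  }
}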
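
On the write path, ParquetHiveSerDe.createPrimitive handles the two new primitive categories by converting to UTF-8 bytes: char values first drop their trailing pad blanks via HiveChar.getStrippedValue(), varchar values are written unchanged, and both are wrapped as a BytesWritable over Binary.fromString(...). A rough JDK-only illustration of that padding behaviour; the helper names are made up for the example.

import java.nio.charset.StandardCharsets;

public class CharVarcharWriteSketch {
  // Stand-in for HiveChar.getStrippedValue(): a char(5) holding "abc" is presented as
  // "abc  " (blank-padded), but the trailing blanks are dropped before writing to Parquet.
  static byte[] charToParquetBytes(String paddedCharValue) {
    int end = paddedCharValue.length();
    while (end > 0 && paddedCharValue.charAt(end - 1) == ' ') {
      end--;
    }
    return paddedCharValue.substring(0, end).getBytes(StandardCharsets.UTF_8);
  }

  // Varchar values carry no padding and are written as-is.
  static byte[] varcharToParquetBytes(String varcharValue) {
    return varcharValue.getBytes(StandardCharsets.UTF_8);
  }

  public static void main(String[] args) {
    System.out.println(charToParquetBytes("abc  ").length);  // 3
    System.out.println(varcharToParquetBytes("abc").length); // 3
  }
}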