diff --git data/files/parquet_types.txt data/files/parquet_types.txt index 9d81c3c..750626e 100644 --- data/files/parquet_types.txt +++ data/files/parquet_types.txt @@ -1,21 +1,21 @@ -100|1|1|1.0|0.0|abc|2011-01-01 01:01:01.111111111 -101|2|2|1.1|0.3|def|2012-02-02 02:02:02.222222222 -102|3|3|1.2|0.6|ghi|2013-03-03 03:03:03.333333333 -103|1|4|1.3|0.9|jkl|2014-04-04 04:04:04.444444444 -104|2|5|1.4|1.2|mno|2015-05-05 05:05:05.555555555 -105|3|1|1.0|1.5|pqr|2016-06-06 06:06:06.666666666 -106|1|2|1.1|1.8|stu|2017-07-07 07:07:07.777777777 -107|2|3|1.2|2.1|vwx|2018-08-08 08:08:08.888888888 -108|3|4|1.3|2.4|yza|2019-09-09 09:09:09.999999999 -109|1|5|1.4|2.7|bcd|2020-10-10 10:10:10.101010101 -110|2|1|1.0|3.0|efg|2021-11-11 11:11:11.111111111 -111|3|2|1.1|3.3|hij|2022-12-12 12:12:12.121212121 -112|1|3|1.2|3.6|klm|2023-01-02 13:13:13.131313131 -113|2|4|1.3|3.9|nop|2024-02-02 14:14:14.141414141 -114|3|5|1.4|4.2|qrs|2025-03-03 15:15:15.151515151 -115|1|1|1.0|4.5|tuv|2026-04-04 16:16:16.161616161 -116|2|2|1.1|4.8|wxy|2027-05-05 17:17:17.171717171 -117|3|3|1.2|5.1|zab|2028-06-06 18:18:18.181818181 -118|1|4|1.3|5.4|cde|2029-07-07 19:19:19.191919191 -119|2|5|1.4|5.7|fgh|2030-08-08 20:20:20.202020202 -120|3|1|1.0|6.0|ijk|2031-09-09 21:21:21.212121212 +100|1|1|1.0|0.0|abc|2011-01-01 01:01:01.111111111|a |a +101|2|2|1.1|0.3|def|2012-02-02 02:02:02.222222222|ab |ab +102|3|3|1.2|0.6|ghi|2013-03-03 03:03:03.333333333|abc|abc +103|1|4|1.3|0.9|jkl|2014-04-04 04:04:04.444444444|abcd|abcd +104|2|5|1.4|1.2|mno|2015-05-05 05:05:05.555555555|abcde|abcde +105|3|1|1.0|1.5|pqr|2016-06-06 06:06:06.666666666|abcdef|abcdef +106|1|2|1.1|1.8|stu|2017-07-07 07:07:07.777777777|abcdefg|abcdefg +107|2|3|1.2|2.1|vwx|2018-08-08 08:08:08.888888888|bcdefg|abcdefgh +108|3|4|1.3|2.4|yza|2019-09-09 09:09:09.999999999|cdefg|abcdefghijklmnop +109|1|5|1.4|2.7|bcd|2020-10-10 10:10:10.101010101|klmno|abcdedef +110|2|1|1.0|3.0|efg|2021-11-11 11:11:11.111111111|pqrst|abcdede +111|3|2|1.1|3.3|hij|2022-12-12 12:12:12.121212121|nopqr|abcded +112|1|3|1.2|3.6|klm|2023-01-02 13:13:13.131313131|opqrs|abcdd +113|2|4|1.3|3.9|nop|2024-02-02 14:14:14.141414141|pqrst|abc +114|3|5|1.4|4.2|qrs|2025-03-03 15:15:15.151515151|qrstu|b +115|1|1|1.0|4.5|tuv|2026-04-04 16:16:16.161616161|rstuv|abcded +116|2|2|1.1|4.8|wxy|2027-05-05 17:17:17.171717171|stuvw|abcded +117|3|3|1.2|5.1|zab|2028-06-06 18:18:18.181818181|tuvwx|abcded +118|1|4|1.3|5.4|cde|2029-07-07 19:19:19.191919191|uvwzy|abcdede +119|2|5|1.4|5.7|fgh|2030-08-08 20:20:20.202020202|vwxyz|abcdede +120|3|1|1.0|6.0|ijk|2031-09-09 21:21:21.212121212|wxyza|abcde diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java index 582a5df..c5d80f2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java @@ -13,6 +13,9 @@ */ package org.apache.hadoop.hive.ql.io.parquet.convert; +import java.util.List; + +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.Writable; @@ -30,7 +33,7 @@ private Writable[] mapPairContainer; public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent, - final int index) { + final int index, List hiveSchemaTypeInfos) { this.parent = parent; this.index = index; int count = groupType.getFieldCount(); @@ -40,7 +43,8 @@ 
public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupCon
     isMap = count == 2;
     converters = new Converter[count];
     for (int i = 0; i < count; i++) {
-      converters[i] = getConverterFromDescription(groupType.getType(i), i, this);
+      converters[i] = getConverterFromDescription(groupType.getType(i), i, this,
+          hiveSchemaTypeInfos);
     }
   }
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java
index 0e310fb..48e4a13 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java
@@ -16,6 +16,7 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.io.ArrayWritable;
 import org.apache.hadoop.io.Writable;
 
@@ -36,19 +37,21 @@
   private final Object[] currentArr;
   private Writable[] rootMap;
 
-  public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema) {
-    this(requestedSchema, null, 0, tableSchema);
+  public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema,
+      final List<TypeInfo> hiveSchemaTypeInfos) {
+    this(requestedSchema, null, 0, tableSchema, hiveSchemaTypeInfos);
     final int fieldCount = tableSchema.getFieldCount();
     this.rootMap = new Writable[fieldCount];
   }
 
   public DataWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent,
-      final int index) {
-    this(groupType, parent, index, groupType);
+      final int index, final List<TypeInfo> hiveSchemaTypeInfos) {
+    this(groupType, parent, index, groupType, hiveSchemaTypeInfos);
   }
 
   public DataWritableGroupConverter(final GroupType selectedGroupType,
-      final HiveGroupConverter parent, final int index, final GroupType containingGroupType) {
+      final HiveGroupConverter parent, final int index, final GroupType containingGroupType,
+      final List<TypeInfo> hiveSchemaTypeInfos) {
     this.parent = parent;
     this.index = index;
     final int totalFieldCount = containingGroupType.getFieldCount();
@@ -62,7 +65,8 @@ public DataWritableGroupConverter(final GroupType selectedGroupType,
       Type subtype = selectedFields.get(i);
       if (containingGroupType.getFields().contains(subtype)) {
         converters[i] = getConverterFromDescription(subtype,
-            containingGroupType.getFieldIndex(subtype.getName()), this);
+            containingGroupType.getFieldIndex(subtype.getName()), this,
+            hiveSchemaTypeInfos);
       } else {
         throw new IllegalStateException("Group type [" + containingGroupType +
             "] does not contain requested field: " + subtype);
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java
index 7762afe..0971a68 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java
@@ -13,6 +13,9 @@
  */
 package org.apache.hadoop.hive.ql.io.parquet.convert;
 
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.io.ArrayWritable;
 
 import parquet.io.api.GroupConverter;
@@ -28,8 +31,10 @@
   private final DataWritableGroupConverter root;
 
-  public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema) {
-    this.root = new
        DataWritableGroupConverter(requestedSchema, tableSchema);
+  public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema,
+      final List<TypeInfo> hiveColumnTypeInfos) {
+    this.root = new DataWritableGroupConverter(requestedSchema, tableSchema,
+        hiveColumnTypeInfos);
   }
 
   @Override
@@ -41,4 +46,4 @@ public ArrayWritable getCurrentRecord() {
   public GroupConverter getRootConverter() {
     return root;
   }
-}
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
index 67ce151..e6fb5ae 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
@@ -16,12 +16,19 @@
 import java.math.BigDecimal;
 import java.sql.Timestamp;
 import java.util.ArrayList;
+import java.util.List;
 
+import org.apache.hadoop.hive.common.type.HiveChar;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
 import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime;
 import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
+import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
 import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
 import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.io.BooleanWritable;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.FloatWritable;
@@ -145,6 +152,32 @@ protected TimestampWritable convert(Binary binary) {
         }
       };
     }
+  },
+  ECHAR_CONVERTER(HiveCharWritable.class) {
+    @Override
+    Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) {
+      return new BinaryConverter<HiveCharWritable>(type, parent, index) {
+        @Override
+        protected HiveCharWritable convert(Binary binary) {
+          HiveChar hiveChar = new HiveChar();
+          hiveChar.setValue(binary.toStringUsingUTF8());
+          return new HiveCharWritable(hiveChar);
+        }
+      };
+    }
+  },
+  EVARCHAR_CONVERTER(HiveVarcharWritable.class) {
+    @Override
+    Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) {
+      return new BinaryConverter<HiveVarcharWritable>(type, parent, index) {
+        @Override
+        protected HiveVarcharWritable convert(Binary binary) {
+          HiveVarchar hiveVarchar = new HiveVarchar();
+          hiveVarchar.setValue(binary.toStringUsingUTF8());
+          return new HiveVarcharWritable(hiveVarchar);
+        }
+      };
+    }
   };
 
   final Class<?> _type;
@@ -159,7 +192,8 @@ private ETypeConverter(final Class<?> type) {
   abstract Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent);
 
-  public static Converter getNewConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) {
+  public static Converter getNewConverter(final PrimitiveType type, final int index,
+      final HiveGroupConverter parent, List<TypeInfo> hiveSchemaTypeInfos) {
     if (type.isPrimitive() && (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96))) {
       //TODO- cleanup once parquet support Timestamp type annotation.
      return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent);
@@ -167,7 +201,15 @@ public static Converter getNewConverter(final PrimitiveType type, final int inde
     if (OriginalType.DECIMAL == type.getOriginalType()) {
       return EDECIMAL_CONVERTER.getConverter(type, index, parent);
     } else if (OriginalType.UTF8 == type.getOriginalType()) {
-      return ESTRING_CONVERTER.getConverter(type, index, parent);
+      if (hiveSchemaTypeInfos.get(index).getTypeName()
+          .startsWith(serdeConstants.CHAR_TYPE_NAME)) {
+        return ECHAR_CONVERTER.getConverter(type, index, parent);
+      } else if (hiveSchemaTypeInfos.get(index).getTypeName()
+          .startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
+        return EVARCHAR_CONVERTER.getConverter(type, index, parent);
+      } else if (type.isPrimitive()) {
+        return ESTRING_CONVERTER.getConverter(type, index, parent);
+      }
     }
 
     Class<?> javaType = type.getPrimitiveTypeName().javaType;
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java
index 524a293..a364729 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java
@@ -13,6 +13,9 @@
  */
 package org.apache.hadoop.hive.ql.io.parquet.convert;
 
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.io.Writable;
 
 import parquet.io.api.Converter;
@@ -23,17 +26,20 @@ public abstract class HiveGroupConverter extends GroupConverter {
 
   protected static Converter getConverterFromDescription(final Type type, final int index,
-      final HiveGroupConverter parent) {
+      final HiveGroupConverter parent, List<TypeInfo> hiveSchemaTypeInfos) {
     if (type == null) {
       return null;
     }
     if (type.isPrimitive()) {
-      return ETypeConverter.getNewConverter(type.asPrimitiveType(), index, parent);
+      return ETypeConverter.getNewConverter(type.asPrimitiveType(), index, parent,
+          hiveSchemaTypeInfos);
     } else {
       if (type.asGroupType().getRepetition() == Repetition.REPEATED) {
-        return new ArrayWritableGroupConverter(type.asGroupType(), parent, index);
+        return new ArrayWritableGroupConverter(type.asGroupType(), parent, index,
+            hiveSchemaTypeInfos);
       } else {
-        return new DataWritableGroupConverter(type.asGroupType(), parent, index);
+        return new DataWritableGroupConverter(type.asGroupType(), parent, index,
+            hiveSchemaTypeInfos);
       }
     }
   }
@@ -42,4 +48,4 @@ protected static Converter getConverterFromDescription(final Type type, final in
   protected abstract void add(int index, Writable value);
 
-}
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java
index 99901f0..3116451 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java
@@ -16,6 +16,7 @@
 import java.util.List;
 
 import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
@@ -25,7 +26,6 @@
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 
 import parquet.schema.ConversionPatterns;
-import
        parquet.schema.DecimalMetadata;
 import parquet.schema.GroupType;
 import parquet.schema.MessageType;
 import parquet.schema.OriginalType;
@@ -81,6 +81,14 @@ private static Type convertType(final String name, final TypeInfo typeInfo, fina
       return new PrimitiveType(repetition, PrimitiveTypeName.INT96, name);
     } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) {
       throw new UnsupportedOperationException("Void type not implemented");
+    } else if (typeInfo.getTypeName().toLowerCase().startsWith(
+        serdeConstants.CHAR_TYPE_NAME)) {
+      return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
+          .named(name);
+    } else if (typeInfo.getTypeName().toLowerCase().startsWith(
+        serdeConstants.VARCHAR_TYPE_NAME)) {
+      return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
+          .named(name);
     } else if (typeInfo instanceof DecimalTypeInfo) {
       DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;
       int prec = decimalTypeInfo.precision();
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
index d6be4bd..a91accb 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
@@ -14,6 +14,7 @@
 package org.apache.hadoop.hive.ql.io.parquet.read;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -23,6 +24,8 @@
 import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter;
 import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.hadoop.io.ArrayWritable;
 import org.apache.hadoop.util.StringUtils;
@@ -60,6 +63,28 @@
     return (List<String>) VirtualColumn.
         removeVirtualColumns(StringUtils.getStringCollection(columns));
   }
+
+  private static List<TypeInfo> getColumnTypes(Configuration configuration) {
+
+    List<String> columnNames;
+    String columnNamesProperty = configuration.get(IOConstants.COLUMNS);
+    if (columnNamesProperty.length() == 0) {
+      columnNames = new ArrayList<String>();
+    } else {
+      columnNames = Arrays.asList(columnNamesProperty.split(","));
+    }
+    List<TypeInfo> columnTypes;
+    String columnTypesProperty = configuration.get(IOConstants.COLUMNS_TYPES);
+    if (columnTypesProperty.length() == 0) {
+      columnTypes = new ArrayList<TypeInfo>();
+    } else {
+      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypesProperty);
+    }
+
+    columnTypes = VirtualColumn.removeVirtualColumnTypes(columnNames, columnTypes);
+    return columnTypes;
+  }
+
   /**
    *
    * It creates the readContext for Parquet side with the requested schema during the init phase.
@@ -146,7 +171,8 @@
     }
     final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration); - return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema); + return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema, + getColumnTypes(configuration)); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java index 47bf69c..d5aae3b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java @@ -25,12 +25,14 @@ import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.ArrayWritable; /** @@ -102,12 +104,10 @@ private ObjectInspector getObjectInspector(final TypeInfo typeInfo) { return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; } else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) { throw new UnsupportedOperationException("Parquet does not support date. See HIVE-6384"); - } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.DECIMAL_TYPE_NAME)) { - throw new UnsupportedOperationException("Parquet does not support decimal. See HIVE-6384"); } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.CHAR_TYPE_NAME)) { - throw new UnsupportedOperationException("Parquet does not support char. See HIVE-6384"); + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((CharTypeInfo) typeInfo); } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { - throw new UnsupportedOperationException("Parquet does not support varchar. 
See HIVE-6384"); + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((VarcharTypeInfo) typeInfo); } else { throw new UnsupportedOperationException("Unknown field type: " + typeInfo); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java index e3e327c..e5c663e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -42,6 +42,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; @@ -60,6 +62,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import parquet.io.api.Binary; /** * @@ -280,6 +283,12 @@ private Writable createPrimitive(final Object obj, final PrimitiveObjectInspecto return new BytesWritable(tgt); case TIMESTAMP: return new TimestampWritable(((TimestampObjectInspector) inspector).getPrimitiveJavaObject(obj)); + case CHAR: + String strippedValue = ((HiveCharObjectInspector) inspector).getPrimitiveJavaObject(obj).getStrippedValue(); + return new BytesWritable(Binary.fromString(strippedValue).getBytes()); + case VARCHAR: + String value = ((HiveVarcharObjectInspector) inspector).getPrimitiveJavaObject(obj).getValue(); + return new BytesWritable(Binary.fromString(value).getBytes()); default: throw new SerDeException("Unknown primitive : " + inspector.getPrimitiveCategory()); } diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java index 0637d46..918dde0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.ListIterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; @@ -30,6 +31,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; public class VirtualColumn implements Serializable { @@ -139,6 +141,27 @@ public boolean equals(Object o) { return columns; } + public static List removeVirtualColumnTypes(final List columnNames, + final List columnTypes) { + if (columnNames.size() != columnTypes.size()) { + throw new IllegalArgumentException("Number of column names in configuration " + + columnNames.size() + " differs from column types " + columnTypes.size()); + } + + int i = 0; + ListIterator it = 
columnTypes.listIterator(); + while(it.hasNext()) { + it.next(); + for(VirtualColumn vcol : VIRTUAL_COLUMNS) { + if (columnNames.get(i).equals(vcol.getName())) { + it.remove(); + } + } + ++i; + } + return columnTypes; + } + public static StructObjectInspector getVCSObjectInspector(List vcs) { List names = new ArrayList(vcs.size()); List inspectors = new ArrayList(vcs.size()); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java index b87cf74..f232c57 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java @@ -88,6 +88,26 @@ public void testDecimalType() throws Exception { } @Test + public void testCharType() throws Exception { + testConversion( + "a", + "char(5)", + "message hive_schema {\n" + + " optional binary a (UTF8);\n" + + "}\n"); + } + + @Test + public void testVarcharType() throws Exception { + testConversion( + "a", + "varchar(10)", + "message hive_schema {\n" + + " optional binary a (UTF8);\n" + + "}\n"); + } + + @Test public void testArray() throws Exception { testConversion("arrayCol", "array", diff --git ql/src/test/queries/clientnegative/parquet_char.q ql/src/test/queries/clientnegative/parquet_char.q deleted file mode 100644 index 745a786..0000000 --- ql/src/test/queries/clientnegative/parquet_char.q +++ /dev/null @@ -1,3 +0,0 @@ -drop table if exists parquet_char; - -create table parquet_char (t char(10)) stored as parquet; diff --git ql/src/test/queries/clientnegative/parquet_varchar.q ql/src/test/queries/clientnegative/parquet_varchar.q deleted file mode 100644 index 55825f7..0000000 --- ql/src/test/queries/clientnegative/parquet_varchar.q +++ /dev/null @@ -1,3 +0,0 @@ -drop table if exists parquet_varchar; - -create table parquet_varchar (t varchar(10)) stored as parquet; diff --git ql/src/test/queries/clientpositive/parquet_types.q ql/src/test/queries/clientpositive/parquet_types.q index cb0dcfd..86af5af 100644 --- ql/src/test/queries/clientpositive/parquet_types.q +++ ql/src/test/queries/clientpositive/parquet_types.q @@ -8,7 +8,9 @@ CREATE TABLE parquet_types_staging ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'; @@ -19,7 +21,9 @@ CREATE TABLE parquet_types ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) STORED AS PARQUET; LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging; diff --git ql/src/test/results/clientnegative/parquet_char.q.out ql/src/test/results/clientnegative/parquet_char.q.out deleted file mode 100644 index 8c9a52c..0000000 --- ql/src/test/results/clientnegative/parquet_char.q.out +++ /dev/null @@ -1,9 +0,0 @@ -PREHOOK: query: drop table if exists parquet_char -PREHOOK: type: DROPTABLE -POSTHOOK: query: drop table if exists parquet_char -POSTHOOK: type: DROPTABLE -PREHOOK: query: create table parquet_char (t char(10)) stored as parquet -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@parquet_char -FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.UnsupportedOperationException: Parquet does not support char. 
See HIVE-6384 diff --git ql/src/test/results/clientnegative/parquet_timestamp.q.out ql/src/test/results/clientnegative/parquet_timestamp.q.out deleted file mode 100644 index 00973b7..0000000 --- ql/src/test/results/clientnegative/parquet_timestamp.q.out +++ /dev/null @@ -1,8 +0,0 @@ -PREHOOK: query: drop table if exists parquet_timestamp -PREHOOK: type: DROPTABLE -POSTHOOK: query: drop table if exists parquet_timestamp -POSTHOOK: type: DROPTABLE -PREHOOK: query: create table parquet_timestamp (t timestamp) stored as parquet -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.UnsupportedOperationException: Parquet does not support timestamp. See HIVE-6384 diff --git ql/src/test/results/clientnegative/parquet_varchar.q.out ql/src/test/results/clientnegative/parquet_varchar.q.out deleted file mode 100644 index 90f6db2..0000000 --- ql/src/test/results/clientnegative/parquet_varchar.q.out +++ /dev/null @@ -1,9 +0,0 @@ -PREHOOK: query: drop table if exists parquet_varchar -PREHOOK: type: DROPTABLE -POSTHOOK: query: drop table if exists parquet_varchar -POSTHOOK: type: DROPTABLE -PREHOOK: query: create table parquet_varchar (t varchar(10)) stored as parquet -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@parquet_varchar -FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.UnsupportedOperationException: Parquet does not support varchar. See HIVE-6384 diff --git ql/src/test/results/clientpositive/parquet_types.q.out ql/src/test/results/clientpositive/parquet_types.q.out index 3acb052..8c4ab18 100644 --- ql/src/test/results/clientpositive/parquet_types.q.out +++ ql/src/test/results/clientpositive/parquet_types.q.out @@ -13,7 +13,9 @@ PREHOOK: query: CREATE TABLE parquet_types_staging ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' PREHOOK: type: CREATETABLE @@ -26,7 +28,9 @@ POSTHOOK: query: CREATE TABLE parquet_types_staging ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' POSTHOOK: type: CREATETABLE @@ -39,7 +43,9 @@ PREHOOK: query: CREATE TABLE parquet_types ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) STORED AS PARQUET PREHOOK: type: CREATETABLE PREHOOK: Output: database:default @@ -51,7 +57,9 @@ POSTHOOK: query: CREATE TABLE parquet_types ( cfloat float, cdouble double, cstring1 string, - t timestamp + t timestamp, + cchar char(5), + cvarchar varchar(10) ) STORED AS PARQUET POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default @@ -72,12 +80,14 @@ POSTHOOK: query: INSERT OVERWRITE TABLE parquet_types SELECT * FROM parquet_type POSTHOOK: type: QUERY POSTHOOK: Input: default@parquet_types_staging POSTHOOK: Output: default@parquet_types +POSTHOOK: Lineage: parquet_types.cchar SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cchar, type:char(5), comment:null), ] POSTHOOK: Lineage: parquet_types.cdouble SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cdouble, type:double, comment:null), ] POSTHOOK: Lineage: parquet_types.cfloat SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cfloat, type:float, comment:null), ] POSTHOOK: Lineage: 
parquet_types.cint SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cint, type:int, comment:null), ] POSTHOOK: Lineage: parquet_types.csmallint SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:csmallint, type:smallint, comment:null), ] POSTHOOK: Lineage: parquet_types.cstring1 SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cstring1, type:string, comment:null), ] POSTHOOK: Lineage: parquet_types.ctinyint SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:ctinyint, type:tinyint, comment:null), ] +POSTHOOK: Lineage: parquet_types.cvarchar SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ] POSTHOOK: Lineage: parquet_types.t SIMPLE [(parquet_types_staging)parquet_types_staging.FieldSchema(name:t, type:timestamp, comment:null), ] PREHOOK: query: SELECT * FROM parquet_types PREHOOK: type: QUERY @@ -87,27 +97,27 @@ POSTHOOK: query: SELECT * FROM parquet_types POSTHOOK: type: QUERY POSTHOOK: Input: default@parquet_types #### A masked pattern was here #### -100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 -101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 -102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 -103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 -104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 -105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 -106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 -107 2 3 1.2 2.1 vwx 2018-08-08 08:08:08.888888888 -108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 -109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 -110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 -111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 -112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 -113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 -114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 -115 1 1 1.0 4.5 tuv 2026-04-04 16:16:16.161616161 -116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 -117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 -118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 -119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 -120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 +100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 a a +101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 ab ab +102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 abc abc +103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 abcd abcd +104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 abcde abcde +105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 abcde abcdef +106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 abcde abcdefg +107 2 3 1.2 2.1 vwx 2018-08-08 08:08:08.888888888 bcdef abcdefgh +108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 cdefg abcdefghi +109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 klmno abcdedef +110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 pqrst abcdede +111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 nopqr abcded +112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 opqrs abcdd +113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 pqrst abc +114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 qrstu b +115 1 1 1.0 4.5 tuv 2026-04-04 16:16:16.161616161 rstuv abcded +116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 stuvw abcded +117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 tuvwx abcded +118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 uvwzy abcdede +119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 vwxyz abcdede +120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 wxyza abcde PREHOOK: query: SELECT ctinyint, MAX(cint), 
MIN(csmallint), diff --git serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java index ade3b5f..df27db2 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java +++ serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java @@ -52,7 +52,7 @@ * Deserialize an object out of a Writable blob. In most cases, the return * value of this function will be constant since the function will reuse the * returned object. If the client wants to keep a copy of the object, the - * client needs to clone the returnDeserializered value by calling + * client needs to clone the returned deserialized value by calling * ObjectInspectorUtils.getStandardObject(). * * @param blob diff --git serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java index e7f3f48..70dc181 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java +++ serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java @@ -26,10 +26,11 @@ * Stores information about a type. Always use the TypeInfoFactory to create new * TypeInfo objects. * - * We support 5 categories of types: 1. Primitive objects (String, Number, etc) + * We support 8 categories of types: 1. Primitive objects (String, Number, etc) * 2. List objects (a list of objects of a single type) 3. Map objects (a map * from objects of one type to objects of another type) 4. Struct objects (a * list of fields with names and their own types) 5. Union objects + * 6. Decimal objects 7. Char objects 8. Varchar objects */ public abstract class TypeInfo implements Serializable {
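
Note: the following is an illustrative sketch, not part of the patch, of the write-side mapping performed by the new CHAR and VARCHAR cases in ParquetHiveSerDe.createPrimitive(): the char value is written with its trailing pad spaces stripped, and both types end up as UTF-8 Binary wrapped in a BytesWritable. The class name and main method are made up for the example.

import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.io.BytesWritable;

import parquet.io.api.Binary;

// Illustrative only; mirrors the new CHAR/VARCHAR branches added to createPrimitive().
public class CharVarcharWriteSketch {
  public static void main(String[] args) {
    // char(5): Hive pads "ab" to "ab   "; the serde writes the stripped value,
    // so Parquet stores only the two UTF-8 bytes of "ab".
    HiveChar c = new HiveChar("ab", 5);
    BytesWritable charBytes =
        new BytesWritable(Binary.fromString(c.getStrippedValue()).getBytes());

    // varchar(10): no padding, the value is written as-is.
    HiveVarchar v = new HiveVarchar("abcdefgh", 10);
    BytesWritable varcharBytes =
        new BytesWritable(Binary.fromString(v.getValue()).getBytes());

    System.out.println(charBytes.getLength() + " " + varcharBytes.getLength()); // 2 8
  }
}

On read, the ArrayWritableObjectInspector change hands back char/varchar object inspectors, which re-apply the declared length (and padding for char), so stripping on write does not lose information.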
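A second minimal sketch (again illustrative, not part of the patch) of the read-side dispatch: DataWritableReadSupport.getColumnTypes() turns the job's column type string into TypeInfos, and the patched ETypeConverter.getNewConverter() routes UTF8 binary columns to the char, varchar, or string converter by the TypeInfo name prefix. The type string literal and class name below are assumed for the example.

import java.util.List;

import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

// Illustrative only; replays the prefix check used for UTF8 columns in getNewConverter().
public class ConverterDispatchSketch {
  public static void main(String[] args) {
    // Same kind of type string DataWritableReadSupport reads from the job configuration.
    List<TypeInfo> infos =
        TypeInfoUtils.getTypeInfosFromTypeString("char(5),varchar(10),string");
    for (TypeInfo info : infos) {
      String typeName = info.getTypeName(); // e.g. "char(5)" starts with "char"
      if (typeName.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
        System.out.println(typeName + " -> ECHAR_CONVERTER");
      } else if (typeName.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
        System.out.println(typeName + " -> EVARCHAR_CONVERTER");
      } else {
        System.out.println(typeName + " -> ESTRING_CONVERTER");
      }
    }
  }
}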