diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java index b01f21fd99d37e0209987786d64149704c7bf6e2..f3256a7019dcf7f9fffe8c1216e6f94d26bfde91 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java @@ -54,7 +54,7 @@ public static MessageType convert(final List columnNames, final List groupFields = getProjectedGroupFields( - groupFieldType, - ((StructTypeInfo) colType).getAllStructFieldNames(), - ((StructTypeInfo) colType).getAllStructFieldTypeInfos() - ); - - Type[] typesArray = groupFields.toArray(new Type[0]); - schemaTypes.add(Types.buildGroup(groupFieldType.getRepetition()) - .addFields(typesArray) - .named(fieldType.getName()) - ); - } else { - schemaTypes.add(fieldType); - } + effectiveSchema = getStructType((StructTypeInfo) colType, fieldType); + } else if (colType.getCategory() == ObjectInspector.Category.LIST) { + effectiveSchema = getListType((ListTypeInfo) colType, fieldType, effectiveSchema); + }//TODO: Add support for other types such as map<> } else { // Add type for schema evolution - schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named(colName)); + effectiveSchema = Types.optional(PrimitiveTypeName.BINARY).named(colName); } + schemaTypes.add(effectiveSchema); } return schemaTypes; } + private static Type getListType(ListTypeInfo colType, Type fieldType, Type effectiveSchema) { + //Parquet has multiple List representations. See + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + // However, Hive supports column projection for "3-level structure" for list + + if (!fieldType.isRepetition(Type.Repetition.REPEATED) + && (fieldType.asGroupType().getType(0).getName().equals(ParquetHiveSerDe.ARRAY.toString()) + || fieldType.asGroupType().getType(0).getName().equals(ParquetHiveSerDe.LIST.toString()))) { + if (colType.getListElementTypeInfo().getCategory() == ObjectInspector.Category.STRUCT) { + //for ARRAY> + String elemNameInFile = ((GroupType) fieldType).getType(0).asGroupType().getType(0).getName(); + StructTypeInfo elemType = (StructTypeInfo) colType.getListElementTypeInfo(); + String secondLevelAnnotatedName = ParquetHiveSerDe.ARRAY.toString(); + if (fieldType.asGroupType().getType(0).getName().equals("list")) { + secondLevelAnnotatedName = ParquetHiveSerDe.LIST.toString(); + } + effectiveSchema = HiveSchemaConverter.listWrapper(fieldType.getName(), OriginalType.LIST, + new GroupType(Type.Repetition.REPEATED, + secondLevelAnnotatedName, HiveSchemaConverter.convertType(elemNameInFile, elemType))); + }//TODO: Add support for other types such as array> + } + return effectiveSchema; + } + + private static Type getStructType(StructTypeInfo colType, Type fieldType) { + Type effectiveSchema; + if (fieldType.isPrimitive()) { + throw new IllegalStateException("Invalid schema data type, found: PRIMITIVE, expected: STRUCT"); + } + + GroupType groupFieldType = fieldType.asGroupType(); + + List groupFields = getProjectedGroupFields( + groupFieldType, + colType.getAllStructFieldNames(), + colType.getAllStructFieldTypeInfos() + ); + + Type[] typesArray = groupFields.toArray(new Type[0]); + effectiveSchema = Types.buildGroup(groupFieldType.getRepetition()) + .addFields(typesArray) + .named(fieldType.getName() + ); + return effectiveSchema; + } + /** * Searchs column names by name on a given Parquet message schema, and returns its projected * Parquet schema types. diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java index e1bf8e2d0efdb313b16cb5363d0a0ed35936a0ed..aba181a5c243c4c088799bd53aec565ce58d7a53 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -51,6 +51,7 @@ public static final Text MAP_VALUE = new Text("value"); public static final Text MAP = new Text("map"); public static final Text ARRAY = new Text("bag"); + public static final Text LIST = new Text("list"); // default compression type for parquet output format private static final String DEFAULTCOMPRESSION = diff --git a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q index 193400ff91c3d402258b21922aba982507c82312..0a6debed2f6ee41495f8fec73c72d7ac7c0d7965 100644 --- a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q +++ b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q @@ -24,5 +24,19 @@ CREATE TABLE NewStructFieldTable STORED AS PARQUET AS SELECT * FROM NewStructFie DESCRIBE NewStructFieldTable; SELECT * FROM NewStructFieldTable; +-- test if the order of fields in array> changes, it works file + +DROP TABLE IF EXISTS schema_test; +CREATE TABLE schema_test (msg array>) STORED AS PARQUET; +INSERT INTO TABLE schema_test SELECT stack(2, array(named_struct('f1', 'abc', 'f2', 'abc2')), array(named_struct('f1', +'efg', 'f2', 'efg2'))) FROM NewStructField LIMIT 2; +SELECT * FROM schema_test; +set hive.metastore.disallow.incompatible.col.type.changes=false; +-- Order of fields swapped +ALTER TABLE schema_test CHANGE msg msg array>; +reset hive.metastore.disallow.incompatible.col.type.changes; +SELECT * FROM schema_test; + +DROP TABLE schema_test; DROP TABLE NewStructField; DROP TABLE NewStructFieldTable; diff --git a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out index 4b0711ed03240e9f02f391e03202452d0f80219d..d9e411bf4f14b0c8f715927c79d78bebcccf28b4 100644 --- a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out +++ b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out @@ -123,6 +123,71 @@ POSTHOOK: Input: default@newstructfieldtable {"a1":{"k1":"v1"},"a2":{"e1":5,"e2":null},"a3":null} NULL {"a1":{"k1":"v1"},"a2":{"e1":5,"e2":null},"a3":null} NULL {"a1":{"k1":"v1"},"a2":{"e1":5,"e2":null},"a3":null} NULL +PREHOOK: query: -- test if the order of fields in array> changes, it works file + +DROP TABLE IF EXISTS schema_test +PREHOOK: type: DROPTABLE +POSTHOOK: query: -- test if the order of fields in array> changes, it works file + +DROP TABLE IF EXISTS schema_test +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE schema_test (msg array>) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@schema_test +POSTHOOK: query: CREATE TABLE schema_test (msg array>) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@schema_test +PREHOOK: query: INSERT INTO TABLE schema_test SELECT stack(2, array(named_struct('f1', 'abc', 'f2', 'abc2')), array(named_struct('f1', +'efg', 'f2', 'efg2'))) FROM NewStructField LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@newstructfield +PREHOOK: Output: default@schema_test +POSTHOOK: query: INSERT INTO TABLE schema_test SELECT stack(2, array(named_struct('f1', 'abc', 'f2', 'abc2')), array(named_struct('f1', +'efg', 'f2', 'efg2'))) FROM NewStructField LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@newstructfield +POSTHOOK: Output: default@schema_test +POSTHOOK: Lineage: schema_test.msg SCRIPT [] +PREHOOK: query: SELECT * FROM schema_test +PREHOOK: type: QUERY +PREHOOK: Input: default@schema_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM schema_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@schema_test +#### A masked pattern was here #### +[{"f1":"efg","f2":"efg2"}] +[{"f1":"abc","f2":"abc2"}] +PREHOOK: query: -- Order of fields swapped +ALTER TABLE schema_test CHANGE msg msg array> +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@schema_test +PREHOOK: Output: default@schema_test +POSTHOOK: query: -- Order of fields swapped +ALTER TABLE schema_test CHANGE msg msg array> +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@schema_test +POSTHOOK: Output: default@schema_test +PREHOOK: query: SELECT * FROM schema_test +PREHOOK: type: QUERY +PREHOOK: Input: default@schema_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM schema_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@schema_test +#### A masked pattern was here #### +[{"f2":"efg2","f1":"efg"}] +[{"f2":"abc2","f1":"abc"}] +PREHOOK: query: DROP TABLE schema_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@schema_test +PREHOOK: Output: default@schema_test +POSTHOOK: query: DROP TABLE schema_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@schema_test +POSTHOOK: Output: default@schema_test PREHOOK: query: DROP TABLE NewStructField PREHOOK: type: DROPTABLE PREHOOK: Input: default@newstructfield diff --git a/ql/src/test/results/clientpositive/parquet_type_promotion.q.out b/ql/src/test/results/clientpositive/parquet_type_promotion.q.out index 55f9b27141c3b5f98c98ab54d959c4ff95465f67..91c3fff57de418db975af18b779bc46e95a15501 100644 --- a/ql/src/test/results/clientpositive/parquet_type_promotion.q.out +++ b/ql/src/test/results/clientpositive/parquet_type_promotion.q.out @@ -233,7 +233,7 @@ POSTHOOK: query: SELECT * FROM arrays_of_struct_to_map POSTHOOK: type: QUERY POSTHOOK: Input: default@arrays_of_struct_to_map #### A masked pattern was here #### -[{"c1":1}] [{"f2":77}] +[{"c1":1}] [{"f2":88}] PREHOOK: query: -- Testing schema evolution of adding columns into array> ALTER TABLE arrays_of_struct_to_map REPLACE COLUMNS (locations1 array>, locations2 array>)