diff --git data/files/parquet_array_null_element.txt data/files/parquet_array_null_element.txt new file mode 100644 index 0000000..14046dc --- /dev/null +++ data/files/parquet_array_null_element.txt @@ -0,0 +1,3 @@ +1|,7|CARRELAGE,MOQUETTE|key11:value11,key12:value12,key13:value13 +2|,|CAILLEBOTIS,| +3|,42,||key11:value11,key12:,key13: diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java index 582a5df..052b36d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java @@ -54,6 +54,7 @@ public void start() { if (isMap) { mapPairContainer = new Writable[2]; } + currentValue = null; } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java index b689336..9f2b4aa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -166,7 +166,7 @@ private ArrayWritable createStruct(final Object obj, final StructObjectInspector return new ArrayWritable(Writable.class, arr); } - private Writable createMap(final Object obj, final MapObjectInspector inspector) + private ArrayWritable createMap(final Object obj, final MapObjectInspector inspector) throws SerDeException { final Map sourceMap = inspector.getMap(obj); final ObjectInspector keyInspector = inspector.getMapKeyObjectInspector(); @@ -178,16 +178,12 @@ private Writable createMap(final Object obj, final MapObjectInspector inspector) final Writable key = createObject(keyValue.getKey(), keyInspector); final Writable value = createObject(keyValue.getValue(), valueInspector); if (key != null) { - Writable[] arr = new Writable[2]; - arr[0] = key; - arr[1] = value; - array.add(new ArrayWritable(Writable.class, arr)); + array.add(new ArrayWritable(Writable.class, new Writable[] {key, value})); } } } if (array.size() > 0) { - final ArrayWritable subArray = new ArrayWritable(ArrayWritable.class, - array.toArray(new ArrayWritable[array.size()])); + final ArrayWritable subArray = new ArrayWritable(ArrayWritable.class, array.toArray(new ArrayWritable[array.size()])); return new ArrayWritable(Writable.class, new Writable[] {subArray}); } else { return null; @@ -198,18 +194,14 @@ private ArrayWritable createArray(final Object obj, final ListObjectInspector in throws SerDeException { final List sourceArray = inspector.getList(obj); final ObjectInspector subInspector = inspector.getListElementObjectInspector(); - final List array = new ArrayList(); + final List array = new ArrayList(); if (sourceArray != null) { for (final Object curObj : sourceArray) { - final Writable newObj = createObject(curObj, subInspector); - if (newObj != null) { - array.add(newObj); - } + array.add(new ArrayWritable(Writable.class, new Writable[] {createObject(curObj, subInspector)})); } } if (array.size() > 0) { - final ArrayWritable subArray = new ArrayWritable(array.get(0).getClass(), - array.toArray(new Writable[array.size()])); + final ArrayWritable subArray = new ArrayWritable(ArrayWritable.class, array.toArray(new ArrayWritable[array.size()])); return new ArrayWritable(Writable.class, new Writable[] {subArray}); } else { return null; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java index a98f6be..a3e3fcc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java @@ -74,18 +74,17 @@ private void writeData(final ArrayWritable arr, final GroupType type) { if (fieldType.isPrimitive()) { writePrimitive(value); } else { - recordConsumer.startGroup(); if (value instanceof ArrayWritable) { if (fieldType.asGroupType().getRepetition().equals(Type.Repetition.REPEATED)) { writeArray((ArrayWritable) value, fieldType.asGroupType()); } else { + recordConsumer.startGroup(); writeData((ArrayWritable) value, fieldType.asGroupType()); + recordConsumer.endGroup(); } } else if (value != null) { throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + value); } - - recordConsumer.endGroup(); } recordConsumer.endField(fieldName, field); @@ -96,32 +95,43 @@ private void writeArray(final ArrayWritable array, final GroupType type) { if (array == null) { return; } - final Writable[] subValues = array.get(); + final Writable[] values = array.get(); final int fieldCount = type.getFieldCount(); - for (int field = 0; field < fieldCount; ++field) { - final Type subType = type.getType(field); - recordConsumer.startField(subType.getName(), field); - for (int i = 0; i < subValues.length; ++i) { - final Writable subValue = subValues[i]; - if (subValue != null) { - if (subType.isPrimitive()) { - if (subValue instanceof ArrayWritable) { - writePrimitive(((ArrayWritable) subValue).get()[field]);// 0 ? - } else { - writePrimitive(subValue); - } - } else { - if (!(subValue instanceof ArrayWritable)) { - throw new RuntimeException("This should be a ArrayWritable: " + subValue); - } else { - recordConsumer.startGroup(); - writeData((ArrayWritable) subValue, subType.asGroupType()); - recordConsumer.endGroup(); + + for (int i = 0; i < values.length; ++i) { + recordConsumer.startGroup(); + Writable value = values[i]; // ArrayWritable of 1 elem if list, or of 2 elems if map + + if (!(value instanceof ArrayWritable)) { + throw new ParquetEncodingException("This should be an ArrayWritable but was: " + value); + } + + for (int field = 0; field < fieldCount; ++field) { + Type subType = type.getType(field); // either array element, map key or map value + + Writable subValue = ((ArrayWritable) value).get()[field]; // either array element, map key or map value + + if (subValue != null) { + recordConsumer.startField(subType.getName(), field); + if (subType.isPrimitive()) { + writePrimitive(subValue); + } else { + recordConsumer.startGroup(); + if (subValue instanceof ArrayWritable) { + if (subType.asGroupType().getRepetition().equals(Type.Repetition.REPEATED)) { + throw new ParquetEncodingException("REPEATED level can not be found here."); + } else { + writeData((ArrayWritable) subValue, subType.asGroupType()); + } + } else if (subValue != null) { + throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + subValue); + } + recordConsumer.endGroup(); + } + recordConsumer.endField(subType.getName(), field); } - } } - } - recordConsumer.endField(subType.getName(), field); + recordConsumer.endGroup(); } } diff --git ql/src/test/queries/clientpositive/parquet_array_null_element.q ql/src/test/queries/clientpositive/parquet_array_null_element.q new file mode 100644 index 0000000..c14aafd --- /dev/null +++ ql/src/test/queries/clientpositive/parquet_array_null_element.q @@ -0,0 +1,33 @@ +DROP TABLE parquet_array_null_element_staging; +DROP TABLE parquet_array_null_element; + +CREATE TABLE parquet_array_null_element_staging ( + id int, + lstint ARRAY, + lststr ARRAY, + mp MAP +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +NULL DEFINED AS ''; + +CREATE TABLE parquet_array_null_element ( + id int, + lstint ARRAY, + lststr ARRAY, + mp MAP +) STORED AS PARQUET; + +DESCRIBE FORMATTED parquet_array_null_element; + +LOAD DATA LOCAL INPATH '../../data/files/parquet_array_null_element.txt' OVERWRITE INTO TABLE parquet_array_null_element_staging; + +SELECT * FROM parquet_array_null_element_staging; + +INSERT OVERWRITE TABLE parquet_array_null_element SELECT * FROM parquet_array_null_element_staging; + +SELECT lstint from parquet_array_null_element; +SELECT lststr from parquet_array_null_element; +SELECT mp from parquet_array_null_element; +SELECT * FROM parquet_array_null_element; \ No newline at end of file diff --git ql/src/test/results/clientpositive/parquet_array_null_element.q.out ql/src/test/results/clientpositive/parquet_array_null_element.q.out new file mode 100644 index 0000000..fe07035 --- /dev/null +++ ql/src/test/results/clientpositive/parquet_array_null_element.q.out @@ -0,0 +1,174 @@ +PREHOOK: query: DROP TABLE parquet_array_null_element_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_array_null_element_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE parquet_array_null_element +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_array_null_element +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_array_null_element_staging ( + id int, + lstint ARRAY, + lststr ARRAY, + mp MAP +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +NULL DEFINED AS '' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE parquet_array_null_element_staging ( + id int, + lstint ARRAY, + lststr ARRAY, + mp MAP +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +NULL DEFINED AS '' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_array_null_element_staging +PREHOOK: query: CREATE TABLE parquet_array_null_element ( + id int, + lstint ARRAY, + lststr ARRAY, + mp MAP +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE parquet_array_null_element ( + id int, + lstint ARRAY, + lststr ARRAY, + mp MAP +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_array_null_element +PREHOOK: query: DESCRIBE FORMATTED parquet_array_null_element +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@parquet_array_null_element +POSTHOOK: query: DESCRIBE FORMATTED parquet_array_null_element +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@parquet_array_null_element +# col_name data_type comment + +id int +lstint array +lststr array +mp map + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe +InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_array_null_element.txt' OVERWRITE INTO TABLE parquet_array_null_element_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_array_null_element_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_array_null_element.txt' OVERWRITE INTO TABLE parquet_array_null_element_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_array_null_element_staging +PREHOOK: query: SELECT * FROM parquet_array_null_element_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_array_null_element_staging +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_array_null_element_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_array_null_element_staging +#### A masked pattern was here #### +1 [null,7] ["CARRELAGE","MOQUETTE"] {"key11":"value11","key12":"value12","key13":"value13"} +2 [null,null] ["CAILLEBOTIS",null] NULL +3 [null,42,null] NULL {"key11":"value11","key12":null,"key13":null} +PREHOOK: query: INSERT OVERWRITE TABLE parquet_array_null_element SELECT * FROM parquet_array_null_element_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_array_null_element_staging +PREHOOK: Output: default@parquet_array_null_element +POSTHOOK: query: INSERT OVERWRITE TABLE parquet_array_null_element SELECT * FROM parquet_array_null_element_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_array_null_element_staging +POSTHOOK: Output: default@parquet_array_null_element +POSTHOOK: Lineage: parquet_array_null_element.id SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lstint SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lstint, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lststr SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lststr, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.mp SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:mp, type:map, comment:null), ] +PREHOOK: query: SELECT lstint from parquet_array_null_element +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: query: SELECT lstint from parquet_array_null_element +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_array_null_element.id SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lstint SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lstint, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lststr SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lststr, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.mp SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:mp, type:map, comment:null), ] +[null,7] +[null,null] +[null,42,null] +PREHOOK: query: SELECT lststr from parquet_array_null_element +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: query: SELECT lststr from parquet_array_null_element +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_array_null_element.id SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lstint SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lstint, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lststr SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lststr, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.mp SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:mp, type:map, comment:null), ] +["CARRELAGE","MOQUETTE"] +["CAILLEBOTIS",null] +NULL +PREHOOK: query: SELECT mp from parquet_array_null_element +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: query: SELECT mp from parquet_array_null_element +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_array_null_element.id SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lstint SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lstint, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lststr SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lststr, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.mp SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:mp, type:map, comment:null), ] +{"key12":"value12","key11":"value11","key13":"value13"} +NULL +{"key12":null,"key11":"value11","key13":null} +PREHOOK: query: SELECT * FROM parquet_array_null_element +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_array_null_element +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_array_null_element +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_array_null_element.id SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lstint SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lstint, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.lststr SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:lststr, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_array_null_element.mp SIMPLE [(parquet_array_null_element_staging)parquet_array_null_element_staging.FieldSchema(name:mp, type:map, comment:null), ] +1 [null,7] ["CARRELAGE","MOQUETTE"] {"key12":"value12","key11":"value11","key13":"value13"} +2 [null,null] ["CAILLEBOTIS",null] NULL +3 [null,42,null] NULL {"key12":null,"key11":"value11","key13":null}