diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java index b689336..82816f7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -36,14 +36,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; @@ -55,7 +47,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; - +import parquet.io.ParquetEncodingException; import parquet.io.api.Binary; /** @@ -125,6 +117,16 @@ public Object deserialize(final Writable blob) throws SerDeException { deserializedSize = 0; if (blob instanceof ArrayWritable) { deserializedSize = ((ArrayWritable) blob).get().length; + //in order to support changing column type, we would have to + //inspect each element of array writable. + Writable[] values = ((ArrayWritable) blob).get(); + final List fields = ((StructObjectInspector)objInspector).getAllStructFieldRefs(); + for (int i=0; i:array"); + serDe.initialize(conf, tbl); - final ArrayWritable arrWritable = new ArrayWritable(Writable.class, arr); // Test deserializeAndSerializeLazySimple(serDe, arrWritable); - System.out.println("test: testParquetHiveSerDe - OK"); + } catch (final Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testParquetHiveSerDe3() throws Throwable { + try { + final ParquetHiveSerDe serDe = new ParquetHiveSerDe(); + final Configuration conf = new Configuration(); + final Properties tbl = createProperties(); + //change the string type to be int type, after deserialize, the column would contain null value + //change the bigint type to be int type, after deserialize, the column contains the same numerical value + tbl.setProperty("columns.types", "tinyint:smallint:int:int:double:int:map:array"); + serDe.initialize(conf, tbl); + + // Test + final Object row = serDe.deserialize(arrWritable); + final Writable[] deserized = ((ArrayWritable)row).get(); + assertNull(deserized[5]); + assertEquals(arr[3], deserized[3]); } catch (final Throwable e) { e.printStackTrace(); diff --git ql/src/test/queries/clientpositive/parquet_change_type.q ql/src/test/queries/clientpositive/parquet_change_type.q new file mode 100644 index 0000000..cd7b445 --- /dev/null +++ ql/src/test/queries/clientpositive/parquet_change_type.q @@ -0,0 +1,35 @@ +DROP TABLE if exists parquet_change_type_staging; +DROP TABLE if exists parquet_change_type; + +CREATE TABLE parquet_change_type_staging ( + id int, + str string, + part string +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|'; + +CREATE TABLE parquet_change_type ( + id int, + str string, + part string +) +STORED AS PARQUET; + +--- load staging table + +LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_change_type_staging; + +SELECT * FROM parquet_change_type_staging; + +--- populate parquet hive table + +INSERT OVERWRITE TABLE parquet_change_type select * FROM parquet_change_type_staging; + +SELECT * FROM parquet_change_type; + +--- change column id from int to bigint + +ALTER TABLE parquet_change_type CHANGE id id bigint; + +SELECT * FROM parquet_change_type; diff --git ql/src/test/results/clientpositive/parquet_change_type.q.out ql/src/test/results/clientpositive/parquet_change_type.q.out new file mode 100644 index 0000000..cf5d4fc --- /dev/null +++ ql/src/test/results/clientpositive/parquet_change_type.q.out @@ -0,0 +1,125 @@ +PREHOOK: query: DROP TABLE if exists parquet_change_type_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE if exists parquet_change_type_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE if exists parquet_change_type +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE if exists parquet_change_type +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_change_type_staging ( + id int, + str string, + part string +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE parquet_change_type_staging ( + id int, + str string, + part string +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_change_type_staging +PREHOOK: query: CREATE TABLE parquet_change_type ( + id int, + str string, + part string +) +STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE parquet_change_type ( + id int, + str string, + part string +) +STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_change_type +PREHOOK: query: --- load staging table + +LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_change_type_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_change_type_staging +POSTHOOK: query: --- load staging table + +LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_change_type_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_change_type_staging +PREHOOK: query: SELECT * FROM parquet_change_type_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_change_type_staging +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_change_type_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_change_type_staging +#### A masked pattern was here #### +1 foo part1 +2 bar part2 +3 baz part2 +PREHOOK: query: --- populate parquet hive table + +INSERT OVERWRITE TABLE parquet_change_type select * FROM parquet_change_type_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_change_type_staging +PREHOOK: Output: default@parquet_change_type +POSTHOOK: query: --- populate parquet hive table + +INSERT OVERWRITE TABLE parquet_change_type select * FROM parquet_change_type_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_change_type_staging +POSTHOOK: Output: default@parquet_change_type +POSTHOOK: Lineage: parquet_change_type.id SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.part SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:part, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.str SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:str, type:string, comment:null), ] +PREHOOK: query: SELECT * FROM parquet_change_type +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_change_type +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_change_type +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_change_type +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_change_type.id SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.part SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:part, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.str SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:str, type:string, comment:null), ] +1 foo part1 +2 bar part2 +3 baz part2 +PREHOOK: query: --- change column id from int to bigint + +ALTER TABLE parquet_change_type CHANGE id id bigint +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@parquet_change_type +PREHOOK: Output: default@parquet_change_type +POSTHOOK: query: --- change column id from int to bigint + +ALTER TABLE parquet_change_type CHANGE id id bigint +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@parquet_change_type +POSTHOOK: Output: default@parquet_change_type +POSTHOOK: Lineage: parquet_change_type.id SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.part SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:part, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.str SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:str, type:string, comment:null), ] +PREHOOK: query: SELECT * FROM parquet_change_type +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_change_type +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_change_type +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_change_type +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_change_type.id SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.part SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:part, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_change_type.str SIMPLE [(parquet_change_type_staging)parquet_change_type_staging.FieldSchema(name:str, type:string, comment:null), ] +1 foo part1 +2 bar part2 +3 baz part2