diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
index bb02bab..2a7fdf9 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
@@ -136,6 +136,13 @@ public Object getStructFieldData(final Object data, final StructField fieldRef)
       return arr.get()[((StructFieldImpl) fieldRef).getIndex()];
     }
 
+    // Since setStructFieldData and create return a List, getStructFieldData should also be able
+    // to handle List data. This is required when the table SerDe is ParquetHiveSerDe and the
+    // partition SerDe is something else.
+    if (data instanceof List) {
+      return ((List) data).get(((StructFieldImpl) fieldRef).getIndex());
+    }
+
     throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
   }
 
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java
index 03e8369..26a6d22 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java
@@ -14,8 +14,7 @@
 package org.apache.hadoop.hive.ql.io.parquet.serde.primitive;
 
 import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaStringObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableStringObjectInspector;
 import org.apache.hadoop.io.Text;
 
@@ -25,10 +24,10 @@
  * The ParquetStringInspector inspects a BinaryWritable to give a Text or String.
  *
  */
-public class ParquetStringInspector extends AbstractPrimitiveJavaObjectInspector implements SettableStringObjectInspector {
+public class ParquetStringInspector extends JavaStringObjectInspector implements SettableStringObjectInspector {
 
   ParquetStringInspector() {
-    super(TypeInfoFactory.stringTypeInfo);
+    super();
   }
 
   @Override
diff --git ql/src/test/queries/clientpositive/parquet_serde.q ql/src/test/queries/clientpositive/parquet_serde.q
new file mode 100644
index 0000000..ce1d7c2
--- /dev/null
+++ ql/src/test/queries/clientpositive/parquet_serde.q
@@ -0,0 +1,30 @@
+DROP TABLE if exists parquet_mixed_fileformat;
+
+CREATE TABLE parquet_mixed_fileformat (
+  id int,
+  str string,
+  part string
+) PARTITIONED BY (dateint int)
+  ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|';
+
+---- partition dateint=20140330 is stored as TEXTFILE
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_mixed_fileformat PARTITION (dateint=20140330);
+
+SELECT * FROM parquet_mixed_fileformat;
+
+DESCRIBE FORMATTED parquet_mixed_fileformat PARTITION (dateint=20140330);
+
+---change table serde and file format to PARQUET----
+
+ALTER TABLE parquet_mixed_fileformat set SERDE 'parquet.hive.serde.ParquetHiveSerDe';
+ALTER TABLE parquet_mixed_fileformat
+  SET FILEFORMAT
+  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
+  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat';
+
+DESCRIBE FORMATTED parquet_mixed_fileformat;
+DESCRIBE FORMATTED parquet_mixed_fileformat PARTITION (dateint=20140330);
+
+SELECT * FROM parquet_mixed_fileformat;
diff --git ql/src/test/results/clientpositive/parquet_serde.q.out ql/src/test/results/clientpositive/parquet_serde.q.out
new file mode 100644
index 0000000..b1e0e8c
--- /dev/null
+++ ql/src/test/results/clientpositive/parquet_serde.q.out
@@ -0,0 +1,212 @@
+PREHOOK: query: DROP TABLE if exists parquet_mixed_fileformat
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE if exists parquet_mixed_fileformat
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_mixed_fileformat (
+  id int,
+  str string,
+  part string
+) PARTITIONED BY (dateint int)
+  ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE parquet_mixed_fileformat (
+  id int,
+  str string,
+  part string
+) PARTITIONED BY (dateint int)
+  ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_mixed_fileformat
+PREHOOK: query: ---- partition dateint=20140330 is stored as TEXTFILE
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_mixed_fileformat PARTITION (dateint=20140330)
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_mixed_fileformat
+POSTHOOK: query: ---- partition dateint=20140330 is stored as TEXTFILE
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_mixed_fileformat PARTITION (dateint=20140330)
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_mixed_fileformat
+POSTHOOK: Output: default@parquet_mixed_fileformat@dateint=20140330
+PREHOOK: query: SELECT * FROM parquet_mixed_fileformat
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_mixed_fileformat
+PREHOOK: Input: default@parquet_mixed_fileformat@dateint=20140330
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM parquet_mixed_fileformat
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_mixed_fileformat
+POSTHOOK: Input: default@parquet_mixed_fileformat@dateint=20140330
+#### A masked pattern was here ####
+1    foo    part1    20140330
+2    bar    part2    20140330
+3    baz    part2    20140330
+PREHOOK: query: DESCRIBE FORMATTED parquet_mixed_fileformat PARTITION (dateint=20140330)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@parquet_mixed_fileformat
+POSTHOOK: query: DESCRIBE FORMATTED parquet_mixed_fileformat PARTITION (dateint=20140330)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@parquet_mixed_fileformat
+# col_name    data_type    comment
+
+id    int
+str    string
+part    string
+
+# Partition Information
+# col_name    data_type    comment
+
+dateint    int
+
+# Detailed Partition Information
+Partition Value:    [20140330]
+Database:    default
+Table:    parquet_mixed_fileformat
+#### A masked pattern was here ####
+Protect Mode:    None
+#### A masked pattern was here ####
+Partition Parameters:
+    COLUMN_STATS_ACCURATE    true
+    numFiles    1
+    numRows    0
+    rawDataSize    0
+    totalSize    36
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:    org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:    org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:    org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed:    No
+Num Buckets:    -1
+Bucket Columns:    []
+Sort Columns:    []
+Storage Desc Params:
+    field.delim    |
+    serialization.format    |
+PREHOOK: query: ---change table serde and file format to PARQUET----
+
+ALTER TABLE parquet_mixed_fileformat set SERDE 'parquet.hive.serde.ParquetHiveSerDe'
+PREHOOK: type: ALTERTABLE_SERIALIZER
+PREHOOK: Input: default@parquet_mixed_fileformat
+PREHOOK: Output: default@parquet_mixed_fileformat
+POSTHOOK: query: ---change table serde and file format to PARQUET----
+
+ALTER TABLE parquet_mixed_fileformat set SERDE 'parquet.hive.serde.ParquetHiveSerDe'
+POSTHOOK: type: ALTERTABLE_SERIALIZER
+POSTHOOK: Input: default@parquet_mixed_fileformat
+POSTHOOK: Output: default@parquet_mixed_fileformat
+PREHOOK: query: ALTER TABLE parquet_mixed_fileformat
+  SET FILEFORMAT
+  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
+  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
+PREHOOK: type: ALTERTABLE_FILEFORMAT
+PREHOOK: Input: default@parquet_mixed_fileformat
+PREHOOK: Output: default@parquet_mixed_fileformat
+POSTHOOK: query: ALTER TABLE parquet_mixed_fileformat
+  SET FILEFORMAT
+  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
+  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
+POSTHOOK: type: ALTERTABLE_FILEFORMAT
+POSTHOOK: Input: default@parquet_mixed_fileformat
+POSTHOOK: Output: default@parquet_mixed_fileformat
+PREHOOK: query: DESCRIBE FORMATTED parquet_mixed_fileformat
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@parquet_mixed_fileformat
+POSTHOOK: query: DESCRIBE FORMATTED parquet_mixed_fileformat
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@parquet_mixed_fileformat
+# col_name    data_type    comment
+
+id    int
+str    string
+part    string
+
+# Partition Information
+# col_name    data_type    comment
+
+dateint    int
+
+# Detailed Table Information
+Database:    default
+#### A masked pattern was here ####
+Protect Mode:    None
+Retention:    0
+#### A masked pattern was here ####
+Table Type:    MANAGED_TABLE
+Table Parameters:
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:    parquet.hive.serde.ParquetHiveSerDe
+InputFormat:    parquet.hive.DeprecatedParquetInputFormat
+OutputFormat:    parquet.hive.DeprecatedParquetOutputFormat
+Compressed:    No
+Num Buckets:    -1
+Bucket Columns:    []
+Sort Columns:    []
+Storage Desc Params:
+    field.delim    |
+    serialization.format    |
+PREHOOK: query: DESCRIBE FORMATTED parquet_mixed_fileformat PARTITION (dateint=20140330)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@parquet_mixed_fileformat
+POSTHOOK: query: DESCRIBE FORMATTED parquet_mixed_fileformat PARTITION (dateint=20140330)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@parquet_mixed_fileformat
+# col_name    data_type    comment
+
+id    int
+str    string
+part    string
+
+# Partition Information
+# col_name    data_type    comment
+
+dateint    int
+
+# Detailed Partition Information
+Partition Value:    [20140330]
+Database:    default
+Table:    parquet_mixed_fileformat
+#### A masked pattern was here ####
+Protect Mode:    None
+#### A masked pattern was here ####
+Partition Parameters:
+    COLUMN_STATS_ACCURATE    true
+    numFiles    1
+    numRows    0
+    rawDataSize    0
+    totalSize    36
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:    org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:    org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:    org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed:    No
+Num Buckets:    -1
+Bucket Columns:    []
+Sort Columns:    []
+Storage Desc Params:
+    field.delim    |
+    serialization.format    |
+PREHOOK: query: SELECT * FROM parquet_mixed_fileformat
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_mixed_fileformat
+PREHOOK: Input: default@parquet_mixed_fileformat@dateint=20140330
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM parquet_mixed_fileformat
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_mixed_fileformat
+POSTHOOK: Input: default@parquet_mixed_fileformat@dateint=20140330
+#### A masked pattern was here ####
+1    foo    part1    20140330
+2    bar    part2    20140330
+3    baz    part2    20140330
diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java
index e4c274f..2549422 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java
@@ -27,7 +27,7 @@
     AbstractPrimitiveJavaObjectInspector implements
     SettableStringObjectInspector {
 
-  JavaStringObjectInspector() {
+  protected JavaStringObjectInspector() {
     super(TypeInfoFactory.stringTypeInfo);
   }
 
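
For context, a minimal sketch of the code path the List branch covers, assuming the public ArrayWritableObjectInspector(StructTypeInfo) constructor and the create/setStructFieldData methods referenced in the patch comment; the MixedSerdeSketch class, the struct<id:int,str:string,part:string> type string, and the sample Text value are illustrative only and not part of the patch:

import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Text;

public class MixedSerdeSketch {
  public static void main(String[] args) {
    // Table-level inspector for the same schema the new q-file uses.
    StructTypeInfo rowType = (StructTypeInfo) TypeInfoUtils
        .getTypeInfoFromTypeString("struct<id:int,str:string,part:string>");
    ArrayWritableObjectInspector tableOI = new ArrayWritableObjectInspector(rowType);

    // Rows read through a non-Parquet partition SerDe are converted via the settable
    // API of the table-level inspector, which builds a java.util.List rather than an
    // ArrayWritable (see the comment added in the patch).
    Object row = tableOI.create();                    // an ArrayList, not an ArrayWritable
    StructField strField = tableOI.getStructFieldRef("str");
    tableOI.setStructFieldData(row, strField, new Text("foo"));

    // Before this patch the next call threw
    // UnsupportedOperationException("Cannot inspect java.util.ArrayList");
    // with the List branch it returns the stored value.
    System.out.println(tableOI.getStructFieldData(row, strField));
  }
}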