Index: data/files/csv.txt =================================================================== --- data/files/csv.txt (revision 0) +++ data/files/csv.txt (revision 0) @@ -0,0 +1,18 @@ +why hello there,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +another record,98,4,101,9999999,false,99.89,0.00000009,beta,Earth#101,1134:false:wazzup,RED,\N,6:7:8:9:10,54:55:56 +third record,45,5,102,999999999,true,89.99,0.00000000000009,alpha:gamma,Earth#237:Bob#723,102:false:BNL,GREEN,\N,11:12:13,57:58:59 +\N,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,\N,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,\N,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,\N,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,\N,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,\N,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,true,\N,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,true,42.43,\N,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,true,42.43,85.23423424,\N,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,\N,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,50:51:53 
+string,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,\N,BLUE,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,\N,72,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,\N,0:1:2:3:4:5,50:51:53 +string,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,\N,50:51:53 +string,42,3,100,1412341,true,42.43,85.23423424,alpha:beta:gamma,Earth#42:Control#86:Bob#31,17:true:Abe Linkedin,BLUE,72,0:1:2:3:4:5,\N Index: ql/src/test/queries/clientpositive/avro_nullable_fields.q =================================================================== --- ql/src/test/queries/clientpositive/avro_nullable_fields.q (revision 0) +++ ql/src/test/queries/clientpositive/avro_nullable_fields.q (revision 0) @@ -0,0 +1,59 @@ +-- Verify that nullable fields properly work +CREATE TABLE test_serializer(string1 STRING, + int1 INT, + tinyint1 TINYINT, + smallint1 SMALLINT, + bigint1 BIGINT, + boolean1 BOOLEAN, + float1 FLOAT, + double1 DOUBLE, + list1 ARRAY, + map1 MAP, + struct1 STRUCT, + enum1 STRING, + nullableint INT, + bytes1 ARRAY, + fixed1 ARRAY) + ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY ':' MAP KEYS TERMINATED BY '#' LINES TERMINATED BY '\n' + STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/csv.txt' INTO TABLE test_serializer; + +CREATE TABLE as_avro + ROW FORMAT + SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' + TBLPROPERTIES ( + 'avro.schema.literal'='{ + "namespace": "com.howdy", + "name": "some_schema", + "type": "record", + "fields": [ + { "name": "string1", "type": ["null", "string"] }, + { "name": 
"int1", "type": ["null", "int"] }, + { "name": "tinyint1", "type": ["null", "int"] }, + { "name": "smallint1", "type": ["null", "int"] }, + { "name": "bigint1", "type": ["null", "long"] }, + { "name": "boolean1", "type": ["null", "boolean"] }, + { "name": "float1", "type": ["null", "float"] }, + { "name": "double1", "type": ["null", "double"] }, + { "name": "list1", "type": ["null", {"type": "array", "items": "string"}] }, + { "name": "map1", "type": ["null", {"type": "map", "values": "int"}] }, + { "name": "struct1", "type": ["null", {"type": "record", "name": "struct1_name", "fields": [ + { "name": "sInt", "type": "int" }, + { "name": "sBoolean", "type": "boolean" }, + { "name": "sString", "type": "string" } + ]}] }, + { "name": "enum1", "type": ["null", {"type": "enum", "name": "enum1_values", "symbols": ["BLUE", "RED", "GREEN"]}] }, + { "name": "nullableint", "type": ["null", "int"] }, + { "name": "bytes1", "type": ["null", "bytes"] }, + { "name": "fixed1", "type": ["null", {"type": "fixed", "name": "threebytes", "size": 3}] } + ] + }' + ) +; + +INSERT OVERWRITE TABLE as_avro SELECT * FROM test_serializer; +SELECT * FROM as_avro; Index: ql/src/test/results/clientpositive/avro_nullable_fields.q.out =================================================================== --- ql/src/test/results/clientpositive/avro_nullable_fields.q.out (revision 0) +++ ql/src/test/results/clientpositive/avro_nullable_fields.q.out (revision 0) @@ -0,0 +1,180 @@ +PREHOOK: query: -- Verify that nullable fields properly work +CREATE TABLE test_serializer(string1 STRING, + int1 INT, + tinyint1 TINYINT, + smallint1 SMALLINT, + bigint1 BIGINT, + boolean1 BOOLEAN, + float1 FLOAT, + double1 DOUBLE, + list1 ARRAY, + map1 MAP, + struct1 STRUCT, + enum1 STRING, + nullableint INT, + bytes1 ARRAY, + fixed1 ARRAY) + ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY ':' MAP KEYS TERMINATED BY '#' LINES TERMINATED BY '\n' + STORED AS TEXTFILE +PREHOOK: type: CREATETABLE 
+POSTHOOK: query: -- Verify that nullable fields properly work +CREATE TABLE test_serializer(string1 STRING, + int1 INT, + tinyint1 TINYINT, + smallint1 SMALLINT, + bigint1 BIGINT, + boolean1 BOOLEAN, + float1 FLOAT, + double1 DOUBLE, + list1 ARRAY, + map1 MAP, + struct1 STRUCT, + enum1 STRING, + nullableint INT, + bytes1 ARRAY, + fixed1 ARRAY) + ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY ':' MAP KEYS TERMINATED BY '#' LINES TERMINATED BY '\n' + STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_serializer +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/csv.txt' INTO TABLE test_serializer +PREHOOK: type: LOAD +PREHOOK: Output: default@test_serializer +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/csv.txt' INTO TABLE test_serializer +POSTHOOK: type: LOAD +POSTHOOK: Output: default@test_serializer +PREHOOK: query: CREATE TABLE as_avro + ROW FORMAT + SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' + TBLPROPERTIES ( + 'avro.schema.literal'='{ + "namespace": "com.howdy", + "name": "some_schema", + "type": "record", + "fields": [ + { "name": "string1", "type": ["null", "string"] }, + { "name": "int1", "type": ["null", "int"] }, + { "name": "tinyint1", "type": ["null", "int"] }, + { "name": "smallint1", "type": ["null", "int"] }, + { "name": "bigint1", "type": ["null", "long"] }, + { "name": "boolean1", "type": ["null", "boolean"] }, + { "name": "float1", "type": ["null", "float"] }, + { "name": "double1", "type": ["null", "double"] }, + { "name": "list1", "type": ["null", {"type": "array", "items": "string"}] }, + { "name": "map1", "type": ["null", {"type": "map", "values": "int"}] }, + { "name": "struct1", "type": ["null", {"type": "record", "name": "struct1_name", "fields": [ + { "name": "sInt", "type": "int" }, + { "name": 
"sBoolean", "type": "boolean" }, + { "name": "sString", "type": "string" } + ]}] }, + { "name": "enum1", "type": ["null", {"type": "enum", "name": "enum1_values", "symbols": ["BLUE", "RED", "GREEN"]}] }, + { "name": "nullableint", "type": ["null", "int"] }, + { "name": "bytes1", "type": ["null", "bytes"] }, + { "name": "fixed1", "type": ["null", {"type": "fixed", "name": "threebytes", "size": 3}] } + ] + }' + ) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE as_avro + ROW FORMAT + SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' + TBLPROPERTIES ( + 'avro.schema.literal'='{ + "namespace": "com.howdy", + "name": "some_schema", + "type": "record", + "fields": [ + { "name": "string1", "type": ["null", "string"] }, + { "name": "int1", "type": ["null", "int"] }, + { "name": "tinyint1", "type": ["null", "int"] }, + { "name": "smallint1", "type": ["null", "int"] }, + { "name": "bigint1", "type": ["null", "long"] }, + { "name": "boolean1", "type": ["null", "boolean"] }, + { "name": "float1", "type": ["null", "float"] }, + { "name": "double1", "type": ["null", "double"] }, + { "name": "list1", "type": ["null", {"type": "array", "items": "string"}] }, + { "name": "map1", "type": ["null", {"type": "map", "values": "int"}] }, + { "name": "struct1", "type": ["null", {"type": "record", "name": "struct1_name", "fields": [ + { "name": "sInt", "type": "int" }, + { "name": "sBoolean", "type": "boolean" }, + { "name": "sString", "type": "string" } + ]}] }, + { "name": "enum1", "type": ["null", {"type": "enum", "name": "enum1_values", "symbols": ["BLUE", "RED", "GREEN"]}] }, + { "name": "nullableint", "type": ["null", "int"] }, + { "name": "bytes1", "type": ["null", "bytes"] }, + { "name": "fixed1", "type": ["null", {"type": "fixed", "name": "threebytes", "size": 3}] } + ] + }' + ) +POSTHOOK: type: CREATETABLE 
+POSTHOOK: Output: default@as_avro +PREHOOK: query: INSERT OVERWRITE TABLE as_avro SELECT * FROM test_serializer +PREHOOK: type: QUERY +PREHOOK: Input: default@test_serializer +PREHOOK: Output: default@as_avro +POSTHOOK: query: INSERT OVERWRITE TABLE as_avro SELECT * FROM test_serializer +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_serializer +POSTHOOK: Output: default@as_avro +POSTHOOK: Lineage: as_avro.bigint1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:bigint1, type:bigint, comment:null), ] +POSTHOOK: Lineage: as_avro.boolean1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:boolean1, type:boolean, comment:null), ] +POSTHOOK: Lineage: as_avro.bytes1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:bytes1, type:array, comment:null), ] +POSTHOOK: Lineage: as_avro.double1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:double1, type:double, comment:null), ] +POSTHOOK: Lineage: as_avro.enum1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:enum1, type:string, comment:null), ] +POSTHOOK: Lineage: as_avro.fixed1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:fixed1, type:array, comment:null), ] +POSTHOOK: Lineage: as_avro.float1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:float1, type:float, comment:null), ] +POSTHOOK: Lineage: as_avro.int1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:int1, type:int, comment:null), ] +POSTHOOK: Lineage: as_avro.list1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:list1, type:array, comment:null), ] +POSTHOOK: Lineage: as_avro.map1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:map1, type:map, comment:null), ] +POSTHOOK: Lineage: as_avro.nullableint SIMPLE [(test_serializer)test_serializer.FieldSchema(name:nullableint, type:int, comment:null), ] +POSTHOOK: Lineage: as_avro.smallint1 EXPRESSION [(test_serializer)test_serializer.FieldSchema(name:smallint1, type:smallint, comment:null), ] +POSTHOOK: Lineage: 
as_avro.string1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: as_avro.struct1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:struct1, type:struct, comment:null), ] +POSTHOOK: Lineage: as_avro.tinyint1 EXPRESSION [(test_serializer)test_serializer.FieldSchema(name:tinyint1, type:tinyint, comment:null), ] +PREHOOK: query: SELECT * FROM as_avro +PREHOOK: type: QUERY +PREHOOK: Input: default@as_avro +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM as_avro +POSTHOOK: type: QUERY +POSTHOOK: Input: default@as_avro +#### A masked pattern was here #### +POSTHOOK: Lineage: as_avro.bigint1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:bigint1, type:bigint, comment:null), ] +POSTHOOK: Lineage: as_avro.boolean1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:boolean1, type:boolean, comment:null), ] +POSTHOOK: Lineage: as_avro.bytes1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:bytes1, type:array, comment:null), ] +POSTHOOK: Lineage: as_avro.double1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:double1, type:double, comment:null), ] +POSTHOOK: Lineage: as_avro.enum1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:enum1, type:string, comment:null), ] +POSTHOOK: Lineage: as_avro.fixed1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:fixed1, type:array, comment:null), ] +POSTHOOK: Lineage: as_avro.float1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:float1, type:float, comment:null), ] +POSTHOOK: Lineage: as_avro.int1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:int1, type:int, comment:null), ] +POSTHOOK: Lineage: as_avro.list1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:list1, type:array, comment:null), ] +POSTHOOK: Lineage: as_avro.map1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:map1, type:map, comment:null), ] +POSTHOOK: Lineage: as_avro.nullableint SIMPLE 
[(test_serializer)test_serializer.FieldSchema(name:nullableint, type:int, comment:null), ] +POSTHOOK: Lineage: as_avro.smallint1 EXPRESSION [(test_serializer)test_serializer.FieldSchema(name:smallint1, type:smallint, comment:null), ] +POSTHOOK: Lineage: as_avro.string1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: as_avro.struct1 SIMPLE [(test_serializer)test_serializer.FieldSchema(name:struct1, type:struct, comment:null), ] +POSTHOOK: Lineage: as_avro.tinyint1 EXPRESSION [(test_serializer)test_serializer.FieldSchema(name:tinyint1, type:tinyint, comment:null), ] +why hello there 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +another record 98 4 101 9999999 false 99.89 9.0E-8 ["beta"] {"Earth":101} {"sint":1134,"sboolean":false,"sstring":"wazzup"} RED NULL [6,7,8,9,10] [54,55,56] +third record 45 5 102 999999999 true 89.99 9.0E-14 ["alpha","gamma"] {"Earth":237,"Bob":723} {"sint":102,"sboolean":false,"sstring":"BNL"} GREEN NULL [11,12,13] [57,58,59] +NULL 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string NULL 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 NULL 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 NULL 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 NULL true 42.43 85.23423424 
["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 NULL 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true NULL 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true 42.43 NULL ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true 42.43 85.23423424 null {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] null {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} null BLUE 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} NULL 72 [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE NULL [0,1,2,3,4,5] [50,51,53] +string 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 null [50,51,53] +string 42 3 100 1412341 true 42.43 85.23423424 ["alpha","beta","gamma"] {"Earth":42,"Bob":31,"Control":86} {"sint":17,"sboolean":true,"sstring":"Abe Linkedin"} BLUE 72 [0,1,2,3,4,5] null Index: 
serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java (revision 1426606) +++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java (working copy) @@ -21,6 +21,7 @@ import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericEnumSymbol; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; @@ -49,6 +50,12 @@ class AvroSerializer { private static final Log LOG = LogFactory.getLog(AvroSerializer.class); + /** + * The Schema to use when serializing Map keys. + * Since we're sharing this across Serializer instances, it must be immutable; + * any properties need to be added in a static initializer. + */ + private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING); AvroGenericRecordWritable cache = new AvroGenericRecordWritable(); // Hive is pretty simple (read: stupid) in writing out values via the serializer. 
@@ -90,6 +97,17 @@ } private Object serialize(TypeInfo typeInfo, ObjectInspector fieldOI, Object structFieldData, Schema schema) throws AvroSerdeException { + if(null == structFieldData) { + return null; + } + if(AvroSerdeUtils.isNullableType(schema)) { + schema = AvroSerdeUtils.getOtherTypeFromNullableType(schema); + } + /* Because we use Hive's 'string' type when Avro calls for enum, we have to expressly check for enum-ness */ + if(Schema.Type.ENUM.equals(schema.getType())) { + assert fieldOI instanceof PrimitiveObjectInspector; + return serializeEnum(typeInfo, (PrimitiveObjectInspector) fieldOI, structFieldData, schema); + } switch(typeInfo.getCategory()) { case PRIMITIVE: assert fieldOI instanceof PrimitiveObjectInspector; @@ -115,6 +133,29 @@ } } + /** private cache to avoid lots of EnumSymbol creation while serializing. + * Two levels because the enum symbol is specific to a schema. + * Object because we want to avoid the overhead of repeated toString calls while maintaining compatibility. + * Provided there are few enum types per record, and few symbols per enum, memory use should be moderate. + * eg 20 types with 50 symbols each as length-10 Strings should be on the order of 100KB per AvroSerializer. 
+ */ + final InstanceCache<Schema, InstanceCache<Object, GenericEnumSymbol>> enums + = new InstanceCache<Schema, InstanceCache<Object, GenericEnumSymbol>>() { + @Override + protected InstanceCache<Object, GenericEnumSymbol> makeInstance(final Schema schema) { + return new InstanceCache<Object, GenericEnumSymbol>() { + @Override + protected GenericEnumSymbol makeInstance(Object seed) { + return new GenericData.EnumSymbol(schema, seed.toString()); + } + }; + } + }; + + private Object serializeEnum(TypeInfo typeInfo, PrimitiveObjectInspector fieldOI, Object structFieldData, Schema schema) throws AvroSerdeException { + return enums.retrieve(schema).retrieve(serializePrimitive(typeInfo, fieldOI, structFieldData)); + } + private Object serializeStruct(StructTypeInfo typeInfo, StructObjectInspector ssoi, Object o, Schema schema) throws AvroSerdeException { int size = schema.getFields().size(); List allStructFieldRefs = ssoi.getAllStructFieldRefs(); @@ -232,7 +273,7 @@ Map deserialized = new HashMap(fieldOI.getMapSize(structFieldData)); for (Map.Entry entry : map.entrySet()) { - deserialized.put(serialize(mapKeyTypeInfo, mapKeyObjectInspector, entry.getKey(), null), // This works, but is a bit fragile. Construct a single String schema? 
+ deserialized.put(serialize(mapKeyTypeInfo, mapKeyObjectInspector, entry.getKey(), STRING_SCHEMA), serialize(mapValueTypeInfo, mapValueObjectInspector, entry.getValue(), valueType)); } Index: serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroDeserializer.java =================================================================== --- serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroDeserializer.java (revision 1426606) +++ serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroDeserializer.java (working copy) @@ -297,7 +297,7 @@ Schema s = Schema.parse(TestAvroObjectInspectorGenerator.ENUM_SCHEMA); GenericData.Record record = new GenericData.Record(s); - record.put("baddies", "DALEKS"); + record.put("baddies", new GenericData.EnumSymbol(s.getField("baddies").schema(),"DALEKS")); assertTrue(GENERIC_DATA.validate(s, record)); AvroGenericRecordWritable garw = Utils.serializeAndDeserializeRecord(record); @@ -398,13 +398,26 @@ GenericData.Record record = new GenericData.Record(s); record.put("nullableString", "this is a string"); - verifyNullableType(record, s, "this is a string"); + verifyNullableType(record, s, "nullableString", "this is a string"); record = new GenericData.Record(s); record.put("nullableString", null); - verifyNullableType(record, s, null); + verifyNullableType(record, s, "nullableString", null); } + @Test + public void canDeserializeNullableEnums() throws IOException, SerDeException { + Schema s = Schema.parse(TestAvroObjectInspectorGenerator.NULLABLE_ENUM_SCHEMA); + GenericData.Record record = new GenericData.Record(s); + record.put("nullableEnum", new GenericData.EnumSymbol(AvroSerdeUtils.getOtherTypeFromNullableType(s.getField("nullableEnum").schema()), "CYBERMEN")); + + verifyNullableType(record, s, "nullableEnum", "CYBERMEN"); + + record = new GenericData.Record(s); + record.put("nullableEnum", null); + verifyNullableType(record, s, "nullableEnum", null); + } + @Test public void canDeserializeMapWithNullablePrimitiveValues() 
throws SerDeException, IOException { Schema s = Schema.parse(TestAvroObjectInspectorGenerator.MAP_WITH_NULLABLE_PRIMITIVE_VALUE_TYPE_SCHEMA); @@ -456,7 +469,7 @@ assertEquals(null, theMap2.get("mu")); } - private void verifyNullableType(GenericData.Record record, Schema s, + private void verifyNullableType(GenericData.Record record, Schema s, String fieldName, String expected) throws SerDeException, IOException { assertTrue(GENERIC_DATA.validate(s, record)); @@ -472,13 +485,13 @@ StandardStructObjectInspector oi = (StandardStructObjectInspector)aoig.getObjectInspector(); List fieldsDataAsList = oi.getStructFieldsDataAsList(row); assertEquals(1, fieldsDataAsList.size()); - StructField fieldRef = oi.getStructFieldRef("nullablestring"); + StructField fieldRef = oi.getStructFieldRef(fieldName); ObjectInspector fieldObjectInspector = fieldRef.getFieldObjectInspector(); StringObjectInspector soi = (StringObjectInspector)fieldObjectInspector; if(expected == null) assertNull(soi.getPrimitiveJavaObject(rowElement)); else - assertEquals("this is a string", soi.getPrimitiveJavaObject(rowElement)); + assertEquals(expected, soi.getPrimitiveJavaObject(rowElement)); } } Index: serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroObjectInspectorGenerator.java =================================================================== --- serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroObjectInspectorGenerator.java (revision 1426606) +++ serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroObjectInspectorGenerator.java (working copy) @@ -154,6 +154,20 @@ "\t}\n" + " ]\n" + "}"; + public static final String NULLABLE_ENUM_SCHEMA = "{\n" + + " \"namespace\": \"clever.namespace.name.in.space\",\n" + + " \"name\": \"nullableUnionTest\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"nullableEnum\",\n" + + " \"type\": [\"null\", {\"type\":\"enum\",\"name\":\"villians\", \"symbols\": " + + "[\"DALEKS\", \"CYBERMEN\", \"SLITHEEN\", 
\"JAGRAFESS\"]}]\n" + + " \n" + + " \n" + + " }\n" + + " ]\n" + + "}"; public static final String BYTES_SCHEMA = "{\n" + " \"type\": \"record\", \n" + " \"name\": \"bytesTest\",\n" + @@ -511,6 +525,23 @@ verifyMap(aoig, "aMap"); } + @Test // That Union[T, NULL] is converted to just T. + public void convertsNullableEnum() throws SerDeException { + Schema s = Schema.parse(NULLABLE_ENUM_SCHEMA); + + AvroObjectInspectorGenerator aoig = new AvroObjectInspectorGenerator(s); + assertEquals(1, aoig.getColumnNames().size()); + assertEquals("nullableEnum", aoig.getColumnNames().get(0)); + + // Column types + assertEquals(1, aoig.getColumnTypes().size()); + TypeInfo typeInfo = aoig.getColumnTypes().get(0); + assertTrue(typeInfo instanceof PrimitiveTypeInfo); + PrimitiveTypeInfo pti = (PrimitiveTypeInfo) typeInfo; + // Verify the union has been hidden and just the main type has been returned. + assertEquals(PrimitiveObjectInspector.PrimitiveCategory.STRING, pti.getPrimitiveCategory()); + } + @Test public void objectInspectorsAreCached() throws SerDeException { // Verify that Hive is caching the object inspectors for us. 
Index: serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroSerializer.java =================================================================== --- serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroSerializer.java (revision 1426606) +++ serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroSerializer.java (working copy) @@ -20,6 +20,7 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericEnumSymbol; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -30,13 +31,13 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; -import java.util.Hashtable; import java.util.HashMap; import java.util.List; import java.util.Map; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotSame; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -124,7 +125,7 @@ @Test public void canSerializeMaps() throws SerDeException, IOException { - Map m = new Hashtable(); + Map m = new HashMap(); m.put("yes", true); m.put("no", false); String field = "{ \"name\":\"map1\", \"type\":{\"type\":\"map\", \"values\":\"boolean\"} }"; @@ -187,10 +188,13 @@ private enum enum1 {BLUE, RED , GREEN}; @Test public void canSerializeEnums() throws SerDeException, IOException { + String type = "{\"type\": \"enum\", \"name\": \"enum1_values\", " + + "\"symbols\":[\"BLUE\",\"RED\",\"GREEN\"]}"; + Schema schema = Schema.parse(type); + String field = "{ \"name\":\"enum1\", \"type\": " + schema + " }"; for(enum1 e : enum1.values()) { - String field = "{ \"name\":\"enum1\", \"type\":{\"type\":\"enum\", " + - "\"name\":\"enum1_values\", \"symbols\":[\"BLUE\",\"RED\", \"GREEN\"]} }"; - GenericRecord r = 
serializeAndDeserialize(field, "enum1", e); + GenericEnumSymbol symbol = new GenericData.EnumSymbol(schema, e.toString()); + GenericRecord r = serializeAndDeserialize(field, "enum1", symbol); assertEquals(e, enum1.valueOf(r.get("enum1").toString())); } @@ -198,7 +202,22 @@ } @Test - public void canSerializeNullableTypes() throws SerDeException, IOException { + public void canSerializeNullableEnums() throws SerDeException, IOException { + String type = "{\"type\": \"enum\", \"name\": \"enum1_values\",\n" + + " \"namespace\": \"org.apache.hadoop.hive\",\n" + + " \"symbols\":[\"BLUE\",\"RED\",\"GREEN\"]}"; + Schema schema = Schema.parse(type); + String field = "{ \"name\":\"nullableenum\", \"type\": [\"null\", " + schema + "] }"; + GenericEnumSymbol symbol = new GenericData.EnumSymbol(schema, enum1.BLUE.toString()); + GenericRecord r = serializeAndDeserialize(field, "nullableenum", symbol); + assertEquals(enum1.BLUE, enum1.valueOf(r.get("nullableenum").toString())); + + r = serializeAndDeserialize(field, "nullableenum", null); + assertNull(r.get("nullableenum")); + } + + @Test + public void canSerializeNullablePrimitiveTypes() throws SerDeException, IOException { String field = "{ \"name\":\"nullableint\", \"type\":[\"int\", \"null\"] }"; GenericRecord r = serializeAndDeserialize(field, "nullableint", 42); assertEquals(42, r.get("nullableint")); @@ -223,6 +242,175 @@ } @Test + public void canSerializeNullableRecords() throws SerDeException, IOException { + String field = "{ \"name\":\"nullableStruct\", \"type\": [\"null\", {\"type\":\"record\", " + + "\"name\":\"struct1_name\", \"fields\": [\n" + + "{ \"name\":\"sInt\", \"type\":\"int\" }, " + + "{ \"name\":\"sBoolean\", \"type\":\"boolean\" }, " + + "{ \"name\":\"sString\", \"type\":\"string\" } ] }] }"; + + Schema s = buildSchema(field); + Schema nullable = s.getField("nullableStruct").schema(); + assertTrue(AvroSerdeUtils.isNullableType(nullable)); + GenericData.Record innerRecord = + new 
GenericData.Record(AvroSerdeUtils.getOtherTypeFromNullableType(nullable)); + + innerRecord.put("sInt", 77); + innerRecord.put("sBoolean", false); + innerRecord.put("sString", "tedious"); + + GenericRecord r = serializeAndDeserialize(field, "nullableStruct", innerRecord); + Object result = r.get("nullableStruct"); + assertNotSame(innerRecord, result); + assertEquals(innerRecord, result); + + r = serializeAndDeserialize(field, "nullableStruct", null); + assertNull(r.get("nullableStruct")); + } + + @Test + public void canSerializeNullableLists() throws SerDeException, IOException { + List intList = new ArrayList(); + Collections.addAll(intList, 1,2, 3); + String field = "{ \"name\":\"nullableList\", \"type\": [\"null\", " + + "{\"type\":\"array\", \"items\":\"int\"}] }"; + GenericRecord r = serializeAndDeserialize(field, "nullableList", intList); + Object result = r.get("nullableList"); + assertNotSame(intList, result); + assertEquals(intList, result); + + r = serializeAndDeserialize(field, "nullableList", null); + assertNull(r.get("nullableList")); + } + + @Test + public void canSerializeNullableMaps() throws SerDeException, IOException { + String field = "{ \"name\":\"nullableMap\", \"type\": [\"null\", " + + "{\"type\":\"map\", \"values\":\"boolean\"}] }"; + + Map m = new HashMap(); + m.put("yes", true); + m.put("no", false); + GenericRecord r = serializeAndDeserialize(field, "nullableMap", m); + + Object result = r.get("nullableMap"); + assertNotSame(m, result); + assertEquals(m, result); + + r = serializeAndDeserialize(field, "nullableMap", null); + assertNull(r.get("nullableMap")); + } + + @Test + public void canSerializeNullableFixed() throws SerDeException, IOException { + String field = "{ \"name\":\"nullableFixed\", \"type\": [\"null\", " + + "{\"type\":\"fixed\", \"name\":\"threebytes\", \"size\":3}] }"; + Schema s = buildSchema(field); + Schema nullable = s.getField("nullableFixed").schema(); + assertTrue(AvroSerdeUtils.isNullableType(nullable)); + + 
GenericData.Fixed fixed = new GenericData.Fixed( + AvroSerdeUtils.getOtherTypeFromNullableType(nullable), "k9@".getBytes()); + GenericRecord r = serializeAndDeserialize(field, "nullableFixed", fixed); + + GenericData.Fixed result = (GenericData.Fixed) r.get("nullableFixed"); + assertNotSame(fixed, result); + assertArrayEquals(fixed.bytes(), result.bytes()); + + r = serializeAndDeserialize(field, "nullableFixed", null); + assertNull(r.get("nullableFixed")); + } + + @Test + public void canSerializeNullableBytes() throws SerDeException, IOException { + String field = "{ \"name\":\"nullableBytes\", \"type\":[\"null\", \"bytes\"] }"; + ByteBuffer bb = ByteBuffer.wrap("easy as one two three".getBytes()); + bb.rewind(); + GenericRecord r = serializeAndDeserialize(field, "nullableBytes", bb); + + Object result = r.get("nullableBytes"); + assertNotSame(bb, result); + assertEquals(bb, result); + + r = serializeAndDeserialize(field, "nullableBytes", null); + assertNull(r.get("nullableBytes")); + } + + @Test + public void canSerializeArraysWithNullablePrimitiveElements() throws SerDeException, IOException { + final String field = "{ \"name\":\"listWithNulls\", \"type\": " + + "{\"type\":\"array\", \"items\": [\"null\", \"int\"]} }"; + List<Integer> intList = new ArrayList<Integer>(); + Collections.addAll(intList, 1,2, null, 3); + GenericRecord r = serializeAndDeserialize(field, "listWithNulls", intList); + Object result = r.get("listWithNulls"); + assertNotSame(intList, result); + assertEquals(intList, result); + } + + @Test + public void canSerializeArraysWithNullableComplexElements() throws SerDeException, IOException { + final String field = "{ \"name\":\"listOfNullableLists\", \"type\": " + + "{\"type\":\"array\", \"items\": [\"null\", " + + "{\"type\": \"array\", \"items\": \"int\"}]} }"; + List<List<Integer>> intListList = new ArrayList<List<Integer>>(); + List<Integer> intList = new ArrayList<Integer>(); + Collections.addAll(intList, 1,2,3); + Collections.addAll(intListList, intList, null); + GenericRecord r = 
serializeAndDeserialize(field, "listOfNullableLists", intListList); + Object result = r.get("listOfNullableLists"); + assertNotSame(intListList, result); + assertEquals(intListList, result); + } + + @Test + public void canSerializeRecordsWithNullableComplexElements() throws SerDeException, IOException { + String field = "{ \"name\":\"struct1\", \"type\":{\"type\":\"record\", " + + "\"name\":\"struct1_name\", \"fields\": [\n" + + "{ \"name\":\"sInt\", \"type\":\"int\" }, { \"name\"" + + ":\"sBoolean\", \"type\":\"boolean\" }, { \"name\":\"nullableList\", \"type\":[\"null\", " + + "{ \"type\":\"array\", \"items\":\"int\"}] } ] } }"; + + Schema s = buildSchema(field); + GenericData.Record innerRecord = new GenericData.Record(s.getField("struct1").schema()); + + innerRecord.put("sInt", 77); + innerRecord.put("sBoolean", false); + List<Integer> intList = new ArrayList<Integer>(); + Collections.addAll(intList, 1,2,3); + innerRecord.put("nullableList", intList); + + GenericRecord r = serializeAndDeserialize(field, "struct1", innerRecord); + Object result = r.get("struct1"); + assertNotSame(innerRecord, result); + assertEquals(innerRecord, result); + + innerRecord.put("nullableList", null); + r = serializeAndDeserialize(field, "struct1", innerRecord); + result = r.get("struct1"); + assertNotSame(innerRecord, result); + assertEquals(innerRecord, result); + } + + @Test + public void canSerializeMapsWithNullableComplexValues() throws SerDeException, IOException { + String field = "{ \"name\":\"mapWithNullableLists\", \"type\": " + + "{\"type\":\"map\", \"values\": [\"null\", " + + "{\"type\": \"array\", \"items\": \"int\"}]} }"; + + Map<String, List<Integer>> m = new HashMap<String, List<Integer>>(); + List<Integer> intList = new ArrayList<Integer>(); + Collections.addAll(intList, 1,2,3); + m.put("list", intList); + m.put("null", null); + GenericRecord r = serializeAndDeserialize(field, "mapWithNullableLists", m); + + Object result = r.get("mapWithNullableLists"); + assertNotSame(m, result); + assertEquals(m, result); + } + + @Test public void 
canSerializeBytes() throws SerDeException, IOException { String field = "{ \"name\":\"bytes1\", \"type\":\"bytes\" }"; ByteBuffer bb = ByteBuffer.wrap("easy as one two three".getBytes());