diff --git data/files/avro_charvarchar.txt data/files/avro_charvarchar.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e9a86773987be1b859dd0f6bd908c62f883a5ac
--- /dev/null
+++ data/files/avro_charvarchar.txt
@@ -0,0 +1,4 @@
+a |a |k1:v1|101,x200|10,abcdef
+ab|ab |k2:v123456|102,y200|10,abc
+abc|abc|k3:v1234|103,200|10,a
+abcdefghijklm|abcdefghijklmnop|k9:v12|109,200|10, abcdef
diff --git ql/src/test/queries/clientpositive/avro_charvarchar.q ql/src/test/queries/clientpositive/avro_charvarchar.q
new file mode 100644
index 0000000000000000000000000000000000000000..f36e14377e95f376ccb9d6ad17f761269956a33f
--- /dev/null
+++ ql/src/test/queries/clientpositive/avro_charvarchar.q
@@ -0,0 +1,27 @@
+DROP TABLE avro_charvarchar_staging;
+DROP TABLE avro_charvarchar;
+
+CREATE TABLE avro_charvarchar_staging (
+  cchar char(5),
+  cvarchar varchar(10),
+  m1 map,
+  l1 array,
+  st1 struct
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':';
+
+CREATE TABLE avro_charvarchar (
+  cchar char(5),
+  cvarchar varchar(10),
+  m1 map,
+  l1 array,
+  st1 struct
+) STORED AS AVRO;
+
+LOAD DATA LOCAL INPATH '../../data/files/avro_charvarchar.txt' OVERWRITE INTO TABLE avro_charvarchar_staging;
+
+INSERT OVERWRITE TABLE avro_charvarchar SELECT * FROM avro_charvarchar_staging;
+
+SELECT * FROM avro_charvarchar;
diff --git ql/src/test/results/clientpositive/avro_charvarchar.q.out ql/src/test/results/clientpositive/avro_charvarchar.q.out
new file mode 100644
index 0000000000000000000000000000000000000000..41b5a41352f0126e2c500a6c5b497ef8b21253aa
--- /dev/null
+++ ql/src/test/results/clientpositive/avro_charvarchar.q.out
@@ -0,0 +1,87 @@
+PREHOOK: query: DROP TABLE avro_charvarchar_staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE avro_charvarchar_staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE avro_charvarchar
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE avro_charvarchar
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE avro_charvarchar_staging (
+  cchar char(5),
+  cvarchar varchar(10),
+  m1 map,
+  l1 array,
+  st1 struct
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@avro_charvarchar_staging
+POSTHOOK: query: CREATE TABLE avro_charvarchar_staging (
+  cchar char(5),
+  cvarchar varchar(10),
+  m1 map,
+  l1 array,
+  st1 struct
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@avro_charvarchar_staging
+PREHOOK: query: CREATE TABLE avro_charvarchar (
+  cchar char(5),
+  cvarchar varchar(10),
+  m1 map,
+  l1 array,
+  st1 struct
+) STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@avro_charvarchar
+POSTHOOK: query: CREATE TABLE avro_charvarchar (
+  cchar char(5),
+  cvarchar varchar(10),
+  m1 map,
+  l1 array,
+  st1 struct
+) STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@avro_charvarchar
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/avro_charvarchar.txt' OVERWRITE INTO TABLE avro_charvarchar_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@avro_charvarchar_staging
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/avro_charvarchar.txt' OVERWRITE INTO TABLE avro_charvarchar_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@avro_charvarchar_staging
+PREHOOK: query: INSERT OVERWRITE TABLE avro_charvarchar SELECT * FROM avro_charvarchar_staging
+PREHOOK: type: QUERY
+PREHOOK: Input: default@avro_charvarchar_staging
+PREHOOK: Output: default@avro_charvarchar
+POSTHOOK: query: INSERT OVERWRITE TABLE avro_charvarchar SELECT * FROM avro_charvarchar_staging
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@avro_charvarchar_staging
+POSTHOOK: Output: default@avro_charvarchar
+POSTHOOK: Lineage: avro_charvarchar.cchar SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:cchar, type:char(5), comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.cvarchar SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.l1 SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:l1, type:array, comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.m1 SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:m1, type:map, comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.st1 SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:st1, type:struct, comment:null), ]
+PREHOOK: query: SELECT * FROM avro_charvarchar
+PREHOOK: type: QUERY
+PREHOOK: Input: default@avro_charvarchar
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM avro_charvarchar
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@avro_charvarchar
+#### A masked pattern was here ####
+a a {"k1":"v1"} ["101","x200"] {"c1":10,"c2":"abcd"}
+ab ab {"k2":"v1"} ["102","y200"] {"c1":10,"c2":"abc"}
+abc abc {"k3":"v1"} ["103","200"] {"c1":10,"c2":"a "}
+abcde abcdefghij {"k9":"v1"} ["109","200"] {"c1":10,"c2":" a"}
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
index 688b0727402c983341faa859bd889d11e95778e9..2fbe00091f12f90a4d5979ca701b30d78b0f1cb0 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
@@ -42,7 +42,9 @@ import org.apache.avro.util.Utf8;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.common.type.HiveChar;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
@@ -200,7 +202,6 @@ private Object worker(Object datum, Schema fileSchema, Schema recordSchema, Type
       return deserializeNullableUnion(datum, fileSchema, recordSchema, columnType);
     }
-
     switch(columnType.getCategory()) {
     case STRUCT:
       return deserializeStruct((GenericData.Record) datum, fileSchema, (StructTypeInfo) columnType);
@@ -249,6 +250,36 @@ private Object deserializePrimitive(Object datum, Schema fileSchema, Schema reco
       JavaHiveDecimalObjectInspector oi = (JavaHiveDecimalObjectInspector)
         PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector((DecimalTypeInfo)columnType);
       return oi.set(null, dec);
+    case CHAR:
+      if (fileSchema == null) {
+        throw new AvroSerdeException("File schema is missing for char field. Reader schema is " + columnType);
+      }
+
+      int maxLength = 0;
+      try {
+        maxLength = fileSchema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+      } catch (Exception ex) {
+        throw new AvroSerdeException("Failed to obtain maxLength value for char field from file schema: " + fileSchema, ex);
+      }
+
+      String str = datum.toString();
+      HiveChar hc = new HiveChar(str, maxLength);
+      return hc;
+    case VARCHAR:
+      if (fileSchema == null) {
+        throw new AvroSerdeException("File schema is missing for varchar field. Reader schema is " + columnType);
+      }
+
+      maxLength = 0;
+      try {
+        maxLength = fileSchema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+      } catch (Exception ex) {
+        throw new AvroSerdeException("Failed to obtain maxLength value for varchar field from file schema: " + fileSchema, ex);
+      }
+
+      str = datum.toString();
+      HiveVarchar hvc = new HiveVarchar(str, maxLength);
+      return hvc;
     default:
       return datum;
     }
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java
index 69545b046db06fd56f35a0da09d3d6960832484d..a52b2edf08a5799c280cb18d9e98498dbf8aa763 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java
@@ -41,9 +41,13 @@ private static final Log LOG = LogFactory.getLog(AvroSerDe.class);
   public static final String DECIMAL_TYPE_NAME = "decimal";
+  public static final String CHAR_TYPE_NAME = "char";
+  public static final String VARCHAR_TYPE_NAME = "varchar";
   public static final String AVRO_PROP_LOGICAL_TYPE = "logicalType";
   public static final String AVRO_PROP_PRECISION = "precision";
   public static final String AVRO_PROP_SCALE = "scale";
+  public static final String AVRO_PROP_MAX_LENGTH = "maxLength";
+  public static final String AVRO_STRING_TYPE_NAME = "string";
   private ObjectInspector oi;
   private List columnNames;
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java
index 2bd48ca5cc8a4cc4b30961511668ae29049de3a1..35a7a0ea286610a0ea8dbd7076c8539c66792df9 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java
@@ -30,7 +30,9 @@ import org.apache.avro.generic.GenericEnumSymbol;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.common.type.HiveChar;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -193,6 +195,12 @@ private Object serializePrimitive(TypeInfo typeInfo, PrimitiveObjectInspector fi
     case DECIMAL:
       HiveDecimal dec = (HiveDecimal)fieldOI.getPrimitiveJavaObject(structFieldData);
       return AvroSerdeUtils.getBufferFromDecimal(dec, ((DecimalTypeInfo)typeInfo).scale());
+    case CHAR:
+      HiveChar ch = (HiveChar)fieldOI.getPrimitiveJavaObject(structFieldData);
+      return ch.getStrippedValue();
+    case VARCHAR:
+      HiveVarchar vc = (HiveVarchar)fieldOI.getPrimitiveJavaObject(structFieldData);
+      return vc.getValue();
     case UNKNOWN:
       throw new AvroSerdeException("Received UNKNOWN primitive category.");
     case VOID:
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
index 23e024fd2c68cfe0d445ba291c9268430d35c9e0..640b645a70f77f57931f4b707cc7d44a95965608 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
@@ -128,6 +128,28 @@ public static TypeInfo generateTypeInfo(Schema schema) throws AvroSerdeException
       return TypeInfoFactory.getDecimalTypeInfo(precision, scale);
     }
+    if (type == Schema.Type.STRING &&
+        AvroSerDe.CHAR_TYPE_NAME.equalsIgnoreCase(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) {
+      int maxLength = 0;
+      try {
+        maxLength = schema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+      } catch (Exception ex) {
+        throw new AvroSerdeException("Failed to obtain maxLength value from file schema: " + schema, ex);
+      }
+      return TypeInfoFactory.getCharTypeInfo(maxLength);
+    }
+
+    if (type == Schema.Type.STRING &&
+        AvroSerDe.VARCHAR_TYPE_NAME.equalsIgnoreCase(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) {
+      int maxLength = 0;
+      try {
+        maxLength = schema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+      } catch (Exception ex) {
+        throw new AvroSerdeException("Failed to obtain maxLength value from file schema: " + schema, ex);
+      }
+      return TypeInfoFactory.getVarcharTypeInfo(maxLength);
+    }
+
     return typeInfoCache.retrieve(schema);
   }
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
index 4169558a8467e569bf5adc2433e640c9589d3452..8bf014b7ba95860578d430631c97f21567ea516d 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
@@ -19,6 +19,7 @@
 import org.apache.avro.Schema;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
@@ -26,6 +27,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
 import org.codehaus.jackson.JsonNode;
 import org.codehaus.jackson.node.JsonNodeFactory;
@@ -105,10 +107,16 @@ private Schema createAvroPrimitive(TypeInfo typeInfo) {
       schema = Schema.create(Schema.Type.STRING);
       break;
     case CHAR:
-      schema = Schema.create(Schema.Type.STRING);
+      schema = AvroSerdeUtils.getSchemaFor("{" +
+          "\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," +
+          "\"logicalType\":\"" + AvroSerDe.CHAR_TYPE_NAME + "\"," +
+          "\"maxLength\":" + ((CharTypeInfo) typeInfo).getLength() + "}");
       break;
     case VARCHAR:
-      schema = Schema.create(Schema.Type.STRING);
+      schema = AvroSerdeUtils.getSchemaFor("{" +
+          "\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," +
+          "\"logicalType\":\"" + AvroSerDe.VARCHAR_TYPE_NAME + "\"," +
+          "\"maxLength\":" + ((VarcharTypeInfo) typeInfo).getLength() + "}");
       break;
     case BINARY:
       schema = Schema.create(Schema.Type.BYTES);
diff --git serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
index 0f53e31d32137be89c6e7d0bcdc1b184c0964ad9..da58125a60c4ce6b9281147649894d6308d88906 100644
--- serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
+++ serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
@@ -205,6 +205,30 @@ public void createAvroDecimalSchema() {
   }

   @Test
+  public void createAvroCharSchema() {
+    final String specificSchema = "{" +
+        "\"type\":\"string\"," +
+        "\"logicalType\":\"char\"," +
+        "\"maxLength\":" + CHAR_LEN + "}";
+    String expectedSchema = genSchema(specificSchema);
+
+    Assert.assertEquals("Test for char's avro schema failed",
+        expectedSchema, getAvroSchemaString(CHAR));
+  }
+
+  @Test
+  public void createAvroVarcharSchema() {
+    final String specificSchema = "{" +
+        "\"type\":\"string\"," +
+        "\"logicalType\":\"varchar\"," +
+        "\"maxLength\":" + CHAR_LEN + "}";
+    String expectedSchema = genSchema(specificSchema);
+
+    Assert.assertEquals("Test for varchar's avro schema failed",
+        expectedSchema, getAvroSchemaString(VARCHAR));
+  }
+
+  @Test
   public void createAvroListSchema() {
     ListTypeInfo listTypeInfo = new ListTypeInfo();
     listTypeInfo.setListElementTypeInfo(STRING);
diff --git serde/src/test/resources/avro-struct.avsc serde/src/test/resources/avro-struct.avsc
index c8c83d777ca6088723e6f1e5cd6e6547bc6a9bc3..007e6c2838bfcc1c345ce3cd590f74e30fb2e8b1 100644
--- serde/src/test/resources/avro-struct.avsc
+++ serde/src/test/resources/avro-struct.avsc
@@ -7,8 +7,8 @@ field6:smallint,field7:int,field8:bigint,field9:float,field10:double,field11:boo
 field12:decimal(4,2),field13:void>",
 "fields":[
 {"name":"field1","type":["null","string"],"doc":"string","default":null},
-{"name":"field2","type":["null","string"],"doc":"char(5)","default":null},
-{"name":"field3","type":["null","string"],"doc":"varchar(5)","default":null},
+{"name":"field2","type":["null",{"type":"string","logicalType":"char","maxLength":5}],"doc":"char(5)","default":null},
+{"name":"field3","type":["null",{"type":"string","logicalType":"varchar","maxLength":5}],"doc":"varchar(5)","default":null},
 {"name":"field4","type":["null","bytes"],"doc":"binary","default":null},
 {"name":"field5","type":["null","int"],"doc":"tinyint","default":null},
 {"name":"field6","type":["null","int"],"doc":"smallint","default":null},
@@ -21,4 +21,4 @@ field12:decimal(4,2),field13:void>",
 "scale":2}],"doc":"decimal(4,2)","default":null},
 {"name":"field13","type":"null","doc":"void","default":null}
 ]
-}
\ No newline at end of file
+}
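
Note: the sketch below is not part of the patch. It is a minimal, hypothetical Java illustration of the schema shape this change relies on: a plain Avro "string" carrying the extra "logicalType" and "maxLength" properties that TypeInfoToSchema emits and that SchemaToTypeInfo/AvroDeserializer read back. The class and variable names are illustrative, and it assumes an Avro 1.7.x line where Schema.getJsonProp returns an org.codehaus.jackson JsonNode, matching the getJsonProp/getValueAsInt calls used in the patch.

import org.apache.avro.Schema;

public class CharSchemaSketch {
  public static void main(String[] args) {
    // Schema of the kind createAvroPrimitive() generates for a Hive char(5) column.
    String json = "{\"type\":\"string\",\"logicalType\":\"char\",\"maxLength\":5}";
    Schema schema = new Schema.Parser().parse(json);

    // The same properties the SerDe inspects when mapping the schema back to a Hive type.
    String logicalType = schema.getProp("logicalType");              // "char"
    int maxLength = schema.getJsonProp("maxLength").getValueAsInt(); // 5

    // Prints: char(5)
    System.out.println(logicalType + "(" + maxLength + ")");
  }
}

Because the Avro type stays "string" and only extra properties are added, readers that ignore the logical type still see ordinary string data; only schema-aware readers recover the char/varchar length.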