diff --git ql/src/java/org/apache/hadoop/hive/ql/io/AvroStorageFormatDescriptor.java ql/src/java/org/apache/hadoop/hive/ql/io/AvroStorageFormatDescriptor.java
new file mode 100644
index 0000000..66b6573
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/io/AvroStorageFormatDescriptor.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+
+import com.google.common.collect.ImmutableSet;
+import org.apache.hadoop.hive.ql.io.AbstractStorageFormatDescriptor;
+import org.apache.hadoop.hive.ql.io.IOConstants;
+import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
+import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
+import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
+
+import java.util.Set;
+
+public class AvroStorageFormatDescriptor extends AbstractStorageFormatDescriptor {
+  @Override
+  public Set<String> getNames() {
+    return ImmutableSet.of(IOConstants.AVRO, IOConstants.AVROFILE);
+  }
+  @Override
+  public String getInputFormat() {
+    return AvroContainerInputFormat.class.getName();
+  }
+  @Override
+  public String getOutputFormat() {
+    return AvroContainerOutputFormat.class.getName();
+  }
+  @Override
+  public String getSerde() {
+    return AvroSerDe.class.getName();
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java
index 1bae0a8..9879dfe 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java
@@ -33,6 +33,8 @@
   public static final String ORCFILE = "ORCFILE";
   public static final String PARQUET = "PARQUET";
   public static final String PARQUETFILE = "PARQUETFILE";
+  public static final String AVRO = "AVRO";
+  public static final String AVROFILE = "AVROFILE";
 
   @VisibleForTesting
   public static final String CUSTOM_TEXT_SERDE = "CustomTextSerde";
diff --git ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor
index a23ff11..d858a95 100644
--- ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor
+++ ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor
@@ -3,3 +3,4 @@ org.apache.hadoop.hive.ql.io.SequenceFileStorageFormatDescriptor
 org.apache.hadoop.hive.ql.io.RCFileStorageFormatDescriptor
 org.apache.hadoop.hive.ql.io.ORCFileStorageFormatDescriptor
 org.apache.hadoop.hive.ql.io.ParquetFileStorageFormatDescriptor
+org.apache.hadoop.hive.ql.io.AvroStorageFormatDescriptor
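The three hunks above are the whole registration story: the descriptor maps the AVRO and AVROFILE keywords onto the Avro container input/output formats and the AvroSerDe, and the META-INF/services entry makes the class discoverable. Below is a minimal, hedged sketch of how such a java.util.ServiceLoader registration is typically consumed; the lookup loop is illustrative only, not Hive's exact resolution code.

import java.util.ServiceLoader;

import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.StorageFormatDescriptor;

public class DescriptorLookupSketch {
  public static void main(String[] args) {
    // Walks every descriptor named in a META-INF/services file on the
    // classpath, which after this patch includes AvroStorageFormatDescriptor.
    for (StorageFormatDescriptor descriptor :
        ServiceLoader.load(StorageFormatDescriptor.class)) {
      if (descriptor.getNames().contains(IOConstants.AVRO)) {
        System.out.println(descriptor.getInputFormat());
        System.out.println(descriptor.getOutputFormat());
        System.out.println(descriptor.getSerde());
      }
    }
  }
}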
diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java
index d53ebc6..3cc8032 100644
--- ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java
+++ ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java
@@ -38,5 +38,7 @@ public void testNames() {
         (new ORCFileStorageFormatDescriptor()).getNames());
     Assert.assertEquals(Sets.newHashSet(IOConstants.PARQUET, IOConstants.PARQUETFILE),
         (new ParquetFileStorageFormatDescriptor()).getNames());
+    Assert.assertEquals(Sets.newHashSet(IOConstants.AVRO, IOConstants.AVROFILE),
+        (new AvroStorageFormatDescriptor()).getNames());
   }
 }
diff --git ql/src/test/queries/clientpositive/avro_compression_enabled_native.q ql/src/test/queries/clientpositive/avro_compression_enabled_native.q
new file mode 100644
index 0000000..99c38ba
--- /dev/null
+++ ql/src/test/queries/clientpositive/avro_compression_enabled_native.q
@@ -0,0 +1,14 @@
+-- verify that new joins bring in correct schemas (including evolved schemas)
+
+CREATE TABLE doctors4 (
+  number int,
+  first_name string,
+  last_name string,
+  extra_field string)
+STORED AS AVRO;
+
+LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4;
+
+set hive.exec.compress.output=true;
+
+SELECT count(*) FROM src;
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/avro_decimal_native.q ql/src/test/queries/clientpositive/avro_decimal_native.q
new file mode 100644
index 0000000..e856586
--- /dev/null
+++ ql/src/test/queries/clientpositive/avro_decimal_native.q
@@ -0,0 +1,42 @@
+DROP TABLE IF EXISTS dec;
+
+CREATE TABLE dec (
+  name string,
+  value decimal(8,4));
+
+LOAD DATA LOCAL INPATH '../../data/files/dec.txt' into TABLE dec;
+
+ANALYZE TABLE dec COMPUTE STATISTICS FOR COLUMNS value;
+DESC FORMATTED dec value;
+
+DROP TABLE IF EXISTS avro_dec;
+
+CREATE TABLE avro_dec(
+  name string,
+  value decimal(5,2))
+COMMENT 'just drop the schema right into the HQL'
+STORED AS AVRO;
+
+DESC avro_dec;
+
+INSERT OVERWRITE TABLE avro_dec SELECT name, value FROM dec;
+
+SELECT * FROM avro_dec;
+
+DROP TABLE IF EXISTS avro_dec1;
+
+CREATE TABLE avro_dec1(
+  name string,
+  value decimal(4,1))
+COMMENT 'just drop the schema right into the HQL'
+STORED AS AVRO;
+
+DESC avro_dec1;
+
+LOAD DATA LOCAL INPATH '../../data/files/dec.avro' INTO TABLE avro_dec1;
+
+SELECT value FROM avro_dec1;
+
+DROP TABLE dec;
+DROP TABLE avro_dec;
+DROP TABLE avro_dec1;
diff --git ql/src/test/queries/clientpositive/avro_joins_native.q ql/src/test/queries/clientpositive/avro_joins_native.q
new file mode 100644
index 0000000..ca95c16
--- /dev/null
+++ ql/src/test/queries/clientpositive/avro_joins_native.q
@@ -0,0 +1,26 @@
+-- SORT_QUERY_RESULTS
+
+-- verify that new joins bring in correct schemas (including evolved schemas)
+
+CREATE TABLE doctors4 (
+  number int COMMENT "Order of playing the role",
+  first_name string COMMENT "first name of actor playing role",
+  last_name string COMMENT "last name of actor playing role")
+STORED AS AVRO;
+
+DESCRIBE doctors4;
+
+LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4;
+
+CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO;
+
+DESCRIBE episodes;
+
+LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes;
+
+SELECT e.title, e.air_date, d.first_name, d.last_name, e.air_date
+FROM doctors4 d JOIN episodes e ON (d.number=e.doctor);
\ No newline at end of file
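The .q files above exercise the new STORED AS AVRO syntax end to end against the checked-in doctors.avro and episodes.avro fixtures. For reference, what those LOAD statements pull in is an ordinary Avro container file; the following plain-Avro sketch dumps one, with the relative path assumed to resolve against the test working directory.

import java.io.File;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReadDoctorsSketch {
  public static void main(String[] args) throws Exception {
    // GenericDatumReader with no reader schema uses the writer schema
    // embedded in the container file, just as a fresh Avro table would.
    DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
        new File("data/files/doctors.avro"),
        new GenericDatumReader<GenericRecord>());
    while (reader.hasNext()) {
      GenericRecord r = reader.next();
      System.out.println(r.get("number") + "\t" + r.get("first_name")
          + "\t" + r.get("last_name"));
    }
    reader.close();
  }
}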
diff --git ql/src/test/queries/clientpositive/avro_native.q ql/src/test/queries/clientpositive/avro_native.q
new file mode 100644
index 0000000..61d1bc6
--- /dev/null
+++ ql/src/test/queries/clientpositive/avro_native.q
@@ -0,0 +1,14 @@
+-- SORT_QUERY_RESULTS
+
+-- verify that we can actually read avro files
+CREATE TABLE doctors (
+  number int,
+  first_name string,
+  last_name string)
+STORED AS AVRO;
+
+DESCRIBE doctors;
+
+LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors;
+
+SELECT * FROM doctors;
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/avro_partitioned_native.q ql/src/test/queries/clientpositive/avro_partitioned_native.q
new file mode 100644
index 0000000..0e261b3
--- /dev/null
+++ ql/src/test/queries/clientpositive/avro_partitioned_native.q
@@ -0,0 +1,29 @@
+-- SORT_QUERY_RESULTS
+-- Verify that table scans work with partitioned Avro tables
+CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO;
+
+LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes;
+
+CREATE TABLE episodes_partitioned (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+PARTITIONED BY (doctor_pt INT)
+STORED AS AVRO;
+
+SET hive.exec.dynamic.partition.mode=nonstrict;
+INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt)
+SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes;
+
+SELECT * FROM episodes_partitioned WHERE doctor_pt > 6;
+
+-- Verify that Fetch works in addition to Map
+SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5;
+-- Fetch w/filter to specific partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 6;
+-- Fetch w/non-existent partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5;
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/avro_schema_evolution_native.q ql/src/test/queries/clientpositive/avro_schema_evolution_native.q
new file mode 100644
index 0000000..ff155f7
--- /dev/null
+++ ql/src/test/queries/clientpositive/avro_schema_evolution_native.q
@@ -0,0 +1,62 @@
+-- SORT_QUERY_RESULTS
+-- Verify that table scans work with partitioned Avro tables
+CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO;
+
+LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes;
+
+CREATE TABLE episodes_partitioned (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+PARTITIONED BY (doctor_pt INT)
+STORED AS AVRO;
+
+SET hive.exec.dynamic.partition.mode=nonstrict;
+INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt)
+SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes;
+
+ALTER TABLE episodes_partitioned
+SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+WITH
+SERDEPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    },
{ + "name":"value", + "type":"int", + "default":0, + "doc":"default value" + } + ] +}'); + + +SELECT * FROM episodes_partitioned WHERE doctor_pt > 6; + +-- Verify that Fetch works in addition to Map +SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5; +-- Fetch w/filter to specific partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 6; +-- Fetch w/non-existent partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5; \ No newline at end of file diff --git ql/src/test/results/clientpositive/avro_compression_enabled_native.q.out ql/src/test/results/clientpositive/avro_compression_enabled_native.q.out new file mode 100644 index 0000000..b8fa61e --- /dev/null +++ ql/src/test/results/clientpositive/avro_compression_enabled_native.q.out @@ -0,0 +1,38 @@ +PREHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 ( + number int, + first_name string, + last_name string, + extra_field string) +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 ( + number int, + first_name string, + last_name string, + extra_field string) +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@doctors4 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@doctors4 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@doctors4 +PREHOOK: query: SELECT count(*) FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(*) FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 diff --git ql/src/test/results/clientpositive/avro_decimal_native.q.out ql/src/test/results/clientpositive/avro_decimal_native.q.out new file mode 100644 index 0000000..bc87a7d --- /dev/null +++ ql/src/test/results/clientpositive/avro_decimal_native.q.out @@ -0,0 +1,172 @@ +PREHOOK: query: DROP TABLE IF EXISTS dec +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS dec +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE dec ( + name string, + value decimal(8,4)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE dec ( + name string, + value decimal(8,4)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dec +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.txt' into TABLE dec +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@dec +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.txt' into TABLE dec +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@dec +PREHOOK: query: ANALYZE TABLE dec COMPUTE STATISTICS FOR COLUMNS value +PREHOOK: type: QUERY +PREHOOK: Input: default@dec +#### A masked pattern was here #### +POSTHOOK: query: ANALYZE TABLE dec COMPUTE STATISTICS FOR COLUMNS value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dec +#### A masked pattern was here #### +PREHOOK: query: DESC FORMATTED dec value +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@dec +POSTHOOK: query: DESC FORMATTED dec 
+POSTHOOK: query: DESC FORMATTED dec value
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@dec
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+value	decimal(8,4)	-12.25	234.79	0	6	from deserializer
+PREHOOK: query: DROP TABLE IF EXISTS avro_dec
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS avro_dec
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE avro_dec(
+  name string,
+  value decimal(5,2))
+COMMENT 'just drop the schema right into the HQL'
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE avro_dec(
+  name string,
+  value decimal(5,2))
+COMMENT 'just drop the schema right into the HQL'
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@avro_dec
+PREHOOK: query: DESC avro_dec
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@avro_dec
+POSTHOOK: query: DESC avro_dec
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@avro_dec
+name	string	from deserializer
+value	decimal(5,2)	from deserializer
+PREHOOK: query: INSERT OVERWRITE TABLE avro_dec SELECT name, value FROM dec
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dec
+PREHOOK: Output: default@avro_dec
+POSTHOOK: query: INSERT OVERWRITE TABLE avro_dec SELECT name, value FROM dec
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dec
+POSTHOOK: Output: default@avro_dec
+POSTHOOK: Lineage: avro_dec.name SIMPLE [(dec)dec.FieldSchema(name:name, type:string, comment:null), ]
+POSTHOOK: Lineage: avro_dec.value EXPRESSION [(dec)dec.FieldSchema(name:value, type:decimal(8,4), comment:null), ]
+PREHOOK: query: SELECT * FROM avro_dec
+PREHOOK: type: QUERY
+PREHOOK: Input: default@avro_dec
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM avro_dec
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@avro_dec
+#### A masked pattern was here ####
+Tom	234.79
+Beck	77.34
+Snow	55.71
+Mary	4.33
+Cluck	5.96
+Tom	-12.25
+Mary	33.33
+Tom	19
+Beck	0
+Beck	79.9
+PREHOOK: query: DROP TABLE IF EXISTS avro_dec1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS avro_dec1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE avro_dec1(
+  name string,
+  value decimal(4,1))
+COMMENT 'just drop the schema right into the HQL'
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE avro_dec1(
+  name string,
+  value decimal(4,1))
+COMMENT 'just drop the schema right into the HQL'
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@avro_dec1
+PREHOOK: query: DESC avro_dec1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@avro_dec1
+POSTHOOK: query: DESC avro_dec1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@avro_dec1
+name	string	from deserializer
+value	decimal(4,1)	from deserializer
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.avro' INTO TABLE avro_dec1
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@avro_dec1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.avro' INTO TABLE avro_dec1
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@avro_dec1
+PREHOOK: query: SELECT value FROM avro_dec1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@avro_dec1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT value FROM avro_dec1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@avro_dec1
+#### A masked pattern was here ####
+234.8
+77.3
+55.7
+4.3
+6
+12.3
+33.3
+19
+3.2
+79.9
+PREHOOK: query: DROP TABLE dec
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@dec
+PREHOOK: Output: default@dec
+POSTHOOK: query: DROP TABLE dec
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@dec
+POSTHOOK: Output: default@dec
+PREHOOK: query: DROP TABLE avro_dec
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@avro_dec
+PREHOOK: Output: default@avro_dec
+POSTHOOK: query: DROP TABLE avro_dec
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@avro_dec
+POSTHOOK: Output: default@avro_dec
+PREHOOK: query: DROP TABLE avro_dec1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@avro_dec1
+PREHOOK: Output: default@avro_dec1
+POSTHOOK: query: DROP TABLE avro_dec1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@avro_dec1
+POSTHOOK: Output: default@avro_dec1
diff --git ql/src/test/results/clientpositive/avro_joins_native.q.out ql/src/test/results/clientpositive/avro_joins_native.q.out
new file mode 100644
index 0000000..39d3c97
--- /dev/null
+++ ql/src/test/results/clientpositive/avro_joins_native.q.out
@@ -0,0 +1,92 @@
+PREHOOK: query: -- SORT_QUERY_RESULTS
+
+-- verify that new joins bring in correct schemas (including evolved schemas)
+
+CREATE TABLE doctors4 (
+  number int COMMENT "Order of playing the role",
+  first_name string COMMENT "first name of actor playing role",
+  last_name string COMMENT "last name of actor playing role")
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+
+-- verify that new joins bring in correct schemas (including evolved schemas)
+
+CREATE TABLE doctors4 (
+  number int COMMENT "Order of playing the role",
+  first_name string COMMENT "first name of actor playing role",
+  last_name string COMMENT "last name of actor playing role")
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@doctors4
+PREHOOK: query: DESCRIBE doctors4
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@doctors4
+POSTHOOK: query: DESCRIBE doctors4
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@doctors4
+number	int	from deserializer
+first_name	string	from deserializer
+last_name	string	from deserializer
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@doctors4
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@doctors4
+PREHOOK: query: CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@episodes
+PREHOOK: query: DESCRIBE episodes
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@episodes
+POSTHOOK: query: DESCRIBE episodes
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@episodes
+title	string	from deserializer
+air_date	string	from deserializer
+doctor	int	from deserializer
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@episodes
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@episodes
+PREHOOK: query: SELECT e.title, e.air_date, d.first_name, d.last_name, e.air_date
+FROM doctors4 d JOIN episodes e ON (d.number=e.doctor)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@doctors4
+PREHOOK: Input: default@episodes
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT e.title, e.air_date, d.first_name, d.last_name, e.air_date
+FROM doctors4 d JOIN episodes e ON (d.number=e.doctor)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@doctors4
+POSTHOOK: Input: default@episodes
+#### A masked pattern was here ####
+An Unearthly Child	23 November 1963	William	Hartnell	23 November 1963
+Castrolava	4 January 1982	Peter	Davison	4 January 1982
+Horror of Fang Rock	3 September 1977	Tom	Baker	3 September 1977
+Rose	26 March 2005	Christopher	Eccleston	26 March 2005
+The Doctor's Wife	14 May 2011	Matt	Smith	14 May 2011
+The Eleventh Hour	3 April 2010	Matt	Smith	3 April 2010
+The Mysterious Planet	6 September 1986	Colin	Baker	6 September 1986
+The Power of the Daleks	5 November 1966	Patrick	Troughton	5 November 1966
diff --git ql/src/test/results/clientpositive/avro_native.q.out ql/src/test/results/clientpositive/avro_native.q.out
new file mode 100644
index 0000000..68bb9c5
--- /dev/null
+++ ql/src/test/results/clientpositive/avro_native.q.out
@@ -0,0 +1,57 @@
+PREHOOK: query: -- SORT_QUERY_RESULTS
+
+-- verify that we can actually read avro files
+CREATE TABLE doctors (
+  number int,
+  first_name string,
+  last_name string)
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+
+-- verify that we can actually read avro files
+CREATE TABLE doctors (
+  number int,
+  first_name string,
+  last_name string)
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@doctors
+PREHOOK: query: DESCRIBE doctors
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@doctors
+POSTHOOK: query: DESCRIBE doctors
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@doctors
+number	int	from deserializer
+first_name	string	from deserializer
+last_name	string	from deserializer
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@doctors
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@doctors
+PREHOOK: query: SELECT * FROM doctors
+PREHOOK: type: QUERY
+PREHOOK: Input: default@doctors
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM doctors
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@doctors
+#### A masked pattern was here ####
+1	William	Hartnell
+10	David	Tennant
+11	Matt	Smith
+2	Patrick	Troughton
+3	Jon	Pertwee
+4	Tom	Baker
+5	Peter	Davison
+6	Colin	Baker
+7	Sylvester	McCoy
+8	Paul	McGann
+9	Christopher	Eccleston
diff --git ql/src/test/results/clientpositive/avro_partitioned_native.q.out ql/src/test/results/clientpositive/avro_partitioned_native.q.out
new file mode 100644
index 0000000..3af8d9e
--- /dev/null
+++ ql/src/test/results/clientpositive/avro_partitioned_native.q.out
@@ -0,0 +1,148 @@
+PREHOOK: query: -- SORT_QUERY_RESULTS
+-- Verify that table scans work with partitioned Avro tables
+CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+-- Verify that table scans work with partitioned Avro tables
+CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@episodes
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@episodes
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@episodes
+PREHOOK: query: CREATE TABLE episodes_partitioned (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+PARTITIONED BY (doctor_pt INT)
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE episodes_partitioned (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+PARTITIONED BY (doctor_pt INT)
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@episodes_partitioned
+PREHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt)
+SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes
+PREHOOK: Output: default@episodes_partitioned
+POSTHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt)
+SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=1
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=2
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=4
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=5
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=6
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=9
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+PREHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=11
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+Rose	26 March 2005	9	9
+The Doctor's Wife	14 May 2011	11	11
+The Eleventh Hour	3 April 2010	11	11
+PREHOOK: query: -- Verify that Fetch works in addition to Map
+SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=1
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=11
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=2
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=4
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=5
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=6
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: query: -- Verify that Fetch works in addition to Map
+SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=1
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=2
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=4
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=5
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+An Unearthly Child	23 November 1963	1	1
+Horror of Fang Rock	3 September 1977	4	4
+Rose	26 March 2005	9	9
+The Doctor's Wife	14 May 2011	11	11
+The Eleventh Hour	3 April 2010	11	11
+PREHOOK: query: -- Fetch w/filter to specific partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 6
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=6
+#### A masked pattern was here ####
+POSTHOOK: query: -- Fetch w/filter to specific partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 6
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6
+#### A masked pattern was here ####
+The Mysterious Planet	6 September 1986	6	6
+PREHOOK: query: -- Fetch w/non-existent partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+#### A masked pattern was here ####
+POSTHOOK: query: -- Fetch w/non-existent partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+#### A masked pattern was here ####
diff --git ql/src/test/results/clientpositive/avro_schema_evolution_native.q.out ql/src/test/results/clientpositive/avro_schema_evolution_native.q.out
new file mode 100644
index 0000000..f471601
--- /dev/null
+++ ql/src/test/results/clientpositive/avro_schema_evolution_native.q.out
@@ -0,0 +1,216 @@
+PREHOOK: query: -- SORT_QUERY_RESULTS
+-- Verify that table scans work with partitioned Avro tables
+CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+-- Verify that table scans work with partitioned Avro tables
+CREATE TABLE episodes (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@episodes
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@episodes
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@episodes
+PREHOOK: query: CREATE TABLE episodes_partitioned (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+PARTITIONED BY (doctor_pt INT)
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE episodes_partitioned (
+  title string COMMENT "episode title",
+  air_date string COMMENT "initial date",
+  doctor int COMMENT "main actor playing the Doctor in episode")
+PARTITIONED BY (doctor_pt INT)
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@episodes_partitioned
+PREHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt)
+SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes
+PREHOOK: Output: default@episodes_partitioned
+POSTHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt)
+SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=1
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=2
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=4
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=5
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=6
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=9
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+PREHOOK: query: ALTER TABLE episodes_partitioned
+SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+WITH
+SERDEPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    },
+    {
+      "name":"value",
+      "type":"int",
+      "default":0,
+      "doc":"default value"
+    }
+  ]
+}')
+PREHOOK: type: ALTERTABLE_SERIALIZER
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Output: default@episodes_partitioned
+POSTHOOK: query: ALTER TABLE episodes_partitioned
+SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+WITH
+SERDEPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    },
+    {
+      "name":"value",
+      "type":"int",
+      "default":0,
+      "doc":"default value"
+    }
+  ]
+}')
+POSTHOOK: type: ALTERTABLE_SERIALIZER
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Output: default@episodes_partitioned
+PREHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=11
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+Rose	26 March 2005	9	0	9
+The Doctor's Wife	14 May 2011	11	0	11
+The Eleventh Hour	3 April 2010	11	0	11
+PREHOOK: query: -- Verify that Fetch works in addition to Map
+SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=1
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=11
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=2
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=4
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=5
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=6
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: query: -- Verify that Fetch works in addition to Map
+SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=1
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=2
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=4
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=5
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+An Unearthly Child	23 November 1963	1	0	1
+Horror of Fang Rock	3 September 1977	4	0	4
+Rose	26 March 2005	9	0	9
+The Doctor's Wife	14 May 2011	11	0	11
+The Eleventh Hour	3 April 2010	11	0	11
+PREHOOK: query: -- Fetch w/filter to specific partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 6
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=6
+#### A masked pattern was here ####
+POSTHOOK: query: -- Fetch w/filter to specific partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 6
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6
+#### A masked pattern was here ####
+The Mysterious Planet	6 September 1986	6	0	6
+PREHOOK: query: -- Fetch w/non-existent partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+#### A masked pattern was here ####
+POSTHOOK: query: -- Fetch w/non-existent partition
+SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+#### A masked pattern was here ####
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
index 0db1243..688b072 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
@@ -262,12 +262,16 @@ private Object deserializeNullableUnion(Object datum, Schema fileSchema, Schema recordSchema,
       TypeInfo columnType) throws AvroSerdeException {
     int tag = GenericData.get().resolveUnion(recordSchema, datum); // Determine index of value
     Schema schema = recordSchema.getTypes().get(tag);
-    if(schema.getType().equals(Schema.Type.NULL)) {
+    if (schema.getType().equals(Schema.Type.NULL)) {
       return null;
     }
-    return worker(datum, fileSchema == null ? null : fileSchema.getTypes().get(tag), schema,
-        SchemaToTypeInfo.generateTypeInfo(schema));
+    Schema currentFileSchema = null;
+    if (fileSchema != null) {
+      currentFileSchema =
+          fileSchema.getType() == Type.UNION ? fileSchema.getTypes().get(tag) : fileSchema;
+    }
+    return worker(datum, currentFileSchema, schema, SchemaToTypeInfo.generateTypeInfo(schema));
   }
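The hunk above fixes schema evolution for nullable unions: the reader (table) schema can be a ["null", T] union while the file was written with plain T, so blindly indexing fileSchema.getTypes() by the union tag would fail. A standalone sketch of the guarded branch, using only the public Avro Schema API:

import org.apache.avro.Schema;

public class NullableUnionSketch {
  public static void main(String[] args) {
    Schema reader = new Schema.Parser().parse("[\"null\", \"int\"]");
    Schema writer = new Schema.Parser().parse("\"int\"");
    int tag = 1; // position of "int" inside the reader union
    Schema current = writer.getType() == Schema.Type.UNION
        ? writer.getTypes().get(tag) // pre-patch path, valid only for union files
        : writer;                    // patched path for evolved plain schemas
    System.out.println(current);     // prints "int" instead of throwing
  }
}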
columnNames = " + columnNames + ", columnTypes = " + + columnTypes); + } + + final String tableName = properties.getProperty(TABLE_NAME); + final String tableComment = properties.getProperty(TABLE_COMMENT); + TypeInfoToSchema typeInfoToSchema = new TypeInfoToSchema(); + schema = typeInfoToSchema.convert(columnNames, columnTypes, columnComments, + properties.getProperty(AvroSerdeUtils.SCHEMA_NAMESPACE), + properties.getProperty(AvroSerdeUtils.SCHEMA_NAME, tableName), + properties.getProperty(AvroSerdeUtils.SCHEMA_DOC, tableComment)); + + properties.setProperty(AvroSerdeUtils.SCHEMA_LITERAL, schema.toString()); + } + + LOG.info("Avro schema is " + schema); + + if (configuration == null) { LOG.info("Configuration null, not inserting schema"); } else { configuration.set(AvroSerdeUtils.AVRO_SERDE_SCHEMA, schema.toString(false)); diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerdeUtils.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerdeUtils.java index 4564e75..8c5cf3e 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerdeUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerdeUtils.java @@ -49,6 +49,9 @@ public static final String SCHEMA_LITERAL = "avro.schema.literal"; public static final String SCHEMA_URL = "avro.schema.url"; public static final String SCHEMA_NONE = "none"; + public static final String SCHEMA_NAMESPACE = "avro.schema.namespace"; + public static final String SCHEMA_NAME = "avro.schema.name"; + public static final String SCHEMA_DOC = "avro.schema.doc"; public static final String EXCEPTION_MESSAGE = "Neither " + SCHEMA_LITERAL + " nor " + SCHEMA_URL + " specified, can't determine table schema"; public static final String AVRO_SERDE_SCHEMA = "avro.serde.schema"; diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java new file mode 100644 index 0000000..915f016 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
new file mode 100644
index 0000000..915f016
--- /dev/null
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.avro;
+
+import org.apache.avro.Schema;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Convert Hive TypeInfo to an Avro Schema
+ */
+public class TypeInfoToSchema {
+
+  private static final Schema.Parser PARSER = new Schema.Parser();
+  private long recordCounter = 0;
+
+  /**
+   * Converts Hive schema to avro schema
+   *
+   * @param columnNames Names of the hive columns
+   * @param columnTypes Hive Column types
+   * @param namespace   Namespace of Avro schema
+   * @param name        Avro schema name
+   * @param doc         Avro schema doc
+   * @return Avro Schema
+   */
+  public Schema convert(List<String> columnNames, List<TypeInfo> columnTypes,
+      List<String> columnComments, String namespace, String name, String doc) {
+
+    List<Schema.Field> fields = new ArrayList<Schema.Field>();
+    for (int i = 0; i < columnNames.size(); ++i) {
+      final String comment = columnComments.size() > i ? columnComments.get(i) : null;
+      final Schema.Field avroField = createAvroField(columnNames.get(i), columnTypes.get(i),
+          comment);
+      fields.addAll(getFields(avroField));
+    }
+
+    if (name == null || name.isEmpty()) {
+      name = "baseRecord";
+    }
+
+    Schema avroSchema = Schema.createRecord(name, doc, namespace, false);
+    avroSchema.setFields(fields);
+    return avroSchema;
+  }
+
+  private Schema.Field createAvroField(String name, TypeInfo typeInfo, String comment) {
+    return new Schema.Field(name, createAvroSchema(typeInfo), comment, null);
+  }
+
+  private Schema createAvroSchema(TypeInfo typeInfo) {
+    Schema schema = null;
+    switch (typeInfo.getCategory()) {
+      case PRIMITIVE:
+        schema = createAvroPrimitive(typeInfo);
+        break;
+      case LIST:
+        schema = createAvroArray(typeInfo);
+        break;
+      case MAP:
+        schema = createAvroMap(typeInfo);
+        break;
+      case STRUCT:
+        schema = createAvroRecord(typeInfo);
+        break;
+      case UNION:
+        schema = createAvroUnion(typeInfo);
+        break;
+    }
+
+    return wrapInUnionWithNull(schema);
+  }
+
+  private Schema createAvroPrimitive(TypeInfo typeInfo) {
+    PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo;
+    Schema schema;
+    switch (primitiveTypeInfo.getPrimitiveCategory()) {
+      case STRING:
+        schema = Schema.create(Schema.Type.STRING);
+        break;
+      case CHAR:
+        schema = Schema.create(Schema.Type.STRING);
+        break;
+      case VARCHAR:
+        schema = Schema.create(Schema.Type.STRING);
+        break;
+      case BINARY:
+        schema = Schema.create(Schema.Type.BYTES);
+        break;
+      case BYTE:
+        schema = Schema.create(Schema.Type.INT);
+        break;
+      case SHORT:
+        schema = Schema.create(Schema.Type.INT);
+        break;
+      case INT:
+        schema = Schema.create(Schema.Type.INT);
+        break;
+      case LONG:
+        schema = Schema.create(Schema.Type.LONG);
+        break;
+      case FLOAT:
+        schema = Schema.create(Schema.Type.FLOAT);
+        break;
+      case DOUBLE:
+        schema = Schema.create(Schema.Type.DOUBLE);
+        break;
+      case BOOLEAN:
+        schema = Schema.create(Schema.Type.BOOLEAN);
+        break;
+      case DECIMAL:
+        DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;
+        String precision = String.valueOf(decimalTypeInfo.precision());
+        String scale = String.valueOf(decimalTypeInfo.scale());
+        schema = PARSER.parse("{" +
+            "\"type\":\"bytes\"," +
+            "\"logicalType\":\"decimal\"," +
+            "\"precision\":" + precision + "," +
+            "\"scale\":" + scale + "}");
+        break;
+      case VOID:
+        schema = Schema.create(Schema.Type.NULL);
+        break;
+      default:
+        throw new UnsupportedOperationException(typeInfo + " is not supported.");
+    }
+    return schema;
+  }
+
+  private Schema createAvroUnion(TypeInfo typeInfo) {
+    List<Schema> childSchemas = new ArrayList<Schema>();
+    for (TypeInfo childTypeInfo : ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos()) {
+      final Schema childSchema = createAvroSchema(childTypeInfo);
+      if (childSchema.getType() == Schema.Type.UNION) {
+        childSchemas.addAll(childSchema.getTypes());
+      } else {
+        childSchemas.add(childSchema);
+      }
+    }
+
+    return Schema.createUnion(removeDuplicateNullSchemas(childSchemas));
+  }
+
+  private Schema createAvroRecord(TypeInfo typeInfo) {
+    List<Schema.Field> childFields = new ArrayList<Schema.Field>();
+
+    final List<String> allStructFieldNames =
+        ((StructTypeInfo) typeInfo).getAllStructFieldNames();
+    final List<TypeInfo> allStructFieldTypeInfos =
+        ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos();
+    if (allStructFieldNames.size() != allStructFieldTypeInfos.size()) {
+      throw new IllegalArgumentException("Failed to generate avro schema from hive schema. " +
+          "name and column type differs. names = " + allStructFieldNames + ", types = " +
+          allStructFieldTypeInfos);
+    }
+
+    for (int i = 0; i < allStructFieldNames.size(); ++i) {
+      final TypeInfo childTypeInfo = allStructFieldTypeInfos.get(i);
+      final Schema.Field grandChildSchemaField = createAvroField(allStructFieldNames.get(i),
+          childTypeInfo, childTypeInfo.toString());
+      final List<Schema.Field> grandChildFields = getFields(grandChildSchemaField);
+      childFields.addAll(grandChildFields);
+    }
+
+    Schema recordSchema = Schema.createRecord("record_" + recordCounter, typeInfo.toString(),
+        null, false);
+    ++recordCounter;
+    recordSchema.setFields(childFields);
+    return recordSchema;
+  }
+
+  private Schema createAvroMap(TypeInfo typeInfo) {
+    TypeInfo keyTypeInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
+    if (((PrimitiveTypeInfo) keyTypeInfo).getPrimitiveCategory()
+        != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
+      throw new UnsupportedOperationException("Key of Map can only be a String");
+    }
+
+    TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
+    Schema valueSchema = createAvroSchema(valueTypeInfo);
+
+    return Schema.createMap(valueSchema);
+  }
+
+  private Schema createAvroArray(TypeInfo typeInfo) {
+    ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo;
+    Schema listSchema = createAvroSchema(listTypeInfo.getListElementTypeInfo());
+    return Schema.createArray(listSchema);
+  }
+
+  private List<Schema.Field> getFields(Schema.Field schemaField) {
+    List<Schema.Field> fields = new ArrayList<Schema.Field>();
+
+    if (schemaField.schema().getType() == Schema.Type.RECORD) {
+      for (Schema.Field field : schemaField.schema().getFields()) {
+        fields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null));
+      }
+    } else {
+      fields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(),
+          null));
+    }
+
+    return fields;
+  }
+
+  private Schema wrapInUnionWithNull(Schema schema) {
+    Schema wrappedSchema = schema;
+    switch (schema.getType()) {
+      case NULL:
+        break;
+      case UNION:
+        List<Schema> existingSchemas = removeDuplicateNullSchemas(schema.getTypes());
+        wrappedSchema = Schema.createUnion(existingSchemas);
+        break;
+      default:
+  private Schema wrapInUnionWithNull(Schema schema) {
+    Schema wrappedSchema = schema;
+    switch (schema.getType()) {
+      case NULL:
+        break;
+      case UNION:
+        List<Schema> existingSchemas = removeDuplicateNullSchemas(schema.getTypes());
+        wrappedSchema = Schema.createUnion(existingSchemas);
+        break;
+      default:
+        wrappedSchema = Schema.createUnion(
+            Arrays.asList(Schema.create(Schema.Type.NULL), schema));
+    }
+
+    return wrappedSchema;
+  }
+
+  private List<Schema> removeDuplicateNullSchemas(List<Schema> childSchemas) {
+    List<Schema> prunedSchemas = new ArrayList<Schema>();
+    boolean isNullPresent = false;
+    for (Schema schema : childSchemas) {
+      if (schema.getType() == Schema.Type.NULL) {
+        isNullPresent = true;
+      } else {
+        prunedSchemas.add(schema);
+      }
+    }
+    if (isNullPresent) {
+      prunedSchemas.add(0, Schema.create(Schema.Type.NULL));
+    }
+
+    return prunedSchemas;
+  }
+}
\ No newline at end of file
diff --git serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
new file mode 100644
index 0000000..722bdf9
--- /dev/null
+++ serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
@@ -0,0 +1,371 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.avro;
+
+import com.google.common.io.Resources;
+import org.junit.Assert;
+import org.apache.avro.Schema;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
+import org.apache.log4j.Logger;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class TestTypeInfoToSchema {
+
+  private static final Logger LOGGER = Logger.getLogger(TestTypeInfoToSchema.class);
+  private static final List<String> COLUMN_NAMES = Arrays.asList("testCol");
+  private static final TypeInfo STRING = TypeInfoFactory.getPrimitiveTypeInfo("string");
+  private static final TypeInfo INT = TypeInfoFactory.getPrimitiveTypeInfo("int");
+  private static final TypeInfo BOOLEAN = TypeInfoFactory.getPrimitiveTypeInfo("boolean");
+  private static final TypeInfo LONG = TypeInfoFactory.getPrimitiveTypeInfo("bigint");
+  private static final TypeInfo FLOAT = TypeInfoFactory.getPrimitiveTypeInfo("float");
+  private static final TypeInfo DOUBLE = TypeInfoFactory.getPrimitiveTypeInfo("double");
+  private static final TypeInfo BINARY = TypeInfoFactory.getPrimitiveTypeInfo("binary");
+  private static final TypeInfo BYTE = TypeInfoFactory.getPrimitiveTypeInfo("tinyint");
+  private static final TypeInfo SHORT = TypeInfoFactory.getPrimitiveTypeInfo("smallint");
+  private static final TypeInfo VOID = TypeInfoFactory.getPrimitiveTypeInfo("void");
+  private static final int PRECISION = 4;
+  private static final int SCALE = 2;
+  private static final TypeInfo DECIMAL = TypeInfoFactory.getPrimitiveTypeInfo(
+      new DecimalTypeInfo(PRECISION, SCALE).getQualifiedName());
+  private static final int CHAR_LEN = 5;
+  private static final TypeInfo CHAR = TypeInfoFactory.getPrimitiveTypeInfo(
+      new CharTypeInfo(CHAR_LEN).getQualifiedName());
+  private static final TypeInfo VARCHAR = TypeInfoFactory.getPrimitiveTypeInfo(
+      new VarcharTypeInfo(CHAR_LEN).getQualifiedName());
+
+  private TypeInfoToSchema typeInfoToSchema;
+
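+  // Helpers below build the expected schema strings: getAvroSchemaString runs the
+  // converter on a single column, and genSchema wraps the type-specific JSON in
+  // ["null", ...] to mirror wrapInUnionWithNull.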
TypeInfoFactory.getPrimitiveTypeInfo("smallint"); + private static final TypeInfo VOID = TypeInfoFactory.getPrimitiveTypeInfo("void"); + private static final int PRECISION = 4; + private static final int SCALE = 2; + private static final TypeInfo DECIMAL = TypeInfoFactory.getPrimitiveTypeInfo( + new DecimalTypeInfo(PRECISION, SCALE).getQualifiedName()); + private static final int CHAR_LEN = 5; + private static final TypeInfo CHAR = TypeInfoFactory.getPrimitiveTypeInfo( + new CharTypeInfo(CHAR_LEN).getQualifiedName()); + private static final TypeInfo VARCHAR = TypeInfoFactory.getPrimitiveTypeInfo( + new VarcharTypeInfo(CHAR_LEN).getQualifiedName()); + + private TypeInfoToSchema typeInfoToSchema; + + + private String getAvroSchemaString(TypeInfo columnType) { + return typeInfoToSchema.convert( + COLUMN_NAMES, + Arrays.asList(columnType), + Arrays.asList(""), + "org.apache.hive.avro.testing", + "avrotest", + "This is to test hive-avro").toString(); + } + + private String genSchemaWithoutNull(String specificSchema) { + return "{" + + "\"type\":\"record\"," + + "\"name\":\"avrotest\"," + + "\"namespace\":\"org.apache.hive.avro.testing\"," + + "\"doc\":\"This is to test hive-avro\"," + + "\"fields\":[" + + "{\"name\":\"testCol\"," + + "\"type\":" + specificSchema + "," + + "\"doc\":\"\"}" + + "]}"; + } + + private String genSchema(String specificSchema) { + specificSchema = "[\"null\"," + specificSchema + "]"; + return genSchemaWithoutNull(specificSchema); + } + + @Before + public void setUp() { + typeInfoToSchema = new TypeInfoToSchema(); + } + + @Test + public void createAvroStringSchema() { + final String specificSchema = "\"string\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for string's avro schema failed", + expectedSchema, getAvroSchemaString(STRING)); + } + + @Test + public void createAvroBinarySchema() { + final String specificSchema = "\"bytes\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for binary's avro schema failed", + expectedSchema, getAvroSchemaString(BINARY)); + } + + @Test + public void createAvroBytesSchema() { + final String specificSchema = "\"int\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for bytes's avro schema failed", + expectedSchema, getAvroSchemaString(BYTE)); + } + + @Test + public void createAvroShortSchema() { + final String specificSchema = "\"int\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for short's avro schema failed", + expectedSchema, getAvroSchemaString(SHORT)); + } + + @Test + public void createAvroIntSchema() { + final String specificSchema = "\"int\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for int's avro schema failed", + expectedSchema, getAvroSchemaString(INT)); + } + + @Test + public void createAvroLongSchema() { + final String specificSchema = "\"long\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for long's avro schema failed", + expectedSchema, getAvroSchemaString(LONG)); + } + + @Test + public void createAvroFloatSchema() { + final String specificSchema = "\"float\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for float's avro schema failed", + expectedSchema, getAvroSchemaString(FLOAT)); + } + + @Test + public void createAvroDoubleSchema() { + final String specificSchema = "\"double\""; + String expectedSchema = genSchema(specificSchema); + + 
Assert.assertEquals("Test for double's avro schema failed", + expectedSchema, getAvroSchemaString(DOUBLE)); + } + + @Test + public void createAvroBooleanSchema() { + final String specificSchema = "\"boolean\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for boolean's avro schema failed", + expectedSchema, getAvroSchemaString(BOOLEAN)); + } + + @Test + public void createAvroVoidSchema() { + final String specificSchema = "\"null\""; + String expectedSchema = genSchemaWithoutNull(specificSchema); + + Assert.assertEquals("Test for void's avro schema failed", + expectedSchema, getAvroSchemaString(VOID)); + } + + @Test + public void createAvroDecimalSchema() { + final String specificSchema = "{" + + "\"type\":\"bytes\"," + + "\"logicalType\":\"decimal\"," + + "\"precision\":" + PRECISION + "," + + "\"scale\":" + SCALE + "}"; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for decimal's avro schema failed", + expectedSchema, getAvroSchemaString(DECIMAL)); + } + + @Test + public void createAvroListSchema() { + ListTypeInfo listTypeInfo = new ListTypeInfo(); + listTypeInfo.setListElementTypeInfo(STRING); + + final String specificSchema = Schema.createArray(Schema.createUnion(Arrays.asList( + Schema.create(Schema.Type.NULL), + Schema.create(Schema.Type.STRING)))).toString(); + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for list's avro schema failed", + expectedSchema, getAvroSchemaString(listTypeInfo)); + } + + @Test + public void createAvroMapSchema() { + MapTypeInfo mapTypeInfo = new MapTypeInfo(); + mapTypeInfo.setMapKeyTypeInfo(STRING); + mapTypeInfo.setMapValueTypeInfo(INT); + + final String specificSchema = Schema.createMap(Schema.createUnion(Arrays.asList( + Schema.create(Schema.Type.NULL), + Schema.create(Schema.Type.INT)))).toString(); + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for map's avro schema failed", + expectedSchema, getAvroSchemaString(mapTypeInfo)); + } + + @Test + public void createAvroUnionSchema() { + UnionTypeInfo unionTypeInfo = new UnionTypeInfo(); + unionTypeInfo.setAllUnionObjectTypeInfos(Arrays.asList(INT, FLOAT, STRING)); + + final String specificSchema = Schema.createUnion( + Arrays.asList( + Schema.create(Schema.Type.NULL), + Schema.create(Schema.Type.INT), + Schema.create(Schema.Type.FLOAT), + Schema.create(Schema.Type.STRING))).toString(); + String expectedSchema = genSchemaWithoutNull(specificSchema); + + Assert.assertEquals("Test for union's avro schema failed", + expectedSchema, getAvroSchemaString(unionTypeInfo)); + } + + @Test + public void createAvroUnionSchemaOfNull() { + UnionTypeInfo unionTypeInfo = new UnionTypeInfo(); + unionTypeInfo.setAllUnionObjectTypeInfos(Arrays.asList(VOID)); + + final String specificSchema = Schema.createUnion( + Arrays.asList( + Schema.create(Schema.Type.NULL))).toString(); + String expectedSchema = genSchemaWithoutNull(specificSchema); + + Assert.assertEquals("Test for union's avro schema failed", + expectedSchema, getAvroSchemaString(unionTypeInfo)); + } + + @Test + public void createAvroUnionSchemaOfOne() { + UnionTypeInfo unionTypeInfo = new UnionTypeInfo(); + unionTypeInfo.setAllUnionObjectTypeInfos(Arrays.asList(STRING)); + + final String specificSchema = Schema.createUnion( + Arrays.asList( + Schema.create(Schema.Type.NULL), + Schema.create(Schema.Type.STRING))).toString(); + String expectedSchema = genSchemaWithoutNull(specificSchema); + + Assert.assertEquals("Test for 
+  @Test
+  public void createAvroStructSchema() throws IOException {
+    StructTypeInfo structTypeInfo = new StructTypeInfo();
+    ArrayList<String> names = new ArrayList<String>();
+    names.add("field1");
+    names.add("field2");
+    names.add("field3");
+    names.add("field4");
+    names.add("field5");
+    names.add("field6");
+    names.add("field7");
+    names.add("field8");
+    names.add("field9");
+    names.add("field10");
+    names.add("field11");
+    names.add("field12");
+    names.add("field13");
+    structTypeInfo.setAllStructFieldNames(names);
+    ArrayList<TypeInfo> typeInfos = new ArrayList<TypeInfo>();
+    typeInfos.add(STRING);
+    typeInfos.add(CHAR);
+    typeInfos.add(VARCHAR);
+    typeInfos.add(BINARY);
+    typeInfos.add(BYTE);
+    typeInfos.add(SHORT);
+    typeInfos.add(INT);
+    typeInfos.add(LONG);
+    typeInfos.add(FLOAT);
+    typeInfos.add(DOUBLE);
+    typeInfos.add(BOOLEAN);
+    typeInfos.add(DECIMAL);
+    typeInfos.add(VOID);
+    structTypeInfo.setAllStructFieldTypeInfos(typeInfos);
+    LOGGER.info("structTypeInfo is " + structTypeInfo);
+
+    final String specificSchema = IOUtils.toString(Resources.getResource("avro-struct.avsc")
+        .openStream()).replace("\n", "");
+    String expectedSchema = genSchema(specificSchema);
+
+    Assert.assertEquals("Test for struct's avro schema failed",
+        expectedSchema, getAvroSchemaString(structTypeInfo));
+  }
+
+  @Test
+  public void createAvroNestedStructSchema() throws IOException {
+    StructTypeInfo structTypeInfo = new StructTypeInfo();
+    ArrayList<String> names = new ArrayList<String>();
+    names.add("field1");
+    names.add("field2");
+    structTypeInfo.setAllStructFieldNames(names);
+    ArrayList<TypeInfo> typeInfos = new ArrayList<TypeInfo>();
+    typeInfos.add(STRING);
+    typeInfos.add(INT);
+    structTypeInfo.setAllStructFieldTypeInfos(typeInfos);
+
+    StructTypeInfo superStructTypeInfo = new StructTypeInfo();
+    ArrayList<String> superNames = new ArrayList<String>();
+    superNames.add("superfield1");
+    superNames.add("superfield2");
+    superStructTypeInfo.setAllStructFieldNames(superNames);
+    ArrayList<TypeInfo> superTypeInfos = new ArrayList<TypeInfo>();
+    superTypeInfos.add(STRING);
+    superTypeInfos.add(structTypeInfo);
+    superStructTypeInfo.setAllStructFieldTypeInfos(superTypeInfos);
+
+    final String specificSchema = IOUtils.toString(Resources.getResource("avro-nested-struct.avsc")
+        .openStream()).replace("\n", "");
+    String expectedSchema = genSchema(specificSchema);
+    Assert.assertEquals("Test for nested struct's avro schema failed",
+        expectedSchema, getAvroSchemaString(superStructTypeInfo));
+  }
+}
\ No newline at end of file
diff --git serde/src/test/resources/avro-nested-struct.avsc serde/src/test/resources/avro-nested-struct.avsc
new file mode 100644
index 0000000..785af83
--- /dev/null
+++ serde/src/test/resources/avro-nested-struct.avsc
@@ -0,0 +1,19 @@
+{
+"type":"record",
+"name":"record_1",
+"namespace":"",
+"doc":"struct<superfield1:string,superfield2:struct<field1:string,field2:int>>",
+"fields":
+[
+{"name":"superfield1","type":["null","string"],"doc":"string"},
+{"name":"superfield2","type":["null",{"type":"record","name":"record_0", +"doc":"struct", +"fields": +[ +{"name":"field1","type":["null","string"],"doc":"string"}, +{"name":"field2","type":["null","int"],"doc":"int"} +] +} +], +"doc":"struct"}] +} \ No newline at end of file diff --git serde/src/test/resources/avro-struct.avsc serde/src/test/resources/avro-struct.avsc new file mode 100644 index 0000000..313c74f --- /dev/null +++ serde/src/test/resources/avro-struct.avsc @@ -0,0 +1,24 @@ +{ +"type":"record", +"name":"record_0", +"namespace":"", +"doc":"struct", +"fields":[ +{"name":"field1","type":["null","string"],"doc":"string"}, +{"name":"field2","type":["null","string"],"doc":"char(5)"}, +{"name":"field3","type":["null","string"],"doc":"varchar(5)"}, +{"name":"field4","type":["null","bytes"],"doc":"binary"}, +{"name":"field5","type":["null","int"],"doc":"tinyint"}, +{"name":"field6","type":["null","int"],"doc":"smallint"}, +{"name":"field7","type":["null","int"],"doc":"int"}, +{"name":"field8","type":["null","long"],"doc":"bigint"}, +{"name":"field9","type":["null","float"],"doc":"float"}, +{"name":"field10","type":["null","double"],"doc":"double"}, +{"name":"field11","type":["null","boolean"],"doc":"boolean"}, +{"name":"field12","type":["null",{"type":"bytes","logicalType":"decimal","precision":4, +"scale":2}],"doc":"decimal(4,2)"}, +{"name":"field13","type":"null","doc":"void"} +] +} \ No newline at end of file