diff --git ql/src/java/org/apache/hadoop/hive/ql/io/AvroStorageFormatDescriptor.java ql/src/java/org/apache/hadoop/hive/ql/io/AvroStorageFormatDescriptor.java new file mode 100644 index 0000000..f8f91af --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/AvroStorageFormatDescriptor.java @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + + +import com.google.common.collect.ImmutableSet; +import org.apache.hadoop.hive.ql.io.AbstractStorageFormatDescriptor; +import org.apache.hadoop.hive.ql.io.IOConstants; +import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat; +import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat; +import org.apache.hadoop.hive.serde2.avro.AvroSerDe; + +import java.util.Set; + +public class AvroStorageFormatDescriptor extends AbstractStorageFormatDescriptor { + @Override + public Set<String> getNames() { + return ImmutableSet.of(IOConstants.AVRO); + } + @Override + public String getInputFormat() { + return AvroContainerInputFormat.class.getName(); + } + @Override + public String getOutputFormat() { + return AvroContainerOutputFormat.class.getName(); + } + @Override + public String getSerde() { + return AvroSerDe.class.getName(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java index 1bae0a8..9add316 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/IOConstants.java @@ -33,6 +33,7 @@ public static final String ORCFILE = "ORCFILE"; public static final String PARQUET = "PARQUET"; public static final String PARQUETFILE = "PARQUETFILE"; + public static final String AVRO = "AVRO"; @VisibleForTesting public static final String CUSTOM_TEXT_SERDE = "CustomTextSerde"; diff --git ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor index a23ff11..d858a95 100644 --- ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor +++ ql/src/main/resources/META-INF/services/org.apache.hadoop.hive.ql.io.StorageFormatDescriptor @@ -3,3 +3,4 @@ org.apache.hadoop.hive.ql.io.SequenceFileStorageFormatDescriptor org.apache.hadoop.hive.ql.io.RCFileStorageFormatDescriptor org.apache.hadoop.hive.ql.io.ORCFileStorageFormatDescriptor org.apache.hadoop.hive.ql.io.ParquetFileStorageFormatDescriptor +org.apache.hadoop.hive.ql.io.AvroStorageFormatDescriptor diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java index d53ebc6..bdcd8c0 100644 ---
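The three changes above are all it takes to register the new format name: a descriptor, an `IOConstants` entry, and a META-INF/services line. Below is a rough sketch of the `ServiceLoader`-based lookup that such a registration typically enables, assuming Hive resolves `STORED AS <name>` through discovered `StorageFormatDescriptor` implementations; the `DescriptorLookupSketch` class is illustrative, not part of the patch:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.ServiceLoader;

import org.apache.hadoop.hive.ql.io.StorageFormatDescriptor;

// Illustrative only: resolves a format name such as "AVRO" to its descriptor
// via the META-INF/services registration added in this patch.
public class DescriptorLookupSketch {
  public static void main(String[] args) {
    Map<String, StorageFormatDescriptor> byName =
        new HashMap<String, StorageFormatDescriptor>();
    // Each descriptor registers itself under one or more names (e.g. "AVRO").
    for (StorageFormatDescriptor descriptor :
        ServiceLoader.load(StorageFormatDescriptor.class)) {
      for (String name : descriptor.getNames()) {
        byName.put(name, descriptor);
      }
    }
    StorageFormatDescriptor avro = byName.get("AVRO");
    // For AVRO these should name AvroContainerInputFormat,
    // AvroContainerOutputFormat and AvroSerDe respectively.
    System.out.println(avro.getInputFormat());
    System.out.println(avro.getOutputFormat());
    System.out.println(avro.getSerde());
  }
}
```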
ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/TestStorageFormatDescriptor.java @@ -38,5 +38,7 @@ public void testNames() { (new ORCFileStorageFormatDescriptor()).getNames()); Assert.assertEquals(Sets.newHashSet(IOConstants.PARQUET, IOConstants.PARQUETFILE), (new ParquetFileStorageFormatDescriptor()).getNames()); + Assert.assertEquals(Sets.newHashSet(IOConstants.AVRO), + (new AvroStorageFormatDescriptor()).getNames()); } } diff --git ql/src/test/queries/clientpositive/avro_compression_enabled_native.q ql/src/test/queries/clientpositive/avro_compression_enabled_native.q new file mode 100644 index 0000000..c579633 --- /dev/null +++ ql/src/test/queries/clientpositive/avro_compression_enabled_native.q @@ -0,0 +1,14 @@ +-- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 +(number int, +first_name string, +last_name string, +extra_field string) +STORED AS AVRO; + +LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4; + +set hive.exec.compress.output=true; + +select count(*) from src; \ No newline at end of file diff --git ql/src/test/queries/clientpositive/avro_decimal_native.q ql/src/test/queries/clientpositive/avro_decimal_native.q new file mode 100644 index 0000000..0da1b2a --- /dev/null +++ ql/src/test/queries/clientpositive/avro_decimal_native.q @@ -0,0 +1,40 @@ +DROP TABLE IF EXISTS dec; + +CREATE TABLE dec(name string, value decimal(8,4)); + +LOAD DATA LOCAL INPATH '../../data/files/dec.txt' into TABLE dec; + +ANALYZE TABLE dec COMPUTE STATISTICS FOR COLUMNS value; +DESC FORMATTED dec value; + +DROP TABLE IF EXISTS avro_dec; + +CREATE TABLE avro_dec( + name string, + value decimal(5,2)) +COMMENT 'just drop the schema right into the HQL' +STORED AS AVRO; + +DESC avro_dec; + +INSERT OVERWRITE TABLE avro_dec select name, value from dec; + +SELECT * FROM avro_dec; + +DROP TABLE IF EXISTS avro_dec1; + +CREATE TABLE avro_dec1( + name string, + value decimal(4,1)) +COMMENT 'just drop the schema right into the HQL' +STORED AS AVRO; + +DESC avro_dec1; + +LOAD DATA LOCAL INPATH '../../data/files/dec.avro' into TABLE avro_dec1; + +select value from avro_dec1; + +DROP TABLE dec; +DROP TABLE avro_dec; +DROP TABLE avro_dec1; diff --git ql/src/test/queries/clientpositive/avro_joins_native.q ql/src/test/queries/clientpositive/avro_joins_native.q new file mode 100644 index 0000000..8370212 --- /dev/null +++ ql/src/test/queries/clientpositive/avro_joins_native.q @@ -0,0 +1,26 @@ +-- SORT_QUERY_RESULTS + +-- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 +(number int COMMENT "Order of playing the role", + first_name string COMMENT "first name of actor playing role", + last_name string COMMENT "last name of actor playing role") +STORED AS AVRO; + +DESCRIBE doctors4; + +LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4; + +CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO; + +DESCRIBE episodes; + +LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes; + +SELECT e.title, e.air_date, d.first_name, d.last_name, e.air_date +FROM doctors4 d JOIN episodes e ON (d.number=e.doctor); \ No newline at end of file diff --git ql/src/test/queries/clientpositive/avro_native.q ql/src/test/queries/clientpositive/avro_native.q new file mode 100644 index 
0000000..ee32f52 --- /dev/null +++ ql/src/test/queries/clientpositive/avro_native.q @@ -0,0 +1,14 @@ +-- SORT_QUERY_RESULTS + +-- verify that we can actually read avro files +CREATE TABLE doctors +(number int, +first_name string, +last_name string) +STORED AS AVRO; + +DESCRIBE doctors; + +LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors; + +SELECT * FROM doctors; \ No newline at end of file diff --git ql/src/test/queries/clientpositive/avro_partitioned_native.q ql/src/test/queries/clientpositive/avro_partitioned_native.q new file mode 100644 index 0000000..d005e7e --- /dev/null +++ ql/src/test/queries/clientpositive/avro_partitioned_native.q @@ -0,0 +1,28 @@ +-- SORT_QUERY_RESULTS +-- Verify that table scans work with partitioned Avro tables +CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO; + +LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes; + +CREATE TABLE episodes_partitioned +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +PARTITIONED BY (doctor_pt INT) +STORED AS AVRO; + +SET hive.exec.dynamic.partition.mode=nonstrict; +INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes; + +SELECT * FROM episodes_partitioned WHERE doctor_pt > 6; + +-- Verify that Fetch works in addition to Map +SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5; +-- Fetch w/filter to specific partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 6; +-- Fetch w/non-existent partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5; \ No newline at end of file diff --git ql/src/test/queries/clientpositive/avro_schema_evolution_native.q ql/src/test/queries/clientpositive/avro_schema_evolution_native.q new file mode 100644 index 0000000..b6ee16e --- /dev/null +++ ql/src/test/queries/clientpositive/avro_schema_evolution_native.q @@ -0,0 +1,61 @@ +-- SORT_QUERY_RESULTS +-- Verify that table scans work with partitioned Avro tables +CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO; + +LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes; + +CREATE TABLE episodes_partitioned +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +PARTITIONED BY (doctor_pt INT) +STORED AS AVRO; + +SET hive.exec.dynamic.partition.mode=nonstrict; +INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes; + +ALTER TABLE episodes_partitioned +SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' +WITH +SERDEPROPERTIES ('avro.schema.literal'='{ + "namespace": "testing.hive.avro.serde", + "name": "episodes", + "type": "record", + "fields": [ + { + "name":"title", + "type":"string", + "doc":"episode title" + }, + { + "name":"air_date", + "type":"string", + "doc":"initial date" + }, + { + "name":"doctor", + "type":"int", + "doc":"main actor playing the Doctor in episode" + }, + { + "name":"value", + "type":"int", + "default":0, + "doc":"default value" + } + ] +}'); + + +SELECT * FROM episodes_partitioned WHERE doctor_pt > 6; + +-- 
Verify that Fetch works in addition to Map +SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5; +-- Fetch w/filter to specific partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 6; +-- Fetch w/non-existent partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5; \ No newline at end of file diff --git ql/src/test/results/clientpositive/avro_compression_enabled_native.q.out ql/src/test/results/clientpositive/avro_compression_enabled_native.q.out new file mode 100644 index 0000000..08a1944 --- /dev/null +++ ql/src/test/results/clientpositive/avro_compression_enabled_native.q.out @@ -0,0 +1,38 @@ +PREHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 +(number int, +first_name string, +last_name string, +extra_field string) +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 +(number int, +first_name string, +last_name string, +extra_field string) +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@doctors4 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@doctors4 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@doctors4 +PREHOOK: query: select count(*) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 diff --git ql/src/test/results/clientpositive/avro_decimal_native.q.out ql/src/test/results/clientpositive/avro_decimal_native.q.out new file mode 100644 index 0000000..2d7a7f7 --- /dev/null +++ ql/src/test/results/clientpositive/avro_decimal_native.q.out @@ -0,0 +1,168 @@ +PREHOOK: query: DROP TABLE IF EXISTS dec +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS dec +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE dec(name string, value decimal(8,4)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE dec(name string, value decimal(8,4)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dec +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.txt' into TABLE dec +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@dec +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.txt' into TABLE dec +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@dec +PREHOOK: query: ANALYZE TABLE dec COMPUTE STATISTICS FOR COLUMNS value +PREHOOK: type: QUERY +PREHOOK: Input: default@dec +#### A masked pattern was here #### +POSTHOOK: query: ANALYZE TABLE dec COMPUTE STATISTICS FOR COLUMNS value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dec +#### A masked pattern was here #### +PREHOOK: query: DESC FORMATTED dec value +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@dec +POSTHOOK: query: DESC FORMATTED dec value +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@dec +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +value 
decimal(8,4) -12.25 234.79 0 6 from deserializer +PREHOOK: query: DROP TABLE IF EXISTS avro_dec +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS avro_dec +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE avro_dec( + name string, + value decimal(5,2)) +COMMENT 'just drop the schema right into the HQL' +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE avro_dec( + name string, + value decimal(5,2)) +COMMENT 'just drop the schema right into the HQL' +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@avro_dec +PREHOOK: query: DESC avro_dec +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@avro_dec +POSTHOOK: query: DESC avro_dec +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@avro_dec +name string from deserializer +value decimal(5,2) from deserializer +PREHOOK: query: INSERT OVERWRITE TABLE avro_dec select name, value from dec +PREHOOK: type: QUERY +PREHOOK: Input: default@dec +PREHOOK: Output: default@avro_dec +POSTHOOK: query: INSERT OVERWRITE TABLE avro_dec select name, value from dec +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dec +POSTHOOK: Output: default@avro_dec +POSTHOOK: Lineage: avro_dec.name SIMPLE [(dec)dec.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: avro_dec.value EXPRESSION [(dec)dec.FieldSchema(name:value, type:decimal(8,4), comment:null), ] +PREHOOK: query: SELECT * FROM avro_dec +PREHOOK: type: QUERY +PREHOOK: Input: default@avro_dec +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM avro_dec +POSTHOOK: type: QUERY +POSTHOOK: Input: default@avro_dec +#### A masked pattern was here #### +Tom 234.79 +Beck 77.34 +Snow 55.71 +Mary 4.33 +Cluck 5.96 +Tom -12.25 +Mary 33.33 +Tom 19 +Beck 0 +Beck 79.9 +PREHOOK: query: DROP TABLE IF EXISTS avro_dec1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS avro_dec1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE avro_dec1( + name string, + value decimal(4,1)) +COMMENT 'just drop the schema right into the HQL' +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE avro_dec1( + name string, + value decimal(4,1)) +COMMENT 'just drop the schema right into the HQL' +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@avro_dec1 +PREHOOK: query: DESC avro_dec1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@avro_dec1 +POSTHOOK: query: DESC avro_dec1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@avro_dec1 +name string from deserializer +value decimal(4,1) from deserializer +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.avro' into TABLE avro_dec1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@avro_dec1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/dec.avro' into TABLE avro_dec1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@avro_dec1 +PREHOOK: query: select value from avro_dec1 +PREHOOK: type: QUERY +PREHOOK: Input: default@avro_dec1 +#### A masked pattern was here #### +POSTHOOK: query: select value from avro_dec1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@avro_dec1 +#### A masked pattern was here #### +234.8 +77.3 +55.7 +4.3 +6 +12.3 +33.3 +19 +3.2 +79.9 +PREHOOK: query: DROP TABLE dec +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@dec +PREHOOK: Output: default@dec +POSTHOOK: query: DROP TABLE dec 
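A note on the values above: the patch maps Hive `decimal(p,s)` to Avro `bytes` carrying the `decimal` logical type (see `createAvroPrimitive` later in this diff), which is why inserting the `decimal(8,4)` data into `avro_dec`'s `decimal(5,2)` column comes back rescaled. A minimal sketch of that schema shape, mirroring the JSON the converter emits (class name is ours, for illustration):

```java
import org.apache.avro.Schema;

// Illustrative only: the Avro shape this patch uses for Hive decimals is a
// plain bytes schema annotated with "decimal" logical-type properties.
public class DecimalSchemaSketch {
  public static void main(String[] args) {
    int precision = 5;
    int scale = 2; // e.g. the avro_dec column declared as decimal(5,2) above
    Schema dec = new Schema.Parser().parse(
        "{\"type\":\"bytes\",\"logicalType\":\"decimal\","
            + "\"precision\":" + precision + ",\"scale\":" + scale + "}");
    // The logical type rides along as extra JSON properties on a bytes schema.
    System.out.println(dec.getType());  // BYTES
    System.out.println(dec.toString()); // the annotated schema, round-tripped
  }
}
```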
+POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@dec +POSTHOOK: Output: default@dec +PREHOOK: query: DROP TABLE avro_dec +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@avro_dec +PREHOOK: Output: default@avro_dec +POSTHOOK: query: DROP TABLE avro_dec +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@avro_dec +POSTHOOK: Output: default@avro_dec +PREHOOK: query: DROP TABLE avro_dec1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@avro_dec1 +PREHOOK: Output: default@avro_dec1 +POSTHOOK: query: DROP TABLE avro_dec1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@avro_dec1 +POSTHOOK: Output: default@avro_dec1 diff --git ql/src/test/results/clientpositive/avro_joins_native.q.out ql/src/test/results/clientpositive/avro_joins_native.q.out new file mode 100644 index 0000000..b7fdafc --- /dev/null +++ ql/src/test/results/clientpositive/avro_joins_native.q.out @@ -0,0 +1,92 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +-- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 +(number int COMMENT "Order of playing the role", + first_name string COMMENT "first name of actor playing role", + last_name string COMMENT "last name of actor playing role") +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS + +-- verify that new joins bring in correct schemas (including evolved schemas) + +CREATE TABLE doctors4 +(number int COMMENT "Order of playing the role", + first_name string COMMENT "first name of actor playing role", + last_name string COMMENT "last name of actor playing role") +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@doctors4 +PREHOOK: query: DESCRIBE doctors4 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@doctors4 +POSTHOOK: query: DESCRIBE doctors4 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@doctors4 +number int from deserializer +first_name string from deserializer +last_name string from deserializer +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@doctors4 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors4 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@doctors4 +PREHOOK: query: CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@episodes +PREHOOK: query: DESCRIBE episodes +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@episodes +POSTHOOK: query: DESCRIBE episodes +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@episodes +title string from deserializer +air_date string from deserializer +doctor int from deserializer +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@episodes +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes +POSTHOOK: 
type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@episodes +PREHOOK: query: SELECT e.title, e.air_date, d.first_name, d.last_name, e.air_date +FROM doctors4 d JOIN episodes e ON (d.number=e.doctor) +PREHOOK: type: QUERY +PREHOOK: Input: default@doctors4 +PREHOOK: Input: default@episodes +#### A masked pattern was here #### +POSTHOOK: query: SELECT e.title, e.air_date, d.first_name, d.last_name, e.air_date +FROM doctors4 d JOIN episodes e ON (d.number=e.doctor) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@doctors4 +POSTHOOK: Input: default@episodes +#### A masked pattern was here #### +An Unearthly Child 23 November 1963 William Hartnell 23 November 1963 +Castrolava 4 January 1982 Peter Davison 4 January 1982 +Horror of Fang Rock 3 September 1977 Tom Baker 3 September 1977 +Rose 26 March 2005 Christopher Eccleston 26 March 2005 +The Doctor's Wife 14 May 2011 Matt Smith 14 May 2011 +The Eleventh Hour 3 April 2010 Matt Smith 3 April 2010 +The Mysterious Planet 6 September 1986 Colin Baker 6 September 1986 +The Power of the Daleks 5 November 1966 Patrick Troughton 5 November 1966 diff --git ql/src/test/results/clientpositive/avro_native.q.out ql/src/test/results/clientpositive/avro_native.q.out new file mode 100644 index 0000000..7c298b3 --- /dev/null +++ ql/src/test/results/clientpositive/avro_native.q.out @@ -0,0 +1,57 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +-- verify that we can actually read avro files +CREATE TABLE doctors +(number int, +first_name string, +last_name string) +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS + +-- verify that we can actually read avro files +CREATE TABLE doctors +(number int, +first_name string, +last_name string) +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@doctors +PREHOOK: query: DESCRIBE doctors +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@doctors +POSTHOOK: query: DESCRIBE doctors +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@doctors +number int from deserializer +first_name string from deserializer +last_name string from deserializer +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@doctors +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/doctors.avro' INTO TABLE doctors +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@doctors +PREHOOK: query: SELECT * FROM doctors +PREHOOK: type: QUERY +PREHOOK: Input: default@doctors +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM doctors +POSTHOOK: type: QUERY +POSTHOOK: Input: default@doctors +#### A masked pattern was here #### +1 William Hartnell +10 David Tennant +11 Matt Smith +2 Patrick Troughton +3 Jon Pertwee +4 Tom Baker +5 Peter Davison +6 Colin Baker +7 Sylvester McCoy +8 Paul McGann +9 Christopher Eccleston diff --git ql/src/test/results/clientpositive/avro_partitioned_native.q.out ql/src/test/results/clientpositive/avro_partitioned_native.q.out new file mode 100644 index 0000000..3643e0f --- /dev/null +++ ql/src/test/results/clientpositive/avro_partitioned_native.q.out @@ -0,0 +1,146 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS +-- Verify that table scans work with partitioned Avro tables +CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the 
Doctor in episode") +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS +-- Verify that table scans work with partitioned Avro tables +CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@episodes +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@episodes +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@episodes +PREHOOK: query: CREATE TABLE episodes_partitioned +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +PARTITIONED BY (doctor_pt INT) +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE episodes_partitioned +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +PARTITIONED BY (doctor_pt INT) +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@episodes_partitioned +PREHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes +PREHOOK: Output: default@episodes_partitioned +POSTHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=1 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=11 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=2 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=4 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=5 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=6 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=9 +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, 
comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +PREHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +PREHOOK: Input: default@episodes_partitioned@doctor_pt=11 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +Rose 26 March 2005 9 9 +The Doctor's Wife 14 May 2011 11 11 +The Eleventh Hour 3 April 2010 11 11 +PREHOOK: query: -- Verify that Fetch works in addition to Map +SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +PREHOOK: Input: default@episodes_partitioned@doctor_pt=1 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=11 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=2 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=4 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=5 +PREHOOK: Input: 
default@episodes_partitioned@doctor_pt=6 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +POSTHOOK: query: -- Verify that Fetch works in addition to Map +SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=1 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=2 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=4 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=5 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +An Unearthly Child 23 November 1963 1 1 +Horror of Fang Rock 3 September 1977 4 4 +Rose 26 March 2005 9 9 +The Doctor's Wife 14 May 2011 11 11 +The Eleventh Hour 3 April 2010 11 11 +PREHOOK: query: -- Fetch w/filter to specific partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 6 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +PREHOOK: Input: default@episodes_partitioned@doctor_pt=6 +#### A masked pattern was here #### +POSTHOOK: query: -- Fetch w/filter to specific partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 6 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6 +#### A masked pattern was here #### +The Mysterious Planet 6 September 1986 6 6 +PREHOOK: query: -- Fetch w/non-existent partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +#### A masked pattern was here #### +POSTHOOK: query: -- Fetch w/non-existent partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +#### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/avro_schema_evolution_native.q.out ql/src/test/results/clientpositive/avro_schema_evolution_native.q.out new file mode 100644 index 0000000..e9e0aea --- /dev/null +++ ql/src/test/results/clientpositive/avro_schema_evolution_native.q.out @@ -0,0 +1,214 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS +-- Verify that table scans work with partitioned Avro tables +CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS +-- Verify that table scans work with partitioned Avro tables +CREATE TABLE episodes +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@episodes +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@episodes +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/episodes.avro' INTO TABLE episodes +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@episodes +PREHOOK: query: CREATE TABLE episodes_partitioned +(title string COMMENT "episode title", + air_date string COMMENT "initial 
date", + doctor int COMMENT "main actor playing the Doctor in episode") +PARTITIONED BY (doctor_pt INT) +STORED AS AVRO +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE episodes_partitioned +(title string COMMENT "episode title", + air_date string COMMENT "initial date", + doctor int COMMENT "main actor playing the Doctor in episode") +PARTITIONED BY (doctor_pt INT) +STORED AS AVRO +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@episodes_partitioned +PREHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes +PREHOOK: Output: default@episodes_partitioned +POSTHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=1 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=11 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=2 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=4 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=5 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=6 +POSTHOOK: Output: default@episodes_partitioned@doctor_pt=9 +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, 
comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ] +PREHOOK: query: ALTER TABLE episodes_partitioned +SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' +WITH +SERDEPROPERTIES ('avro.schema.literal'='{ + "namespace": "testing.hive.avro.serde", + "name": "episodes", + "type": "record", + "fields": [ + { + "name":"title", + "type":"string", + "doc":"episode title" + }, + { + "name":"air_date", + "type":"string", + "doc":"initial date" + }, + { + "name":"doctor", + "type":"int", + "doc":"main actor playing the Doctor in episode" + }, + { + "name":"value", + "type":"int", + "default":0, + "doc":"default value" + } + ] +}') +PREHOOK: type: ALTERTABLE_SERIALIZER +PREHOOK: Input: default@episodes_partitioned +PREHOOK: Output: default@episodes_partitioned +POSTHOOK: query: ALTER TABLE episodes_partitioned +SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' +WITH +SERDEPROPERTIES ('avro.schema.literal'='{ + "namespace": "testing.hive.avro.serde", + "name": "episodes", + "type": "record", + "fields": [ + { + "name":"title", + "type":"string", + "doc":"episode title" + }, + { + "name":"air_date", + "type":"string", + "doc":"initial date" + }, + { + "name":"doctor", + "type":"int", + "doc":"main actor playing the Doctor in episode" + }, + { + "name":"value", + "type":"int", + "default":0, + "doc":"default value" + } + ] +}') +POSTHOOK: type: ALTERTABLE_SERIALIZER +POSTHOOK: Input: default@episodes_partitioned +POSTHOOK: Output: default@episodes_partitioned +PREHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +PREHOOK: Input: default@episodes_partitioned@doctor_pt=11 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +Rose 26 March 2005 9 0 9 +The Doctor's Wife 14 May 2011 11 0 11 +The Eleventh Hour 3 April 2010 11 0 11 +PREHOOK: query: -- Verify that Fetch works in addition to Map +SELECT * 
FROM episodes_partitioned ORDER BY air_date LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +PREHOOK: Input: default@episodes_partitioned@doctor_pt=1 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=11 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=2 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=4 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=5 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=6 +PREHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +POSTHOOK: query: -- Verify that Fetch works in addition to Map +SELECT * FROM episodes_partitioned ORDER BY air_date LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=1 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=2 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=4 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=5 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6 +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9 +#### A masked pattern was here #### +An Unearthly Child 23 November 1963 1 0 1 +Horror of Fang Rock 3 September 1977 4 0 4 +Rose 26 March 2005 9 0 9 +The Doctor's Wife 14 May 2011 11 0 11 +The Eleventh Hour 3 April 2010 11 0 11 +PREHOOK: query: -- Fetch w/filter to specific partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 6 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +PREHOOK: Input: default@episodes_partitioned@doctor_pt=6 +#### A masked pattern was here #### +POSTHOOK: query: -- Fetch w/filter to specific partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 6 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +POSTHOOK: Input: default@episodes_partitioned@doctor_pt=6 +#### A masked pattern was here #### +The Mysterious Planet 6 September 1986 6 0 6 +PREHOOK: query: -- Fetch w/non-existent partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@episodes_partitioned +#### A masked pattern was here #### +POSTHOOK: query: -- Fetch w/non-existent partition +SELECT * FROM episodes_partitioned WHERE doctor_pt = 7 LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@episodes_partitioned +#### A masked pattern was here #### diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java index 1fe31e0..f24fa44 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hive.serde2.avro; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Properties; @@ -29,6 +31,7 @@ import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.Writable; /** @@ -61,17 +64,53 @@ public void initialize(Configuration configuration, Properties tableProperties, @Override public void initialize(Configuration configuration, Properties properties) throws SerDeException { // Reset member variables so we don't get in a half-constructed state - if(schema != null) { + if (schema != null) { 
LOG.info("Resetting already initialized AvroSerDe"); } schema = null; oi = null; - columnNames = null; + columnNames = null; columnTypes = null; - schema = AvroSerdeUtils.determineSchemaOrReturnErrorSchema(properties); - if(configuration == null) { + final String columnNameProperty = properties.getProperty("columns"); + final String columnTypeProperty = properties.getProperty("columns.types"); + + if (properties.getProperty(AvroSerdeUtils.SCHEMA_LITERAL) != null + || properties.getProperty(AvroSerdeUtils.SCHEMA_URL) != null + || columnNameProperty == null || columnNameProperty.length() == 0 + || columnTypeProperty == null || columnTypeProperty.length() == 0) { + schema = AvroSerdeUtils.determineSchemaOrReturnErrorSchema(properties); + } else { + // Get column names and sort order + if (columnNameProperty.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNameProperty.split(",")); + } + if (columnTypeProperty.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + if (columnNames.size() != columnTypes.size()) { + throw new IllegalArgumentException("AvroSerde initialization failed. Number of column " + + "name and column type differs. columnNames = " + columnNames + ", columnTypes = " + + columnTypes); + } + + TypeInfoToSchema typeInfoToSchema = new TypeInfoToSchema(); + schema = typeInfoToSchema.convert(columnNames, columnTypes, + properties.getProperty("avro.schema.namespace"), + properties.getProperty("avro.schema.name"), + properties.getProperty("avro.schema.doc")); + + properties.setProperty(AvroSerdeUtils.SCHEMA_LITERAL, schema.toString()); + } + + LOG.info("Schema is " + schema); + + if (configuration == null) { LOG.info("Configuration null, not inserting schema"); } else { configuration.set(AvroSerdeUtils.AVRO_SERDE_SCHEMA, schema.toString(false)); diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java new file mode 100644 index 0000000..8d00b88 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.serde2.avro; + +import org.apache.avro.Schema; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; + +import java.util.ArrayList; +import java.util.List; + +public class TypeInfoToSchema { + + private static final Schema.Parser PARSER = new Schema.Parser(); + + /** + * Converts a Hive schema to an Avro schema + * + * @param columnNames Names of the Hive columns + * @param columnTypes Hive column types + * @param namespace Namespace of Avro schema + * @param name Avro schema name + * @param doc Avro schema doc + * @return Avro Schema + */ + public Schema convert(List<String> columnNames, List<TypeInfo> columnTypes, + String namespace, String name, String doc) { + + List<Schema.Field> fields = new ArrayList<Schema.Field>(); + for (int i = 0; i < columnNames.size(); ++i) { + final Schema.Field avroField = createAvroField(columnNames.get(i), columnTypes.get(i)); + fields.addAll(getFields(columnTypes.get(i), avroField)); + } + + if (name == null || name.isEmpty()) { + name = "baseRecord"; + } + + Schema avroSchema = Schema.createRecord(name, doc, namespace, false); + avroSchema.setFields(fields); + return avroSchema; + } + + private Schema.Field createAvroField(String name, TypeInfo typeInfo) { + return new Schema.Field(name, createAvroSchema(typeInfo), "", null); + } + + private Schema createAvroSchema(TypeInfo typeInfo) { + Schema schema = null; + switch (typeInfo.getCategory()) { + case PRIMITIVE: + schema = createAvroPrimitive(typeInfo); + break; + case LIST: + schema = createAvroArray(typeInfo); + break; + case MAP: + schema = createAvroMap(typeInfo); + break; + case STRUCT: + schema = createAvroRecord(typeInfo); + break; + case UNION: + schema = createAvroUnion(typeInfo); + break; + } + + return schema; + } + + private Schema createAvroPrimitive(TypeInfo typeInfo) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + Schema schema; + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case STRING: + schema = Schema.create(Schema.Type.STRING); + break; + case BYTE: + schema = Schema.create(Schema.Type.BYTES); + break; + case INT: + schema = Schema.create(Schema.Type.INT); + break; + case LONG: + schema = Schema.create(Schema.Type.LONG); + break; + case FLOAT: + schema = Schema.create(Schema.Type.FLOAT); + break; + case DOUBLE: + schema = Schema.create(Schema.Type.DOUBLE); + break; + case BOOLEAN: + schema = Schema.create(Schema.Type.BOOLEAN); + break; + case DECIMAL: + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; + String precision = String.valueOf(decimalTypeInfo.precision()); + String scale = String.valueOf(decimalTypeInfo.scale()); + schema = PARSER.parse("{" + + "\"type\":\"bytes\"," + + "\"logicalType\":\"decimal\"," + + "\"precision\":" + precision + "," + + "\"scale\":" + scale + "}"); + break; + default: + throw new RuntimeException("Unknown type: " + typeInfo); + } + return schema; + } + + private Schema createAvroUnion(TypeInfo typeInfo) { + List<Schema> childSchemas = new ArrayList<Schema>(); + for (TypeInfo childTypeInfo : ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos()) { +
childSchemas.add(createAvroSchema(childTypeInfo)); + } + return Schema.createUnion(childSchemas); + } + + private Schema createAvroRecord(TypeInfo typeInfo) { + List<Schema.Field> childFields = new ArrayList<Schema.Field>(); + + final ArrayList<String> allStructFieldNames = + ((StructTypeInfo) typeInfo).getAllStructFieldNames(); + final ArrayList<TypeInfo> allStructFieldTypeInfos = + ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos(); + if (allStructFieldNames.size() != allStructFieldTypeInfos.size()) { + throw new IllegalArgumentException("Failed to generate avro schema from hive schema. " + + "name and column type differs. names = " + allStructFieldNames + ", types = " + + allStructFieldTypeInfos); + } + + for (int i = 0; i < allStructFieldNames.size(); ++i) { + final Schema.Field grandChildSchemaField = createAvroField(allStructFieldNames.get(i), + allStructFieldTypeInfos.get(i)); + final List<Schema.Field> grandChildFields = getFields(allStructFieldTypeInfos.get(i), + grandChildSchemaField); + childFields.addAll(grandChildFields); + } + return Schema.createRecord(childFields); + } + + private Schema createAvroMap(TypeInfo typeInfo) { + TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo(); + Schema valueSchema = createAvroSchema(valueTypeInfo); + + return Schema.createMap(valueSchema); + } + + private Schema createAvroArray(TypeInfo typeInfo) { + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + Schema listSchema = createAvroSchema(listTypeInfo.getListElementTypeInfo()); + return Schema.createArray(listSchema); + } + + private List<Schema.Field> getFields(TypeInfo allStructFieldTypeInfo, Schema.Field schemaField) { + List<Schema.Field> fields = new ArrayList<Schema.Field>(); + + if (allStructFieldTypeInfo.getCategory() == ObjectInspector.Category.STRUCT) { + for (Schema.Field field : schemaField.schema().getFields()) { + fields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null)); + } + } else { + fields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), + null)); + } + + return fields; + } +} \ No newline at end of file diff --git serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java new file mode 100644 index 0000000..8323927 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
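The converter above walks each `TypeInfo` recursively, with `getFields` hoisting a top-level struct column's members into record fields. Roughly how it is driven, using the same call shape as the new `initialize()` branch; the standalone class and the episodes-style column list are illustrative only:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hadoop.hive.serde2.avro.TypeInfoToSchema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

// Illustrative only: derive an Avro record schema from Hive column metadata.
public class TypeInfoToSchemaSketch {
  public static void main(String[] args) {
    List<String> names = Arrays.asList("title", "air_date", "doctor");
    // Parse the comma-separated Hive type string into TypeInfo objects.
    List<TypeInfo> types =
        TypeInfoUtils.getTypeInfosFromTypeString("string,string,int");

    Schema schema = new TypeInfoToSchema().convert(
        names, types, "testing.hive.avro.serde", "episodes", "episode schema");
    System.out.println(schema.toString(true)); // pretty-printed record schema
  }
}
```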
+ */ + +package org.apache.hadoop.hive.serde2.avro; + +import junit.framework.Assert; +import org.apache.avro.Schema; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; +import org.apache.log4j.Logger; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class TestTypeInfoToSchema { + + private Logger LOGGER = Logger.getLogger(TestTypeInfoToSchema.class); + private static final List<String> columnNames = Arrays.asList("testCol"); + private TypeInfo columnType; + private static TypeInfoToSchema typeInfoToSchema = new TypeInfoToSchema(); + + private final TypeInfo STRING = TypeInfoFactory.getPrimitiveTypeInfo("string"); + private final TypeInfo INT = TypeInfoFactory.getPrimitiveTypeInfo("int"); + private final TypeInfo BOOLEAN = TypeInfoFactory.getPrimitiveTypeInfo("boolean"); + private final TypeInfo LONG = TypeInfoFactory.getPrimitiveTypeInfo("bigint"); + private final TypeInfo FLOAT = TypeInfoFactory.getPrimitiveTypeInfo("float"); + private final TypeInfo DOUBLE = TypeInfoFactory.getPrimitiveTypeInfo("double"); + private final TypeInfo BYTE = TypeInfoFactory.getPrimitiveTypeInfo("tinyint"); + private final int PRECISION = 4; + private final int SCALE = 2; + private final TypeInfo DECIMAL = TypeInfoFactory.getPrimitiveTypeInfo( + new DecimalTypeInfo(PRECISION, SCALE).getQualifiedName()); + + private String getAvroSchemaString() { + return typeInfoToSchema.convert( + columnNames, + Arrays.asList(columnType), + "org.apache.hive.avro.testing", + "avrotest", + "This is to test hive-avro").toString(); + } + + private String genSchema(String specificSchema) { + return "{" + + "\"type\":\"record\"," + + "\"name\":\"avrotest\"," + + "\"namespace\":\"org.apache.hive.avro.testing\"," + + "\"doc\":\"This is to test hive-avro\"," + + "\"fields\":[" + + "{\"name\":\"testCol\"," + + "\"type\":" + specificSchema + "," + + "\"doc\":\"\"}" + + "]}"; + } + + @Test + public void createAvroStringSchema() { + columnType = STRING; + final String specificSchema = "\"string\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for string's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroByteSchema() { + columnType = BYTE; + final String specificSchema = "\"bytes\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for byte's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroIntSchema() { + columnType = INT; + final String specificSchema = "\"int\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for int's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroLongSchema() { + columnType = LONG; + final String specificSchema = "\"long\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for long's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroFloatSchema() { + columnType = FLOAT; + final String specificSchema = "\"float\""; + String expectedSchema =
genSchema(specificSchema); + + Assert.assertEquals("Test for float's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroDoubleSchema() { + columnType = DOUBLE; + final String specificSchema = "\"double\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for double's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroBooleanSchema() { + columnType = BOOLEAN; + final String specificSchema = "\"boolean\""; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for boolean's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroDecimalSchema() { + columnType = DECIMAL; + final String specificSchema = "{" + + "\"type\":\"bytes\"," + + "\"logicalType\":\"decimal\"," + + "\"precision\":" + PRECISION + "," + + "\"scale\":" + SCALE + "}"; + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for decimal's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroListSchema() { + ListTypeInfo listTypeInfo = new ListTypeInfo(); + listTypeInfo.setListElementTypeInfo(STRING); + columnType = listTypeInfo; + + final String specificSchema = Schema.createArray( + Schema.create(Schema.Type.STRING)).toString(); + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for list's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroMapSchema() { + MapTypeInfo mapTypeInfo = new MapTypeInfo(); + mapTypeInfo.setMapKeyTypeInfo(STRING); + mapTypeInfo.setMapValueTypeInfo(INT); + columnType = mapTypeInfo; + + final String specificSchema = Schema.createMap( + Schema.create(Schema.Type.INT)).toString(); + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for map's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroUnionSchema() { + UnionTypeInfo unionTypeInfo = new UnionTypeInfo(); + unionTypeInfo.setAllUnionObjectTypeInfos(Arrays.asList(INT, FLOAT, STRING)); + columnType = unionTypeInfo; + + final String specificSchema = Schema.createUnion( + Arrays.asList( + Schema.create(Schema.Type.INT), + Schema.create(Schema.Type.FLOAT), + Schema.create(Schema.Type.STRING))).toString(); + String expectedSchema = genSchema(specificSchema); + + Assert.assertEquals("Test for union's avro schema failed", + expectedSchema, getAvroSchemaString()); + } + + @Test + public void createAvroStructSchema() { + StructTypeInfo structTypeInfo = new StructTypeInfo(); + ArrayList<String> names = new ArrayList<String>(); + names.add("field1"); + names.add("field2"); + names.add("field3"); + structTypeInfo.setAllStructFieldNames(names); + ArrayList<TypeInfo> typeInfos = new ArrayList<TypeInfo>(); + typeInfos.add(INT); + typeInfos.add(STRING); + typeInfos.add(FLOAT); + structTypeInfo.setAllStructFieldTypeInfos(typeInfos); + LOGGER.info("structTypeInfo is " + structTypeInfo); + columnType = structTypeInfo; + + String expectedSchema = + "{" + + "\"type\":\"record\"," + + "\"name\":\"avrotest\"," + + "\"namespace\":\"org.apache.hive.avro.testing\"," + + "\"doc\":\"This is to test hive-avro\"," + + "\"fields\":" + + "[" + + "{\"name\":\"field1\",\"type\":\"int\",\"doc\":\"\"}," + + "{\"name\":\"field2\",\"type\":\"string\",\"doc\":\"\"}," + + "{\"name\":\"field3\",\"type\":\"float\",\"doc\":\"\"}" + + "]}"; + + Assert.assertEquals("Test for struct's avro schema
failed", + expectedSchema, getAvroSchemaString()); + } +}