diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java
index 907a9b8..1b8bfe7 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java
@@ -93,6 +93,17 @@
   protected final ColumnDescriptor descriptor;
   protected final Type type;
 
+  /**
+   * No-arg constructor used by VectorizedDummyColumnReader.
+   */
+  public BaseVectorizedColumnReader() {
+    this.pageReader = null;
+    this.descriptor = null;
+    this.type = null;
+    this.dictionary = null;
+    this.maxDefLevel = -1;
+  }
+
   public BaseVectorizedColumnReader(
       ColumnDescriptor descriptor,
       PageReader pageReader,
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java
new file mode 100644
index 0000000..b45c9de
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io.parquet.vector;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * A dummy vectorized Parquet column reader used to support schema evolution.
+ */
+public class VectorizedDummyColumnReader extends BaseVectorizedColumnReader {
+
+  public VectorizedDummyColumnReader() {
+    super();
+  }
+
+  @Override
+  public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException {
+    Arrays.fill(column.isNull, true);
+    column.isRepeating = true;
+    column.noNulls = false;
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index 08ac57b..0c58e91 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -474,9 +474,13 @@ private VectorizedColumnReader buildVectorizedParquetReader(
       if (columnDescriptors == null || columnDescriptors.isEmpty()) {
         throw new RuntimeException(
             "Failed to find related Parquet column descriptor with type " + type);
-      } else {
+      }
+      if (fileSchema.getColumns().contains(descriptors.get(0))) {
         return new VectorizedPrimitiveColumnReader(descriptors.get(0),
             pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type);
+      } else {
+        // Support for schema evolution
+        return new VectorizedDummyColumnReader();
       }
     case STRUCT:
       StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
diff --git ql/src/test/queries/clientpositive/parquet_vectorization_evolution.q ql/src/test/queries/clientpositive/parquet_vectorization_evolution.q
new file mode 100644
index 0000000..d33c765
--- /dev/null
+++ ql/src/test/queries/clientpositive/parquet_vectorization_evolution.q
@@ -0,0 +1,10 @@
+create table test_p(t1 tinyint, t2 tinyint, i1 int, i2 int) stored as parquet;
+insert into test_p values (1,2,3,4),(5,6,7,8);
+alter table test_p add columns (ts timestamp);
+
+SET hive.fetch.task.conversion=none;
+SET hive.vectorized.execution.enabled=true;
+select * from test_p;
+
+insert into test_p values (1,2,3,4, '2018-01-01 01:01:01.123456');
+select * from test_p;
\ No newline at end of file
diff --git ql/src/test/results/clientpositive/parquet_vectorization_evolution.q.out ql/src/test/results/clientpositive/parquet_vectorization_evolution.q.out
new file mode 100644
index 0000000..1df9281
--- /dev/null
+++ ql/src/test/results/clientpositive/parquet_vectorization_evolution.q.out
@@ -0,0 +1,62 @@
+PREHOOK: query: create table test_p(t1 tinyint, t2 tinyint, i1 int, i2 int) stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_p
+POSTHOOK: query: create table test_p(t1 tinyint, t2 tinyint, i1 int, i2 int) stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_p
+PREHOOK: query: insert into test_p values (1,2,3,4),(5,6,7,8)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_p
+POSTHOOK: query: insert into test_p values (1,2,3,4),(5,6,7,8)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_p
+POSTHOOK: Lineage: test_p.i1 SCRIPT []
+POSTHOOK: Lineage: test_p.i2 SCRIPT []
+POSTHOOK: Lineage: test_p.t1 SCRIPT []
+POSTHOOK: Lineage: test_p.t2 SCRIPT []
+PREHOOK: query: alter table test_p add columns (ts timestamp)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@test_p
+PREHOOK: Output: default@test_p
+POSTHOOK: query: alter table test_p add columns (ts timestamp)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@test_p
+POSTHOOK: Output: default@test_p
+PREHOOK: query: select * from test_p
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_p
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test_p
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_p
+#### A masked pattern was here ####
+1	2	3	4	NULL
+5	6	7	8	NULL
+PREHOOK: query: insert into test_p values (1,2,3,4, '2018-01-01 01:01:01.123456')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_p
+POSTHOOK: query: insert into test_p values (1,2,3,4, '2018-01-01 01:01:01.123456')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_p
+POSTHOOK: Lineage: test_p.i1 SCRIPT []
+POSTHOOK: Lineage: test_p.i2 SCRIPT []
+POSTHOOK: Lineage: test_p.t1 SCRIPT []
+POSTHOOK: Lineage: test_p.t2 SCRIPT []
+POSTHOOK: Lineage: test_p.ts SCRIPT []
+PREHOOK: query: select * from test_p
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_p
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test_p
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_p
+#### A masked pattern was here ####
+1	2	3	4	NULL
+5	6	7	8	NULL
+1	2	3	4	2018-01-01 01:01:01.123456
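
Note (not part of the patch): below is a minimal standalone sketch of the all-NULL batch convention that VectorizedDummyColumnReader.readBatch relies on, using Hive's LongColumnVector. The class name DummyBatchSketch and the batch size of 1024 (the default VectorizedRowBatch size) are illustrative assumptions, not code from this change.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import java.util.Arrays;

public class DummyBatchSketch {
  public static void main(String[] args) {
    // Allocate a vector sized for one default VectorizedRowBatch (1024 rows).
    LongColumnVector column = new LongColumnVector(1024);

    // The same three steps readBatch performs for a column that is
    // missing from the Parquet file (illustrative, not the patch itself):
    Arrays.fill(column.isNull, true); // mark every entry NULL
    column.isRepeating = true;        // entry 0 stands in for all rows
    column.noNulls = false;           // tell consumers NULLs are present

    System.out.println("row 0 is null: " + column.isNull[0]); // prints true
  }
}

Because isRepeating is set, downstream operators only consult entry 0 of isNull, which is why a single repeating NULL is enough to surface the newly added ts column as NULL for every row written before the ALTER TABLE.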