From 2d0c8593db9c9ce2ae77f667abe67ba1fc75da39 Mon Sep 17 00:00:00 2001 From: Naresh P R Date: Sat, 7 Sep 2019 13:57:23 +0530 Subject: [PATCH] HIVE-22178 - Parquet FilterPredicate throws CastException after SchemaEvolution. --- .../hive/ql/io/parquet/LeafFilterFactory.java | 43 ++- .../read/TestParquetFilterPredicate.java | 4 +- .../clientpositive/parquet_schema_evolution.q | 25 ++ .../parquet_schema_evolution.q.out | 359 ++++++++++++++++++ 4 files changed, 415 insertions(+), 16 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java index be4c0d55a1..fc9188f8f6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java @@ -174,23 +174,10 @@ public FilterPredicateLeafBuilder getLeafFilterBuilderByType( Type parquetType) throws HiveException { switch (type){ case LONG: - if (parquetType.asPrimitiveType().getPrimitiveTypeName() == - PrimitiveType.PrimitiveTypeName.INT32) { - return new IntFilterPredicateLeafBuilder(); - } else { - return new LongFilterPredicateLeafBuilder(); - } case FLOAT: - if (parquetType.asPrimitiveType().getPrimitiveTypeName() == - PrimitiveType.PrimitiveTypeName.FLOAT) { - return new FloatFilterPredicateLeafBuilder(); - } else { - return new DoubleFilterPredicateLeafBuilder(); - } case STRING: // string, char, varchar - return new BinaryFilterPredicateLeafBuilder(); case BOOLEAN: - return new BooleanFilterPredicateLeafBuilder(); + return getLeafFilterBuilderByParquetType(parquetType); case DATE: case DECIMAL: case TIMESTAMP: @@ -200,4 +187,32 @@ public FilterPredicateLeafBuilder getLeafFilterBuilderByType( throw new HiveException(msg); } } + + /** + * Creates FilterPredicateLeafBuilder as per Parquet FileSchema type + * @param parquetType + * @return + * @throws HiveException + */ + private FilterPredicateLeafBuilder getLeafFilterBuilderByParquetType(Type parquetType) throws HiveException { + switch (parquetType.asPrimitiveType().getPrimitiveTypeName()){ + case INT32: // TINYINT, SMALLINT, INT + return new IntFilterPredicateLeafBuilder(); + case INT64: // LONG + return new LongFilterPredicateLeafBuilder(); + case FLOAT: + return new FloatFilterPredicateLeafBuilder(); + case DOUBLE: + return new DoubleFilterPredicateLeafBuilder(); + case BINARY: // STRING, CHAR, VARCHAR + return new BinaryFilterPredicateLeafBuilder(); + case BOOLEAN: + return new BooleanFilterPredicateLeafBuilder(); + default: + String msg = "Conversion to Parquet FilterPredicate not supported for " + + parquetType.asPrimitiveType().getPrimitiveTypeName(); + LOG.debug(msg); + throw new HiveException(msg); + } + } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java index d46404648d..7c7c657524 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java @@ -38,14 +38,14 @@ public void testFilterColumnsThatDoNoExistOnSchema() { .isNull("a", PredicateLeaf.Type.LONG) .between("y", PredicateLeaf.Type.LONG, 10L, 20L) // Column will be removed from filter .in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L) // Column will be removed from filter - .nullSafeEquals("a", PredicateLeaf.Type.STRING, "stinger") + .nullSafeEquals("stinger", PredicateLeaf.Type.STRING, "stinger") .end() .end() .build(); FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema); - String expected = "and(not(eq(a, null)), not(eq(a, Binary{\"stinger\"})))"; + String expected = "and(not(eq(a, null)), not(eq(stinger, Binary{\"stinger\"})))"; assertEquals(expected, p.toString()); } diff --git a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q index 4f593af5c7..a82b641efe 100644 --- a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q +++ b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q @@ -44,3 +44,28 @@ SELECT * FROM schema_test; DROP TABLE schema_test; DROP TABLE NewStructField; DROP TABLE NewStructFieldTable; + +drop table if exists parq_test; +create table parq_test(age int, name string) stored as parquet; +insert into parq_test values(1, 'aaaa'); + +DESCRIBE parq_test; +alter table parq_test change age age string; +DESCRIBE parq_test; + +insert into parq_test values('b', 'bbbb'); + +select * from parq_test; +select * from parq_test where age='b'; +select * from parq_test where age='1'; +select * from parq_test where age=1; + +explain select * from parq_test where age='b'; +explain select * from parq_test where age='1'; +explain select * from parq_test where age=1; + +explain vectorization expression select * from parq_test where age='b'; +explain vectorization expression select * from parq_test where age='1'; +explain vectorization expression select * from parq_test where age=1; + +drop table parq_test; diff --git a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out index 43d75dc105..3c38ed0705 100644 --- a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out +++ b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out @@ -188,3 +188,362 @@ POSTHOOK: query: DROP TABLE NewStructFieldTable POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@newstructfieldtable POSTHOOK: Output: default@newstructfieldtable +PREHOOK: query: drop table if exists parq_test +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists parq_test +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table parq_test(age int, name string) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parq_test +POSTHOOK: query: create table parq_test(age int, name string) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parq_test +PREHOOK: query: insert into parq_test values(1, 'aaaa') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@parq_test +POSTHOOK: query: insert into parq_test values(1, 'aaaa') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@parq_test +POSTHOOK: Lineage: parq_test.age SCRIPT [] +POSTHOOK: Lineage: parq_test.name SCRIPT [] +PREHOOK: query: DESCRIBE parq_test +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@parq_test +POSTHOOK: query: DESCRIBE parq_test +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@parq_test +age int +name string +PREHOOK: query: alter table parq_test change age age string +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@parq_test +PREHOOK: Output: default@parq_test +POSTHOOK: query: alter table parq_test change age age string +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@parq_test +POSTHOOK: Output: default@parq_test +PREHOOK: query: DESCRIBE parq_test +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@parq_test +POSTHOOK: query: DESCRIBE parq_test +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@parq_test +age string +name string +PREHOOK: query: insert into parq_test values('b', 'bbbb') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@parq_test +POSTHOOK: query: insert into parq_test values('b', 'bbbb') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@parq_test +POSTHOOK: Lineage: parq_test.age SCRIPT [] +POSTHOOK: Lineage: parq_test.name SCRIPT [] +PREHOOK: query: select * from parq_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +1 aaaa +b bbbb +PREHOOK: query: select * from parq_test where age='b' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test where age='b' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +b bbbb +PREHOOK: query: select * from parq_test where age='1' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test where age='1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +1 aaaa +PREHOOK: query: select * from parq_test where age=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test where age=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +1 aaaa +PREHOOK: query: explain select * from parq_test where age='b' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain select * from parq_test where age='b' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: 'b' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from parq_test where age='1' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain select * from parq_test where age='1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = '1') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = '1') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: '1' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from parq_test where age=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain select * from parq_test where age=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: age (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain vectorization expression select * from parq_test where age='b' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain vectorization expression select * from parq_test where age='b' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: false + enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: 'b' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain vectorization expression select * from parq_test where age='1' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain vectorization expression select * from parq_test where age='1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: false + enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = '1') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = '1') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: '1' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain vectorization expression select * from parq_test where age=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain vectorization expression select * from parq_test where age=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: false + enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: age (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: drop table parq_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parq_test +PREHOOK: Output: default@parq_test +POSTHOOK: query: drop table parq_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parq_test +POSTHOOK: Output: default@parq_test -- 2.18.0