diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java index be4c0d55a1..fc9188f8f6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java @@ -174,23 +174,10 @@ public FilterPredicateLeafBuilder getLeafFilterBuilderByType( Type parquetType) throws HiveException { switch (type){ case LONG: - if (parquetType.asPrimitiveType().getPrimitiveTypeName() == - PrimitiveType.PrimitiveTypeName.INT32) { - return new IntFilterPredicateLeafBuilder(); - } else { - return new LongFilterPredicateLeafBuilder(); - } case FLOAT: - if (parquetType.asPrimitiveType().getPrimitiveTypeName() == - PrimitiveType.PrimitiveTypeName.FLOAT) { - return new FloatFilterPredicateLeafBuilder(); - } else { - return new DoubleFilterPredicateLeafBuilder(); - } case STRING: // string, char, varchar - return new BinaryFilterPredicateLeafBuilder(); case BOOLEAN: - return new BooleanFilterPredicateLeafBuilder(); + return getLeafFilterBuilderByParquetType(parquetType); case DATE: case DECIMAL: case TIMESTAMP: @@ -200,4 +187,32 @@ public FilterPredicateLeafBuilder getLeafFilterBuilderByType( throw new HiveException(msg); } } + + /** + * Creates the FilterPredicateLeafBuilder that matches the primitive type name of the Parquet file schema type + * @param parquetType column type from the Parquet file schema, read through asPrimitiveType() + * @return a leaf builder for INT32, INT64, FLOAT, DOUBLE, BINARY or BOOLEAN + * @throws HiveException if the primitive type name is none of the supported ones + */ + private FilterPredicateLeafBuilder getLeafFilterBuilderByParquetType(Type parquetType) throws HiveException { + switch (parquetType.asPrimitiveType().getPrimitiveTypeName()){ + case INT32: // TINYINT, SMALLINT, INT + return new IntFilterPredicateLeafBuilder(); + case INT64: // LONG + return new LongFilterPredicateLeafBuilder(); + case FLOAT: + return new FloatFilterPredicateLeafBuilder(); + case DOUBLE: + return new DoubleFilterPredicateLeafBuilder(); + case BINARY: // STRING, CHAR, VARCHAR + return new 
BinaryFilterPredicateLeafBuilder(); + case BOOLEAN: + return new BooleanFilterPredicateLeafBuilder(); + default: + String msg = "Conversion to Parquet FilterPredicate not supported for " + + parquetType.asPrimitiveType().getPrimitiveTypeName(); + LOG.debug(msg); + throw new HiveException(msg); + } + } } diff --git a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q index 4f593af5c7..a82b641efe 100644 --- a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q +++ b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q @@ -44,3 +44,28 @@ SELECT * FROM schema_test; DROP TABLE schema_test; DROP TABLE NewStructField; DROP TABLE NewStructFieldTable; + +drop table if exists parq_test; +create table parq_test(age int, name string) stored as parquet; +insert into parq_test values(1, 'aaaa'); + +DESCRIBE parq_test; +alter table parq_test change age age string; +DESCRIBE parq_test; + +insert into parq_test values('b', 'bbbb'); + +select * from parq_test; +select * from parq_test where age='b'; +select * from parq_test where age='1'; +select * from parq_test where age=1; + +explain select * from parq_test where age='b'; +explain select * from parq_test where age='1'; +explain select * from parq_test where age=1; + +explain vectorization expression select * from parq_test where age='b'; +explain vectorization expression select * from parq_test where age='1'; +explain vectorization expression select * from parq_test where age=1; + +drop table parq_test; diff --git a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out index 43d75dc105..3c38ed0705 100644 --- a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out +++ b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out @@ -188,3 +188,362 @@ POSTHOOK: query: DROP TABLE NewStructFieldTable POSTHOOK: type: DROPTABLE POSTHOOK: Input: 
default@newstructfieldtable POSTHOOK: Output: default@newstructfieldtable +PREHOOK: query: drop table if exists parq_test +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists parq_test +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table parq_test(age int, name string) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parq_test +POSTHOOK: query: create table parq_test(age int, name string) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parq_test +PREHOOK: query: insert into parq_test values(1, 'aaaa') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@parq_test +POSTHOOK: query: insert into parq_test values(1, 'aaaa') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@parq_test +POSTHOOK: Lineage: parq_test.age SCRIPT [] +POSTHOOK: Lineage: parq_test.name SCRIPT [] +PREHOOK: query: DESCRIBE parq_test +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@parq_test +POSTHOOK: query: DESCRIBE parq_test +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@parq_test +age int +name string +PREHOOK: query: alter table parq_test change age age string +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@parq_test +PREHOOK: Output: default@parq_test +POSTHOOK: query: alter table parq_test change age age string +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@parq_test +POSTHOOK: Output: default@parq_test +PREHOOK: query: DESCRIBE parq_test +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@parq_test +POSTHOOK: query: DESCRIBE parq_test +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@parq_test +age string +name string +PREHOOK: query: insert into parq_test values('b', 'bbbb') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@parq_test +POSTHOOK: query: insert into parq_test values('b', 
'bbbb') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@parq_test +POSTHOOK: Lineage: parq_test.age SCRIPT [] +POSTHOOK: Lineage: parq_test.name SCRIPT [] +PREHOOK: query: select * from parq_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +1 aaaa +b bbbb +PREHOOK: query: select * from parq_test where age='b' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test where age='b' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +b bbbb +PREHOOK: query: select * from parq_test where age='1' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test where age='1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +1 aaaa +PREHOOK: query: select * from parq_test where age=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parq_test where age=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +1 aaaa +PREHOOK: query: explain select * from parq_test where age='b' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain select * from parq_test where age='b' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 
Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: 'b' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from parq_test where age='1' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain select * from parq_test where age='1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = '1') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = '1') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: '1' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from parq_test where age=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain select * from parq_test where age=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: age (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain vectorization expression select * from parq_test where age='b' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain vectorization expression select * from parq_test where age='b' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: 
false + enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = 'b') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: 'b' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain vectorization expression select * from parq_test where age='1' +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain vectorization expression select * from parq_test where age='1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: false + enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (age = '1') (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (age = '1') (type: boolean) + 
Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: '1' (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain vectorization expression select * from parq_test where age=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parq_test +#### A masked pattern was here #### +POSTHOOK: query: explain vectorization expression select * from parq_test where age=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parq_test +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: false + enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parq_test + filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL + Filter Operator + predicate: (UDFToDouble(age) = 1.0D) (type: boolean) + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: age (type: string), name (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: drop table parq_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parq_test +PREHOOK: Output: default@parq_test +POSTHOOK: query: drop table parq_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parq_test +POSTHOOK: Output: default@parq_test