commit 42f6a2a2ec145836b6fcfea0736308f5b385cd4a
Author: Vihang Karajgaonkar
Date:   Thu Nov 2 15:57:41 2017 -0700

    HIVE-17961 : NPE during initialization of VectorizedParquetRecordReader when input split is null

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index dd1e1a976c672b2d0314d21c69f8e090cc70fe44..7c76b7363f1baca16d4e4a4ef72570c2f487a36b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -146,6 +146,10 @@ public VectorizedParquetRecordReader(
     this.cacheConf = cacheConf;
     serDeStats = new SerDeStats();
     projectionPusher = new ProjectionPusher();
+    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf);
+    //initialize the rowbatchContext
+    jobConf = conf;
+    rbCtx = Utilities.getVectorizedRowBatchCtx(jobConf);
     ParquetInputSplit inputSplit = getSplit(oldInputSplit, conf);
     if (inputSplit != null) {
       initialize(inputSplit, conf);
@@ -171,10 +175,6 @@ private void initPartitionValues(FileSplit fileSplit, JobConf conf) throws IOExc
 
   public void initialize(
     InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException {
-    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
-    //initialize the rowbatchContext
-    jobConf = configuration;
-    rbCtx = Utilities.getVectorizedRowBatchCtx(jobConf);
     // the oldSplit may be null during the split phase
     if (oldSplit == null) {
       return;
diff --git a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
index 8865c797419bc2b85bc301007ddaa2bb3643822c..56f8909991197434187d06278a7ab8e447849e0d 100644
--- a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
+++ b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
@@ -77,3 +77,20 @@ group by m1["color"];
 
 select m1["color"], count(*) from parquet_project_test
 group by m1["color"];
+
+
+create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet;
+
+insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red');
+
+explain vectorization select count(*) from parquet_nullsplit where len = 1;
+select count(*) from parquet_nullsplit where len = 1;
+
+explain vectorization select count(*) from parquet_nullsplit where len = 99;
+select count(*) from parquet_nullsplit where len = 99;
+
+drop table parquet_nullsplit;
+drop table parquet_project_test;
+drop table parquet_types_staging;
diff --git a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
index faa22f9dcce4dc43d686143db3a05c6fef1061b0..3af758fa36d425cbec63f8f9415749e946013d02 100644
--- a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
+++ b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
@@ -456,3 +456,214 @@ POSTHOOK: Input: default@parquet_project_test
 blue	7
 green	7
 red	8
+PREHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_nullsplit
+POSTHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_nullsplit
+PREHOOK: query: insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red')
+PREHOOK: type: QUERY
+PREHOOK: Output: default@parquet_nullsplit@len=1
+POSTHOOK: query: insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red')
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@parquet_nullsplit@len=1
+POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).key SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).val SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: parquet_nullsplit
+                  Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: count()
+                      mode: hash
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col0 (type: bigint)
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: []
+                featureSupportInUse: []
+                inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+        Reducer 2 
+            Execution mode: vectorized
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from parquet_nullsplit where len = 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_nullsplit
+PREHOOK: Input: default@parquet_nullsplit@len=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_nullsplit where len = 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_nullsplit
+POSTHOOK: Input: default@parquet_nullsplit@len=1
+#### A masked pattern was here ####
+1
+PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: parquet_nullsplit
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                  Filter Operator
+                    predicate: (len = 99) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                    Select Operator
+                      Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                      Group By Operator
+                        aggregations: count()
+                        mode: hash
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                        Reduce Output Operator
+                          sort order: 
+                          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                          value expressions: _col0 (type: bigint)
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                inputFormatFeatureSupport: []
+                featureSupportInUse: []
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+        Reducer 2 
+            Execution mode: vectorized
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from parquet_nullsplit where len = 99
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_nullsplit
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_nullsplit where len = 99
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_nullsplit
+#### A masked pattern was here ####
+0
+PREHOOK: query: drop table parquet_nullsplit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_nullsplit
+PREHOOK: Output: default@parquet_nullsplit
+POSTHOOK: query: drop table parquet_nullsplit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_nullsplit
+POSTHOOK: Output: default@parquet_nullsplit
+PREHOOK: query: drop table parquet_project_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_project_test
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: drop table parquet_project_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_project_test
+POSTHOOK: Output: default@parquet_project_test
+PREHOOK: query: drop table parquet_types_staging
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: drop table parquet_types_staging
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_types_staging
diff --git a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
index bd8fbb15de8a6fa097eec317a633af060219f500..2b4c801b06724a7f0c28554115b8ec80118cb729 100644
--- a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
+++ b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
@@ -426,3 +426,196 @@ POSTHOOK: Input: default@parquet_project_test
 blue	7
 green	7
 red	8
+PREHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_nullsplit
+POSTHOOK: query: create table if not exists parquet_nullsplit(key string, val string) partitioned by (len int)
+stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_nullsplit
+PREHOOK: query: insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red')
+PREHOOK: type: QUERY
+PREHOOK: Output: default@parquet_nullsplit@len=1
+POSTHOOK: query: insert into table parquet_nullsplit partition(len=1)
+values ('one', 'red')
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@parquet_nullsplit@len=1
+POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).key SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: parquet_nullsplit PARTITION(len=1).val SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 1
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parquet_nullsplit
+            Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: count()
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: bigint)
+      Execution mode: vectorized
+      Map Vectorization:
+          enabled: true
+          enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+          inputFormatFeatureSupport: []
+          featureSupportInUse: []
+          inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+          allNative: false
+          usesVectorUDFAdaptor: false
+          vectorized: true
+      Reduce Vectorization:
+          enabled: false
+          enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+          enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from parquet_nullsplit where len = 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_nullsplit
+PREHOOK: Input: default@parquet_nullsplit@len=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_nullsplit where len = 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_nullsplit
+POSTHOOK: Input: default@parquet_nullsplit@len=1
+#### A masked pattern was here ####
+1
+PREHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_nullsplit where len = 99
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parquet_nullsplit
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            Filter Operator
+              predicate: (len = 99) (type: boolean)
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              Select Operator
+                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col0 (type: bigint)
+      Execution mode: vectorized
+      Map Vectorization:
+          enabled: true
+          inputFormatFeatureSupport: []
+          featureSupportInUse: []
+          allNative: false
+          usesVectorUDFAdaptor: false
+          vectorized: true
+      Reduce Vectorization:
+          enabled: false
+          enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+          enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from parquet_nullsplit where len = 99
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_nullsplit
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_nullsplit where len = 99
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_nullsplit
+#### A masked pattern was here ####
+0
+PREHOOK: query: drop table parquet_nullsplit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_nullsplit
+PREHOOK: Output: default@parquet_nullsplit
+POSTHOOK: query: drop table parquet_nullsplit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_nullsplit
+POSTHOOK: Output: default@parquet_nullsplit
+PREHOOK: query: drop table parquet_project_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_project_test
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: drop table parquet_project_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_project_test
+POSTHOOK: Output: default@parquet_project_test
+PREHOOK: query: drop table parquet_types_staging
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: drop table parquet_types_staging
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_types_staging
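
Note on the shape of the fix: the patch hoists the split-independent setup (colsToInclude,
jobConf, rbCtx) out of initialize() and into the constructor, so those fields are populated
even when getSplit(oldInputSplit, conf) returns null -- which is exactly what happens when a
predicate such as "len = 99" selects an empty partition and no Parquet split exists. Before
the patch, a null split meant initialize() was never reached past its null check, rbCtx stayed
null, and later use of it threw the NPE. The following minimal, self-contained Java sketch
illustrates the pattern outside the Hive codebase; the names (NullSplitSketch,
NullTolerantReader, BatchContext, Split) are hypothetical stand-ins, not Hive's actual API.

    // Stand-in for VectorizedRowBatchCtx: can build an (empty) batch
    // without ever having seen a split.
    class BatchContext {
      Object createRowBatch() {
        return new Object();
      }
    }

    // Stand-in for ParquetInputSplit.
    class Split {
      final String path;
      Split(String path) {
        this.path = path;
      }
    }

    class NullTolerantReader {
      private final BatchContext rbCtx; // split-independent, set up front
      private Split split;              // may legitimately stay null

      NullTolerantReader(Split maybeNullSplit) {
        // The fix: split-independent state is initialized BEFORE the
        // null check, mirroring how the patch moved the
        // colsToInclude/jobConf/rbCtx setup into the constructor.
        this.rbCtx = new BatchContext();
        if (maybeNullSplit != null) {
          initialize(maybeNullSplit); // split-dependent setup only
        }
        // Pre-patch, rbCtx was assigned inside initialize(); with a
        // null split it stayed null and downstream code threw the NPE.
      }

      private void initialize(Split split) {
        this.split = split;
      }

      Object createValue() {
        return rbCtx.createRowBatch(); // safe even with no split
      }

      boolean next() {
        return split != null; // an empty partition yields no rows
      }
    }

    public class NullSplitSketch {
      public static void main(String[] args) {
        NullTolerantReader reader = new NullTolerantReader(null);
        System.out.println("batch created: " + reader.createValue()); // no NPE
        System.out.println("has rows: " + reader.next());             // false
      }
    }

Run against a null split, the sketch constructs cleanly and reports no rows, which is the
behavior the "len = 99" test above locks in: the vectorized reader comes up, reads nothing,
and count(*) returns 0 instead of failing during initialization.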