From 32885bbd5a3f4f224e9fafc94c146925d740049c Mon Sep 17 00:00:00 2001
From: Peter Slawski
Date: Mon, 27 Apr 2015 17:57:50 -0700
Subject: [PATCH] HIVE-10538: Fix NPE in FileSinkOperator from hashcode mismatch

This fixes a NullPointerException occurring in FileSinkOperator when
using bucketed tables with partition columns and multiFileSpray enabled.
The NPE comes from a mismatch between the hashcodes that
ReduceSinkOperator and FileSinkOperator compute for a given row, which
leads FileSinkOperator to compute the wrong bucket number when getting
the writer offset.
---
 .../test/resources/testconfiguration.properties   |   1 +
 .../hadoop/hive/ql/exec/ReduceSinkOperator.java   |   3 +-
 ql/src/test/queries/clientpositive/bucket_many.q  |  16 ++
 .../test/results/clientpositive/bucket_many.q.out | 230 +++++++++++++++++++++
 4 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 ql/src/test/queries/clientpositive/bucket_many.q
 create mode 100644 ql/src/test/results/clientpositive/bucket_many.q.out

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index b7abf0d..c56efb6 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -3,6 +3,7 @@ minimr.query.files=auto_sortmerge_join_16.q,\
   bucket4.q,\
   bucket5.q,\
   bucket6.q,\
+  bucket_many.q,\
   bucket_num_reducers.q,\
   bucket_num_reducers2.q,\
   bucketizedhiveinputformat.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
index 468d87f..859a28f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
@@ -125,7 +125,7 @@
   protected transient Object[] cachedValues;
   protected transient List<List<Integer>> distinctColIndices;
   protected transient Random random;
-  protected transient int bucketNumber;
+  protected transient int bucketNumber = -1;
 
   /**
    * This two dimensional array holds key data and a corresponding Union object
@@ -552,6 +552,7 @@ private BytesWritable makeValueWritable(Object row) throws Exception {
     // in case of bucketed table, insert the bucket number as the last column in value
     if (bucketEval != null) {
       length -= 1;
+      assert bucketNumber >= 0;
       cachedValues[length] = new Text(String.valueOf(bucketNumber));
     }
 
diff --git a/ql/src/test/queries/clientpositive/bucket_many.q b/ql/src/test/queries/clientpositive/bucket_many.q
new file mode 100644
index 0000000..1f0b795
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/bucket_many.q
@@ -0,0 +1,16 @@
+set hive.enforce.bucketing = true;
+set mapred.reduce.tasks = 16;
+
+create table bucket_many(key int, value string) clustered by (key) into 256 buckets;
+
+explain extended
+insert overwrite table bucket_many
+select * from src;
+
+insert overwrite table bucket_many
+select * from src;
+
+explain
+select * from bucket_many tablesample (bucket 1 out of 256) s;
+
+select * from bucket_many tablesample (bucket 1 out of 256) s;
diff --git a/ql/src/test/results/clientpositive/bucket_many.q.out b/ql/src/test/results/clientpositive/bucket_many.q.out
new file mode 100644
index 0000000..9f09163
--- /dev/null
+++ b/ql/src/test/results/clientpositive/bucket_many.q.out
@@ -0,0 +1,230 @@
+PREHOOK: query: create table bucket_many(key int, value string) clustered by (key) into 256 buckets
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@bucket_many
+POSTHOOK: query: create table bucket_many(key int, value string) clustered by (key) into 256 buckets
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@bucket_many
+PREHOOK: query: explain extended
+insert overwrite table bucket_many
+select * from src
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended
+insert overwrite table bucket_many
+select * from src
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  
+TOK_QUERY
+   TOK_FROM
+      TOK_TABREF
+         TOK_TABNAME
+            src
+   TOK_INSERT
+      TOK_DESTINATION
+         TOK_TAB
+            TOK_TABNAME
+               bucket_many
+      TOK_SELECT
+         TOK_SELEXPR
+            TOK_ALLCOLREF
+
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            GatherStats: false
+            Select Operator
+              expressions: key (type: string), value (type: string)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                sort order: 
+                Map-reduce partition columns: UDFToInteger(_col0) (type: int)
+                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                tag: -1
+                value expressions: _col0 (type: string), _col1 (type: string)
+                auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: src
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE true
+              bucket_count -1
+              columns key,value
+              columns.comments 'default','default'
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.src
+              numFiles 1
+              numRows 500
+              rawDataSize 5312
+              serialization.ddl struct src { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                COLUMN_STATS_ACCURATE true
+                bucket_count -1
+                columns key,value
+                columns.comments 'default','default'
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.src
+                numFiles 1
+                numRows 500
+                rawDataSize 5312
+                serialization.ddl struct src { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 5812
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src
+            name: default.src
+      Truncated Path -> Alias:
+        /src [src]
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Select Operator
+          expressions: UDFToInteger(VALUE._col0) (type: int), VALUE._col1 (type: string)
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            GlobalTableId: 1
+#### A masked pattern was here ####
+            NumFilesPerFileSink: 16
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                properties:
+                  bucket_count 256
+                  bucket_field_name key
+                  columns key,value
+                  columns.comments 
+                  columns.types int:string
+#### A masked pattern was here ####
+                  name default.bucket_many
+                  serialization.ddl struct bucket_many { i32 key, string value}
+                  serialization.format 1
+                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                name: default.bucket_many
+            TotalFiles: 256
+            GatherStats: true
+            MultiFileSpray: true
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count 256
+                bucket_field_name key
+                columns key,value
+                columns.comments 
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.bucket_many
+                serialization.ddl struct bucket_many { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.bucket_many
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+PREHOOK: query: insert overwrite table bucket_many
+select * from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@bucket_many
+POSTHOOK: query: insert overwrite table bucket_many
+select * from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@bucket_many
+POSTHOOK: Lineage: bucket_many.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucket_many.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: explain
+select * from bucket_many tablesample (bucket 1 out of 256) s
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select * from bucket_many tablesample (bucket 1 out of 256) s
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: s
+            Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (((hash(key) & 2147483647) % 256) = 0) (type: boolean)
+              Statistics: Num rows: 27 Data size: 2853 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: int), value (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 27 Data size: 2853 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 27 Data size: 2853 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from bucket_many tablesample (bucket 1 out of 256) s
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_many
+#### A masked pattern was here ####
+POSTHOOK: query: select * from bucket_many tablesample (bucket 1 out of 256) s
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_many
+#### A masked pattern was here ####
+256	val_256
+0	val_0
+0	val_0
+0	val_0
+256	val_256
-- 
2.3.6
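
Note on the mechanism, for readers tracing the bug rather than applying the patch: both operators must agree on the row-to-bucket mapping that the second explain plan spells out in its filter predicate, ((hash(key) & 2147483647) % 256). Below is a minimal, self-contained Java sketch of that computation. It is not Hive source; in particular, the identity hash for int keys is an assumption, made here because it is consistent with the sampled rows above (keys 0 and 256 both landing in "bucket 1 out of 256").

public class BucketNumberSketch {

  // Assumption (not taken from the patch): an int key hashes to the value
  // itself, which is what puts keys 0 and 256 in the same bucket.
  static int hash(int key) {
    return key;
  }

  // Mirrors the plan's predicate: mask with Integer.MAX_VALUE (2147483647)
  // to force a non-negative hash, then take the modulo to pick a bucket.
  static int bucketFor(int key, int numBuckets) {
    return (hash(key) & Integer.MAX_VALUE) % numBuckets;
  }

  public static void main(String[] args) {
    // Keys 0 and 256 both map to bucket 0, matching the val_0/val_256
    // rows in the tablesample output above.
    for (int key : new int[] {0, 86, 255, 256, 257}) {
      System.out.printf("key %3d -> bucket %d%n", key, bucketFor(key, 256));
    }
  }
}

The patch itself does not change this arithmetic. It initializes bucketNumber to -1 and asserts it is non-negative before the value is serialized, so a row whose bucket number was never computed fails fast at the assert in ReduceSinkOperator instead of emitting a bogus bucket value that later sends FileSinkOperator to the wrong writer offset.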