From 9cc45ef0d5ebac2f5d4e5895596af9ffb45aa48c Mon Sep 17 00:00:00 2001
From: Peter Slawski
Date: Mon, 27 Apr 2015 17:57:50 -0700
Subject: [PATCH] HIVE-10538: Fix NPE in FileSinkOperator from hashcode mismatch

This fixes a NullPointerException thrown by FileSinkOperator when writing
to a bucketed table with partition columns and multiFileSpray enabled. The
NPE is caused by ReduceSinkOperator and FileSinkOperator computing
mismatched hashcodes for the same row, which leads FileSinkOperator to
derive the wrong bucket number when it looks up the writer offset. (A
standalone sketch of the mismatch follows the version trailer at the end
of this patch.)
---
 .../test/resources/testconfiguration.properties    |   1 +
 .../hadoop/hive/ql/exec/ReduceSinkOperator.java    |   3 +-
 ql/src/test/queries/clientpositive/bucket_many.q   |  16 ++
 .../test/results/clientpositive/bucket_many.q.out  | 230 +++++++++++++++++++++
 .../results/clientpositive/spark/cbo_gby.q.out     |   4 +-
 .../clientpositive/spark/cbo_udf_udaf.q.out        |   2 +-
 ...roupby_complex_types_multi_single_reducer.q.out |  38 ++--
 .../spark/lateral_view_explode2.q.out              |   4 +-
 .../clientpositive/spark/union_remove_25.q.out     |   2 +-
 .../clientpositive/spark/union_top_level.q.out     |  16 +-
 .../spark/vector_cast_constant.q.java1.7.out       |  20 +-
 .../spark/vector_cast_constant.q.java1.8.out       |  20 +-
 .../spark/vectorized_timestamp_funcs.q.out         |   4 +-
 13 files changed, 304 insertions(+), 56 deletions(-)
 create mode 100644 ql/src/test/queries/clientpositive/bucket_many.q
 create mode 100644 ql/src/test/results/clientpositive/bucket_many.q.out

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index b7abf0d..c56efb6 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -3,6 +3,7 @@ minimr.query.files=auto_sortmerge_join_16.q,\
   bucket4.q,\
   bucket5.q,\
   bucket6.q,\
+  bucket_many.q,\
   bucket_num_reducers.q,\
   bucket_num_reducers2.q,\
   bucketizedhiveinputformat.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
index 468d87f..859a28f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
@@ -125,7 +125,7 @@
   protected transient Object[] cachedValues;
   protected transient List<List<Integer>> distinctColIndices;
   protected transient Random random;
-  protected transient int bucketNumber;
+  protected transient int bucketNumber = -1;
 
   /**
    * This two dimensional array holds key data and a corresponding Union object
@@ -552,6 +552,7 @@ private BytesWritable makeValueWritable(Object row) throws Exception {
     // in case of bucketed table, insert the bucket number as the last column in value
     if (bucketEval != null) {
       length -= 1;
+      assert bucketNumber >= 0;
       cachedValues[length] = new Text(String.valueOf(bucketNumber));
     }
 
diff --git a/ql/src/test/queries/clientpositive/bucket_many.q b/ql/src/test/queries/clientpositive/bucket_many.q
new file mode 100644
index 0000000..1f0b795
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/bucket_many.q
@@ -0,0 +1,16 @@
+set hive.enforce.bucketing = true;
+set mapred.reduce.tasks = 16;
+
+create table bucket_many(key int, value string) clustered by (key) into 256 buckets;
+
+explain extended
+insert overwrite table bucket_many
+select * from src;
+
+insert overwrite table bucket_many
+select * from src;
+
+explain
+select * from bucket_many tablesample (bucket 1 out of 256) s;
+
+select * from bucket_many tablesample (bucket 1 out of 256) s;
diff --git a/ql/src/test/results/clientpositive/bucket_many.q.out b/ql/src/test/results/clientpositive/bucket_many.q.out
new file mode 100644
index 0000000..9f09163
--- /dev/null
+++ b/ql/src/test/results/clientpositive/bucket_many.q.out
@@ -0,0 +1,230 @@
+PREHOOK: query: create table bucket_many(key int, value string) clustered by (key) into 256 buckets
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@bucket_many
+POSTHOOK: query: create table bucket_many(key int, value string) clustered by (key) into 256 buckets
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@bucket_many
+PREHOOK: query: explain extended
+insert overwrite table bucket_many
+select * from src
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended
+insert overwrite table bucket_many
+select * from src
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+
+TOK_QUERY
+   TOK_FROM
+      TOK_TABREF
+         TOK_TABNAME
+            src
+   TOK_INSERT
+      TOK_DESTINATION
+         TOK_TAB
+            TOK_TABNAME
+               bucket_many
+      TOK_SELECT
+         TOK_SELEXPR
+            TOK_ALLCOLREF
+
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            GatherStats: false
+            Select Operator
+              expressions: key (type: string), value (type: string)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                sort order: 
+                Map-reduce partition columns: UDFToInteger(_col0) (type: int)
+                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                tag: -1
+                value expressions: _col0 (type: string), _col1 (type: string)
+                auto parallelism: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: src
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              COLUMN_STATS_ACCURATE true
+              bucket_count -1
+              columns key,value
+              columns.comments 'default','default'
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.src
+              numFiles 1
+              numRows 500
+              rawDataSize 5312
+              serialization.ddl struct src { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                COLUMN_STATS_ACCURATE true
+                bucket_count -1
+                columns key,value
+                columns.comments 'default','default'
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.src
+                numFiles 1
+                numRows 500
+                rawDataSize 5312
+                serialization.ddl struct src { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 5812
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src
+            name: default.src
+      Truncated Path -> Alias:
+        /src [src]
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Select Operator
+          expressions: UDFToInteger(VALUE._col0) (type: int), VALUE._col1 (type: string)
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            GlobalTableId: 1
+#### A masked pattern was here ####
+            NumFilesPerFileSink: 16
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                properties:
+                  bucket_count 256
+                  bucket_field_name key
+                  columns key,value
+                  columns.comments 
+                  columns.types int:string
+#### A masked pattern was here ####
+                  name default.bucket_many
+                  serialization.ddl struct bucket_many { i32 key, string value}
+                  serialization.format 1
+                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                name: default.bucket_many
+            TotalFiles: 256
+            GatherStats: true
+            MultiFileSpray: true
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count 256
+                bucket_field_name key
+                columns key,value
+                columns.comments 
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.bucket_many
+                serialization.ddl struct bucket_many { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.bucket_many
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+PREHOOK: query: insert overwrite table bucket_many
+select * from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@bucket_many
+POSTHOOK: query: insert overwrite table bucket_many
+select * from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@bucket_many
+POSTHOOK: Lineage: bucket_many.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucket_many.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: explain
+select * from bucket_many tablesample (bucket 1 out of 256) s
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select * from bucket_many tablesample (bucket 1 out of 256) s
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: s
+            Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (((hash(key) & 2147483647) % 256) = 0) (type: boolean)
+              Statistics: Num rows: 27 Data size: 2853 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: int), value (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 27 Data size: 2853 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 27 Data size: 2853 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from bucket_many tablesample (bucket 1 out of 256) s
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_many
+#### A masked pattern was here ####
+POSTHOOK: query: select * from bucket_many tablesample (bucket 1 out of 256) s
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_many
+#### A masked pattern was here ####
+256 val_256
+0 val_0
+0 val_0
+0 val_0
+256 val_256
diff --git a/ql/src/test/results/clientpositive/spark/cbo_gby.q.out b/ql/src/test/results/clientpositive/spark/cbo_gby.q.out
index 67c7a63..9ca8a88 100644
--- a/ql/src/test/results/clientpositive/spark/cbo_gby.q.out
+++ b/ql/src/test/results/clientpositive/spark/cbo_gby.q.out
@@ -11,10 +11,10 @@ POSTHOOK: Input: default@cbo_t1
 POSTHOOK: Input: default@cbo_t1@dt=2014
 #### A masked pattern was here ####
 1 4 12
+1 4 2
 NULL NULL NULL
 1 4 2
 1 4 2
-1 4 2
 PREHOOK: query: select x, y, count(*) from (select key, (c_int+c_float+1+2) as x, sum(c_int) as y from cbo_t1 group by c_float, cbo_t1.c_int, key) R group by y, x
 PREHOOK: type: QUERY
 PREHOOK: Input: default@cbo_t1
@@ -25,9 +25,9 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@cbo_t1
 POSTHOOK: Input: default@cbo_t1@dt=2014
 #### A masked pattern was here ####
+5.0 2 3
 NULL NULL 1
 5.0 12 1
-5.0 2 3
 PREHOOK: query: select cbo_t3.c_int, c, count(*) from (select key as a, c_int+1 as b, sum(c_int) as c from cbo_t1 where (cbo_t1.c_int + 1 >= 0) and (cbo_t1.c_int > 0 or cbo_t1.c_float >= 0) group by c_float, cbo_t1.c_int, key order by a) cbo_t1 join (select key as p, c_int+1 as q, sum(c_int) as r from cbo_t2 where (cbo_t2.c_int + 1 >= 0) and (cbo_t2.c_int > 0 or cbo_t2.c_float >= 0) group by c_float, cbo_t2.c_int, key order by q/10 desc, r asc) cbo_t2 on cbo_t1.a=p join cbo_t3 on cbo_t1.a=key where (b + cbo_t2.q >= 0) and (b > 0 or c_int >= 0) group by cbo_t3.c_int, c order by cbo_t3.c_int+c desc, c
 PREHOOK: type: QUERY
 PREHOOK: Input: default@cbo_t1
diff --git a/ql/src/test/results/clientpositive/spark/cbo_udf_udaf.q.out b/ql/src/test/results/clientpositive/spark/cbo_udf_udaf.q.out
index 932943d..ded043f 100644
--- a/ql/src/test/results/clientpositive/spark/cbo_udf_udaf.q.out
+++ b/ql/src/test/results/clientpositive/spark/cbo_udf_udaf.q.out
@@ -79,9 +79,9 @@ POSTHOOK: Input: default@cbo_t1
 POSTHOOK: Input: default@cbo_t1@dt=2014
 #### A masked pattern was here ####
 NULL 0 NULL
+1 2 1.0
 1 2 1.0
 1 2 1.0
-1 2 1.0
 1 12 1.0
 PREHOOK: query: select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_float order by a
 PREHOOK: type: QUERY
diff --git a/ql/src/test/results/clientpositive/spark/groupby_complex_types_multi_single_reducer.q.out b/ql/src/test/results/clientpositive/spark/groupby_complex_types_multi_single_reducer.q.out
index 9118845..9fe3b72 100644
--- a/ql/src/test/results/clientpositive/spark/groupby_complex_types_multi_single_reducer.q.out
+++ b/ql/src/test/results/clientpositive/spark/groupby_complex_types_multi_single_reducer.q.out
@@ -204,16 +204,16 @@ POSTHOOK: query: SELECT DEST1.* FROM DEST1
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@dest1
 #### A masked pattern was here ####
-["166"] 1
-["169"] 4
+["118"] 2
+["180"] 1
+["201"] 1
+["202"] 1
 ["238"] 2
-["258"] 1
-["306"] 1
-["384"] 3
-["392"] 1
-["435"] 1
-["455"] 1
-["468"] 4
+["273"] 3
+["282"] 2
+["419"] 1
+["432"] 1
+["467"] 1
 PREHOOK: query: SELECT DEST2.* FROM DEST2
 PREHOOK: type: QUERY
 PREHOOK: Input: default@dest2
@@ -222,13 +222,13 @@ POSTHOOK: query: SELECT DEST2.* FROM DEST2
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@dest2
 #### A masked pattern was here ####
-{"120":"val_120"} 2
-{"129":"val_129"} 2
-{"160":"val_160"} 1
-{"26":"val_26"} 2
-{"27":"val_27"} 1
-{"288":"val_288"} 2
-{"298":"val_298"} 3
-{"30":"val_30"} 1
-{"311":"val_311"} 3
-{"74":"val_74"} 1
+{"0":"val_0"} 3
+{"138":"val_138"} 4
+{"170":"val_170"} 1
+{"19":"val_19"} 1
+{"222":"val_222"} 1
+{"223":"val_223"} 2
+{"226":"val_226"} 1
+{"489":"val_489"} 4
+{"8":"val_8"} 1
+{"80":"val_80"} 1
diff --git a/ql/src/test/results/clientpositive/spark/lateral_view_explode2.q.out b/ql/src/test/results/clientpositive/spark/lateral_view_explode2.q.out
index a5c95b5..41d60f5 100644
--- a/ql/src/test/results/clientpositive/spark/lateral_view_explode2.q.out
+++ b/ql/src/test/results/clientpositive/spark/lateral_view_explode2.q.out
@@ -93,9 +93,9 @@ POSTHOOK: query: SELECT col1, col2 FROM src LATERAL VIEW explode2(array(1,2,3))
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src
 #### A masked pattern was here ####
-2 2
-1 1
 3 3
+1 1
+2 2
 PREHOOK: query: DROP TEMPORARY FUNCTION explode2
 PREHOOK: type: DROPFUNCTION
 PREHOOK: Output: explode2
diff --git a/ql/src/test/results/clientpositive/spark/union_remove_25.q.out b/ql/src/test/results/clientpositive/spark/union_remove_25.q.out
index f32aaea..5853cc0 100644
--- a/ql/src/test/results/clientpositive/spark/union_remove_25.q.out
+++ b/ql/src/test/results/clientpositive/spark/union_remove_25.q.out
@@ -424,7 +424,7 @@ Partition Parameters:
 	numFiles            	2
 	numRows             	-1
 	rawDataSize         	-1
-	totalSize           	6814
+	totalSize           	6826
 #### A masked pattern was here ####
 
 # Storage Information
diff --git a/ql/src/test/results/clientpositive/spark/union_top_level.q.out b/ql/src/test/results/clientpositive/spark/union_top_level.q.out
index f57fc04..a64fc95 100644
--- a/ql/src/test/results/clientpositive/spark/union_top_level.q.out
+++ b/ql/src/test/results/clientpositive/spark/union_top_level.q.out
@@ -348,18 +348,18 @@ POSTHOOK: Input: default@src
 0 val_0
 0 val_0
 0 val_0
-10 val_10
-10 val_10
+0 val_0
+0 val_0
+100 val_100
+100 val_100
+100 val_100
+100 val_100
+100 val_100
+100 val_100
 100 val_100
 100 val_100
-103 val_103
-103 val_103
-104 val_104
-104 val_104
 104 val_104
 104 val_104
-111 val_111
-111 val_111
 PREHOOK: query: -- ctas
 explain
 create table union_top as
diff --git a/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.7.out b/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.7.out
index e159c8b..609826c 100644
--- a/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.7.out
+++ b/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.7.out
@@ -191,13 +191,13 @@ POSTHOOK: query: SELECT
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@over1korc
 #### A masked pattern was here ####
-65598 50.0 50.0 50
-65694 50.0 50.0 50
-65678 50.0 50.0 50
-65684 50.0 50.0 50
-65596 50.0 50.0 50
-65692 50.0 50.0 50
-65630 50.0 50.0 50
-65674 50.0 50.0 50
-65628 50.0 50.0 50
-65776 50.0 50.0 50
+65759 50.0 50.0 50
+65617 50.0 50.0 50
+65715 50.0 50.0 50
+65769 50.0 50.0 50
+65611 50.0 50.0 50
+65693 50.0 50.0 50
+65601 50.0 50.0 50
+65679 50.0 50.0 50
+65681 50.0 50.0 50
+65615 50.0 50.0 50
diff --git a/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.8.out b/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.8.out
index 43c07e6..c0c53d0 100644
--- a/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.8.out
+++ b/ql/src/test/results/clientpositive/spark/vector_cast_constant.q.java1.8.out
@@ -191,13 +191,13 @@ POSTHOOK: query: SELECT
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@over1korc
 #### A masked pattern was here ####
-65788 50.0 50.0 50
-65598 50.0 50.0 50
-65694 50.0 50.0 50
-65678 50.0 50.0 50
-65684 50.0 50.0 50
-65596 50.0 50.0 50
-65692 50.0 50.0 50
-65630 50.0 50.0 50
-65674 50.0 50.0 50
-65628 50.0 50.0 50
+65759 50.0 50.0 50
+65617 50.0 50.0 50
+65715 50.0 50.0 50
+65611 50.0 50.0 50
+65693 50.0 50.0 50
+65601 50.0 50.0 50
+65545 50.0 50.0 50
+65679 50.0 50.0 50
+65681 50.0 50.0 50
+65615 50.0 50.0 50
diff --git a/ql/src/test/results/clientpositive/spark/vectorized_timestamp_funcs.q.out b/ql/src/test/results/clientpositive/spark/vectorized_timestamp_funcs.q.out
index 3a0c3f1..3044582 100644
--- a/ql/src/test/results/clientpositive/spark/vectorized_timestamp_funcs.q.out
+++ b/ql/src/test/results/clientpositive/spark/vectorized_timestamp_funcs.q.out
@@ -768,7 +768,7 @@ FROM alltypesorc_string
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@alltypesorc_string
 #### A masked pattern was here ####
-1123143.857
+1123143.8569999998
 PREHOOK: query: EXPLAIN SELECT
 avg(ctimestamp1),
 variance(ctimestamp1),
@@ -868,4 +868,4 @@ FROM alltypesorc_string
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@alltypesorc_string
 #### A masked pattern was here ####
-2.8798560435897438E13 8.970772952794215E19 8.970772952794215E19 9.206845925236167E19 9.471416447815086E9 9.471416447815086E9 9.471416447815086E9 9.595231068211004E9
+2.8798560435897438E13 8.970772952794214E19 8.970772952794214E19 9.206845925236167E19 9.471416447815086E9 9.471416447815086E9 9.471416447815086E9 9.595231068211004E9
-- 
2.4.0
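Reviewer note (not part of the patch; text below the version trailer is not
applied): the following is a minimal, self-contained sketch of the failure
mode described in the commit message, under stated assumptions. It uses no
Hive classes; the bucket arithmetic mirrors the
(hash(key) & 2147483647) % 256 expression visible in the EXPLAIN output
above, while the class name, the two differing hash computations, and the
writer-offset array are invented for illustration and are not the actual
operator code path.

  import java.util.Arrays;

  // Illustrative only: shows how two operators that hash the "same" row
  // differently derive different bucket numbers, and how the mismatched
  // bucket number turns into a null writer offset under multiFileSpray.
  public class BucketMismatchSketch {

    // Bucket arithmetic as in the plan above: (hash & Integer.MAX_VALUE) % n
    static int toBucket(int hash, int numBuckets) {
      return (hash & Integer.MAX_VALUE) % numBuckets;
    }

    public static void main(String[] args) {
      int numBuckets = 256;
      Object[] row = {42, "val_42"};

      // Sender side: hash only the bucketing column (clustered by (key)).
      int senderHash = Arrays.hashCode(new Object[] {row[0]});

      // Receiver side: suppose it hashes different inputs for the same row
      // (here, the whole row); the derived bucket numbers then disagree.
      int receiverHash = Arrays.hashCode(row);

      int senderBucket = toBucket(senderHash, numBuckets);
      int receiverBucket = toBucket(receiverHash, numBuckets);
      System.out.printf("sender bucket=%d, receiver bucket=%d%n",
          senderBucket, receiverBucket);

      // The bucket number indexes per-bucket writer state; only the
      // sender's slot was populated, so the mismatched lookup finds null.
      Integer[] writerOffsets = new Integer[numBuckets];
      writerOffsets[senderBucket] = 0;
      Integer offset = writerOffsets[receiverBucket];
      System.out.println("writer offset seen by receiver: " + offset);
      // Unboxing that null (int o = writerOffsets[receiverBucket];) is the
      // kind of NullPointerException this patch guards against.
    }
  }

Running the sketch prints two different bucket numbers for the same row and
a null writer offset. The patch itself takes the defensive side of this:
bucketNumber now starts at -1 and is asserted to be set before it is
serialized into the value, so a stale or unset bucket number fails fast
instead of silently steering FileSinkOperator to the wrong writer.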