From 2db93c7eff436019d6c3b4a1ce387b49ede6cd43 Mon Sep 17 00:00:00 2001
From: Ashutosh Chauhan
Date: Thu, 5 Nov 2015 18:25:24 -0800
Subject: [PATCH] HIVE-12232 : BucketingSortingReduceSinkOptimizer throws IOB
 exception for duplicate columns

---
 .../BucketingSortingReduceSinkOptimizer.java    |  6 ++
 .../clientpositive/insertoverwrite_bucket.q     |  9 +++
 .../clientpositive/insertoverwrite_bucket.q.out | 78 ++++++++++++++++++++++
 3 files changed, 93 insertions(+)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketingSortingReduceSinkOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketingSortingReduceSinkOptimizer.java
index a090a5b..d5df34c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketingSortingReduceSinkOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketingSortingReduceSinkOptimizer.java
@@ -588,6 +588,12 @@ else if (op instanceof SelectOperator) {
         }
         // Only columns can be selected for both sorted and bucketed positions
         for (int pos : bucketPositions) {
+          if (pos >= selectDesc.getColList().size()) {
+            // e.g., INSERT OVERWRITE TABLE temp1 SELECT c0, c0 FROM temp2;
+            // The Select operator has only one instance of c0 while the ReduceSink has two,
+            // so locating the bucket column by position would go out of bounds. Bail out.
+            return null;
+          }
           ExprNodeDesc selectColList = selectDesc.getColList().get(pos);
           if (!(selectColList instanceof ExprNodeColumnDesc)) {
             return null;
diff --git a/ql/src/test/queries/clientpositive/insertoverwrite_bucket.q b/ql/src/test/queries/clientpositive/insertoverwrite_bucket.q
index d939710..5a10f94 100644
--- a/ql/src/test/queries/clientpositive/insertoverwrite_bucket.q
+++ b/ql/src/test/queries/clientpositive/insertoverwrite_bucket.q
@@ -18,10 +18,19 @@ insert into table bucketinput values ("firstinsert3");
 set hive.enforce.bucketing = true;
 set hive.enforce.sorting=true;
 insert overwrite table bucketoutput1 select * from bucketinput where data like 'first%';
+CREATE TABLE temp1
+(
+    change string,
+    num string
+)
+CLUSTERED BY (num) SORTED BY (num) INTO 4 BUCKETS;
+explain insert overwrite table temp1 select data, data from bucketinput;
+
 set hive.auto.convert.sortmerge.join=true;
 set hive.optimize.bucketmapjoin = true;
 set hive.optimize.bucketmapjoin.sortedmerge = true;
 select * from bucketoutput1 a join bucketoutput2 b on (a.data=b.data);
+drop table temp1;
 drop table buckettestinput;
 drop table buckettestoutput1;
 drop table buckettestoutput2;
diff --git a/ql/src/test/results/clientpositive/insertoverwrite_bucket.q.out b/ql/src/test/results/clientpositive/insertoverwrite_bucket.q.out
index 9b7b85d..4add20c 100644
--- a/ql/src/test/results/clientpositive/insertoverwrite_bucket.q.out
+++ b/ql/src/test/results/clientpositive/insertoverwrite_bucket.q.out
@@ -80,6 +80,76 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucketinput
 POSTHOOK: Output: default@bucketoutput1
 POSTHOOK: Lineage: bucketoutput1.data SIMPLE [(bucketinput)bucketinput.FieldSchema(name:data, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE temp1
+(
+    change string,
+    num string
+)
+CLUSTERED BY (num) SORTED BY (num) INTO 4 BUCKETS
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@temp1
+POSTHOOK: query: CREATE TABLE temp1
+(
+    change string,
+    num string
+)
+CLUSTERED BY (num) SORTED BY (num) INTO 4 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@temp1
+PREHOOK: query: explain insert overwrite table temp1 select data, data from bucketinput
+PREHOOK: type: QUERY
+POSTHOOK: query: explain insert overwrite table temp1 select data, data from bucketinput
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: bucketinput
+            Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: data (type: string)
+              outputColumnNames: _col1
+              Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: _col1 (type: string)
+                sort order: +
+                Map-reduce partition columns: _col1 (type: string)
+                Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Select Operator
+          expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey0 (type: string)
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                name: default.temp1
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.temp1
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
 PREHOOK: query: select * from bucketoutput1 a join bucketoutput2 b on (a.data=b.data)
 PREHOOK: type: QUERY
 PREHOOK: Input: default@bucketoutput1
@@ -90,6 +160,14 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucketoutput1
 POSTHOOK: Input: default@bucketoutput2
 #### A masked pattern was here ####
+PREHOOK: query: drop table temp1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@temp1
+PREHOOK: Output: default@temp1
+POSTHOOK: query: drop table temp1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@temp1
+POSTHOOK: Output: default@temp1
 PREHOOK: query: drop table buckettestinput
 PREHOOK: type: DROPTABLE
 POSTHOOK: query: drop table buckettestinput
-- 
1.7.12.4 (Apple Git-37)
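
A note for reviewers, after the patch: the guard can be understood in isolation with a minimal, hypothetical Java sketch. The rsColumns and selectColumns lists below are illustrative stand-ins for the ReduceSink and Select operator schemas (they are not Hive APIs); the sketch only demonstrates why a bucket position derived from the ReduceSink side can overflow a de-duplicated Select column list.

import java.util.Arrays;
import java.util.List;

// Hypothetical, self-contained illustration of the failure this patch guards against.
public class DuplicateColumnIobDemo {
  public static void main(String[] args) {
    // The ReduceSink schema carries both instances of the duplicated column...
    List<String> rsColumns = Arrays.asList("_col0", "_col1");
    // ...while the Select operator keeps a single instance of it.
    List<String> selectColumns = Arrays.asList("data");

    // Bucket column position derived from the ReduceSink side.
    int bucketPos = rsColumns.size() - 1; // == 1

    // Without the guard, selectColumns.get(bucketPos) throws
    // IndexOutOfBoundsException; the patch returns null (bails out
    // of the optimization) instead of attempting the lookup.
    if (bucketPos >= selectColumns.size()) {
      System.out.println("bail out: position " + bucketPos
          + " >= select column list size " + selectColumns.size());
      return;
    }
    System.out.println(selectColumns.get(bucketPos));
  }
}

With the guard in place the optimizer simply skips the rewrite, which is why the new explain output in insertoverwrite_bucket.q.out still shows a full map-reduce plan (a Reduce Output Operator feeding a reduce-side Select) rather than a bucketed map-only write.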