Index: ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out =================================================================== --- ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out (revision 0) +++ ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out (working copy) @@ -0,0 +1,45 @@ +PREHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table +-- do not match the number of files +drop table table1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table +-- do not match the number of files +drop table table1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table table2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table table2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table table1(key string, value string) clustered by (key, value) +into 2 BUCKETS stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table table1(key string, value string) clustered by (key, value) +into 2 BUCKETS stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@table1 +PREHOOK: query: create table table2(key string, value string) clustered by (value, key) +into 2 BUCKETS stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table table2(key string, value string) clustered by (value, key) +into 2 BUCKETS stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@table2 +PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 +PREHOOK: type: LOAD +PREHOOK: Output: default@table1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table2 +PREHOOK: type: LOAD +PREHOOK: Output: default@table2 +POSTHOOK: 
query: load data local inpath '../data/files/T1.txt' overwrite into table table2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table2 +PREHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2 +PREHOOK: type: LOAD +PREHOOK: Output: default@table2 +POSTHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table2 +FAILED: SemanticException [Error 10141]: Bucketed table metadata is not correct. Fix the metadata or don't use bucketed mapjoin, by setting hive.enforce.bucketmapjoin to false. The number of buckets for table table1 is 2, whereas the number of files is 1 Index: ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out =================================================================== --- ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out (revision 0) +++ ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out (working copy) @@ -0,0 +1,59 @@ +PREHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table +-- do not match the number of files +drop table table1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table +-- do not match the number of files +drop table table1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table table2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table table2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table table1(key string, value string) partitioned by (ds string) clustered by (key, value) +into 2 BUCKETS stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table table1(key string, value string) partitioned by (ds string) clustered by (key, value) +into 2 BUCKETS stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@table1 +PREHOOK: query: create table table2(key string, 
value string) clustered by (value, key) +into 2 BUCKETS stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table table2(key string, value string) clustered by (value, key) +into 2 BUCKETS stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@table2 +PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='1') +PREHOOK: type: LOAD +PREHOOK: Output: default@table1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='1') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table1 +POSTHOOK: Output: default@table1@ds=1 +PREHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table1 partition (ds='1') +PREHOOK: type: LOAD +PREHOOK: Output: default@table1@ds=1 +POSTHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table1 partition (ds='1') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table1@ds=1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='2') +PREHOOK: type: LOAD +PREHOOK: Output: default@table1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='2') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table1 +POSTHOOK: Output: default@table1@ds=2 +PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table2 +PREHOOK: type: LOAD +PREHOOK: Output: default@table2 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table2 +PREHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2 +PREHOOK: type: LOAD +PREHOOK: Output: default@table2 +POSTHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@table2 +FAILED: SemanticException [Error 10141]: 
Bucketed table metadata is not correct. Fix the metadata or don't use bucketed mapjoin, by setting hive.enforce.bucketmapjoin to false. The number of buckets for table table1 partition ds=1 is 2, whereas the number of files is 1 Index: ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q =================================================================== --- ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q (revision 0) +++ ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q (working copy) @@ -0,0 +1,20 @@ +-- Although the user has specified a bucketed map-join, the number of buckets in the table +-- do not match the number of files +drop table table1; +drop table table2; + +create table table1(key string, value string) clustered by (key, value) +into 2 BUCKETS stored as textfile; +create table table2(key string, value string) clustered by (value, key) +into 2 BUCKETS stored as textfile; + +load data local inpath '../data/files/T1.txt' overwrite into table table1; + +load data local inpath '../data/files/T1.txt' overwrite into table table2; +load data local inpath '../data/files/T2.txt' overwrite into table table2; + +set hive.optimize.bucketmapjoin = true; +set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; + +select /*+ mapjoin(b) */ count(*) from table1 a join table2 b on a.key=b.key and a.value=b.value; + Index: ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q =================================================================== --- ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q (revision 0) +++ ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q (working copy) @@ -0,0 +1,24 @@ +-- Although the user has specified a bucketed map-join, the number of buckets in the table +-- do not match the number of files +drop table table1; +drop table table2; + +create table table1(key string, value 
string) partitioned by (ds string) clustered by (key, value) +into 2 BUCKETS stored as textfile; +create table table2(key string, value string) clustered by (value, key) +into 2 BUCKETS stored as textfile; + +load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='1'); +load data local inpath '../data/files/T2.txt' overwrite into table table1 partition (ds='1'); + +load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='2'); + +load data local inpath '../data/files/T1.txt' overwrite into table table2; +load data local inpath '../data/files/T2.txt' overwrite into table table2; + +set hive.optimize.bucketmapjoin = true; +set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; + +select /*+ mapjoin(b) */ count(*) from table1 a join table2 b +on a.key=b.key and a.value=b.value and a.ds is not null; + Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (revision 1373960) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (working copy) @@ -234,12 +234,21 @@ return false; } List fileNames = getOnePartitionBucketFileNames(p.getDataLocation()); + // The number of files for the partition should be the same as the number of buckets. 
+ int bucketCount = p.getBucketCount(); + if (fileNames.size() != bucketCount) { + String msg = "The number of buckets for table " + + tbl.getTableName() + " partition " + p.getName() + " is " + + p.getBucketCount() + ", whereas the number of files is " + fileNames.size(); + throw new SemanticException( + ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); + } if (alias.equals(baseBigAlias)) { bigTblPartsToBucketFileNames.put(p, fileNames); - bigTblPartsToBucketNumber.put(p, p.getBucketCount()); + bigTblPartsToBucketNumber.put(p, bucketCount); } else { files.add(fileNames); - buckets.add(p.getBucketCount()); + buckets.add(bucketCount); } } if (!alias.equals(baseBigAlias)) { @@ -253,6 +262,14 @@ } List fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation()); Integer num = new Integer(tbl.getNumBuckets()); + // The number of files for the table should be same as number of buckets. + if (fileNames.size() != num) { + String msg = "The number of buckets for table " + + tbl.getTableName() + " is " + tbl.getNumBuckets() + + ", whereas the number of files is " + fileNames.size(); + throw new SemanticException( + ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); + } if (alias.equals(baseBigAlias)) { bigTblPartsToBucketFileNames.put(null, fileNames); bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets()); Index: ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java (revision 1373960) +++ ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java (working copy) @@ -246,6 +246,11 @@ EXPRESSIONS_NOT_ALLOWED_SORTBY(10140, "Expressions are not allowed in a sort by clause. Use a column alias instead"), + BUCKETED_TABLE_METADATA_INCORRECT(10141, + "Bucketed table metadata is not correct. 
" + + "Fix the metadata or don't use bucketed mapjoin, by setting " + + "hive.enforce.bucketmapjoin to false."), + SCRIPT_INIT_ERROR(20000, "Unable to initialize custom script."), SCRIPT_IO_ERROR(20001, "An error occurred while reading or writing to your custom script. " + "It may have crashed with an error."),