diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
index 191cfbadc6..5933328b22 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
@@ -21,6 +21,7 @@
 import java.util.*;
 import java.util.Map.Entry;
 
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.Order;
 import org.apache.hadoop.hive.ql.exec.GroupByOperator;
 import org.apache.hadoop.hive.ql.exec.JoinOperator;
@@ -153,32 +154,34 @@ public boolean checkBucketedTable(Table tbl, ParseContext pGraphContext,
       return false;
     }
 
-    if (tbl.isPartitioned()) {
-      List<Partition> partitions = prunedParts.getNotDeniedPartns();
-      // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
-      if (!partitions.isEmpty()) {
-        for (Partition p : partitions) {
-          List<String> fileNames =
-              AbstractBucketJoinProc.getBucketFilePathsOfPartition(p.getDataLocation(),
-                  pGraphContext);
-          // The number of files for the table should be same as number of
-          // buckets.
-          if (fileNames.size() != 0 && fileNames.size() != numBuckets) {
-            return false;
+    // Tez can handle unpopulated buckets
+    if (!HiveConf.getVar(pGraphContext.getConf(), HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
+      if (tbl.isPartitioned()) {
+        List<Partition> partitions = prunedParts.getNotDeniedPartns();
+        // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
+        if (!partitions.isEmpty()) {
+          for (Partition p : partitions) {
+            List<String> fileNames =
+                AbstractBucketJoinProc.getBucketFilePathsOfPartition(p.getDataLocation(),
+                    pGraphContext);
+            // The number of files for the table should be same as number of
+            // buckets.
+            if (fileNames.size() != 0 && fileNames.size() != numBuckets) {
+              return false;
+            }
           }
         }
-      }
-    } else {
-
-      List<String> fileNames =
-          AbstractBucketJoinProc.getBucketFilePathsOfPartition(tbl.getDataLocation(),
-              pGraphContext);
-      // The number of files for the table should be same as number of buckets.
-      if (fileNames.size() != 0 && fileNames.size() != numBuckets) {
-        return false;
+      } else {
+
+        List<String> fileNames =
+            AbstractBucketJoinProc.getBucketFilePathsOfPartition(tbl.getDataLocation(),
+                pGraphContext);
+        // The number of files for the table should be same as number of buckets.
+        if (fileNames.size() != 0 && fileNames.size() != numBuckets) {
+          return false;
+        }
       }
     }
-
     return true;
   }
 
diff --git a/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q b/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
index 45feec2b63..23c3fb8ab5 100644
--- a/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
+++ b/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
@@ -107,3 +107,14 @@ explain
 select count(*)
 from
 (select distinct key,value from tab_part) a join tab b on a.key = b.key and a.value = b.value;
+
+
+--HIVE-17939
+create table small (i int) stored as ORC;
+create table big (i int) partitioned by (k int) clustered by (i) into 10 buckets stored as ORC;
+
+insert into small values (1),(2),(3),(4),(5),(6);
+insert into big partition(k=1) values(1),(3),(5),(7),(9);
+insert into big partition(k=2) values(0),(2),(4),(6),(8);
+explain select * from small,big where small.i=big.i;
+select * from small,big where small.i=big.i;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
index e2cee7fd0c..4b888e9375 100644
--- a/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
+++ b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
@@ -1707,3 +1707,133 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: create table small (i int) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@small
+POSTHOOK: query: create table small (i int) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@small
+PREHOOK: query: create table big (i int) partitioned by (k int) clustered by (i) into 10 buckets stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@big
+POSTHOOK: query: create table big (i int) partitioned by (k int) clustered by (i) into 10 buckets stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@big
+PREHOOK: query: insert into small values (1),(2),(3),(4),(5),(6)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@small
+POSTHOOK: query: insert into small values (1),(2),(3),(4),(5),(6)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@small
+POSTHOOK: Lineage: small.i EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into big partition(k=1) values(1),(3),(5),(7),(9)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@big@k=1
+POSTHOOK: query: insert into big partition(k=1) values(1),(3),(5),(7),(9)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@big@k=1
+POSTHOOK: Lineage: big PARTITION(k=1).i EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into big partition(k=2) values(0),(2),(4),(6),(8)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@big@k=2
+POSTHOOK: query: insert into big partition(k=2) values(0),(2),(4),(6),(8)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@big@k=2
+POSTHOOK: Lineage: big PARTITION(k=2).i EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: explain select * from small,big where small.i=big.i
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select * from small,big where small.i=big.i
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 2 <- Map 1 (CUSTOM_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: small
+                  Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: i is not null (type: boolean)
+                    Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: i (type: int)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 2
+            Map Operator Tree:
+                TableScan
+                  alias: big
+                  Statistics: Num rows: 10 Data size: 120 Basic stats: COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: i is not null (type: boolean)
+                    Statistics: Num rows: 10 Data size: 80 Basic stats: COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: i (type: int), k (type: int)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 10 Data size: 80 Basic stats: COMPLETE Column stats: PARTIAL
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        keys:
+                          0 _col0 (type: int)
+                          1 _col0 (type: int)
+                        outputColumnNames: _col0, _col1, _col2
+                        input vertices:
+                          0 Map 1
+                        Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
+                        File Output Operator
+                          compressed: false
+                          Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
+                          table:
+                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: llap
+            LLAP IO: all inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from small,big where small.i=big.i
+PREHOOK: type: QUERY
+PREHOOK: Input: default@big
+PREHOOK: Input: default@big@k=1
+PREHOOK: Input: default@big@k=2
+PREHOOK: Input: default@small
+#### A masked pattern was here ####
+POSTHOOK: query: select * from small,big where small.i=big.i
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@big
+POSTHOOK: Input: default@big@k=1
+POSTHOOK: Input: default@big@k=2
+POSTHOOK: Input: default@small
+#### A masked pattern was here ####
+1	1	1
+2	2	2
+3	3	1
+4	4	2
+5	5	1
+6	6	2
diff --git a/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out b/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out
index 34837f52dd..ae0ee4ea7f 100644
--- a/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out
+++ b/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out
@@ -1636,3 +1636,135 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: create table small (i int) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@small
+POSTHOOK: query: create table small (i int) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@small
+PREHOOK: query: create table big (i int) partitioned by (k int) clustered by (i) into 10 buckets stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@big
+POSTHOOK: query: create table big (i int) partitioned by (k int) clustered by (i) into 10 buckets stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@big
+PREHOOK: query: insert into small values (1),(2),(3),(4),(5),(6)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@small
+POSTHOOK: query: insert into small values (1),(2),(3),(4),(5),(6)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@small
+POSTHOOK: Lineage: small.i EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into big partition(k=1) values(1),(3),(5),(7),(9)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@big@k=1
+POSTHOOK: query: insert into big partition(k=1) values(1),(3),(5),(7),(9)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@big@k=1
+POSTHOOK: Lineage: big PARTITION(k=1).i EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into big partition(k=2) values(0),(2),(4),(6),(8)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@big@k=2
+POSTHOOK: query: insert into big partition(k=2) values(0),(2),(4),(6),(8)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@big@k=2
+POSTHOOK: Lineage: big PARTITION(k=2).i EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: explain select * from small,big where small.i=big.i
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select * from small,big where small.i=big.i
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-2 is a root stage
+  Stage-1 depends on stages: Stage-2
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-2
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: small
+                  Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: i is not null (type: boolean)
+                    Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: i (type: int)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE
+                      Spark HashTable Sink Operator
+                        keys:
+                          0 _col0 (type: int)
+                          1 _col0 (type: int)
+            Local Work:
+              Map Reduce Local Work
+
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 2
+            Map Operator Tree:
+                TableScan
+                  alias: big
+                  Statistics: Num rows: 10 Data size: 40 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: i is not null (type: boolean)
+                    Statistics: Num rows: 10 Data size: 40 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: i (type: int), k (type: int)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 10 Data size: 40 Basic stats: COMPLETE Column stats: NONE
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        keys:
+                          0 _col0 (type: int)
+                          1 _col0 (type: int)
+                        outputColumnNames: _col0, _col1, _col2
+                        input vertices:
+                          0 Map 1
+                        Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                        File Output Operator
+                          compressed: false
+                          Statistics: Num rows: 11 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                          table:
+                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Local Work:
+              Map Reduce Local Work
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from small,big where small.i=big.i
+PREHOOK: type: QUERY
+PREHOOK: Input: default@big
+PREHOOK: Input: default@big@k=1
+PREHOOK: Input: default@big@k=2
+PREHOOK: Input: default@small
+#### A masked pattern was here ####
+POSTHOOK: query: select * from small,big where small.i=big.i
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@big
+POSTHOOK: Input: default@big@k=1
+POSTHOOK: Input: default@big@k=2
+POSTHOOK: Input: default@small
+#### A masked pattern was here ####
+1	1	1
+3	3	1
+5	5	1
+2	2	2
+4	4	2
+6	6	2
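Note on the change above: checkBucketedTable() used to return false whenever a partition held a non-zero number of bucket files different from the declared bucket count, which disabled bucket map join for tables like `big` in the new test (each partition has at most 5 files for 10 declared buckets). Tez bucket map join tolerates such unpopulated buckets, so the strict file-count check is now skipped when hive.execution.engine is tez. Below is a minimal standalone sketch of just that guard; it uses the real HiveConf API, but the class and helper names are hypothetical, for illustration only:

import org.apache.hadoop.hive.conf.HiveConf;

public class BucketGuardSketch {
  // Hypothetical helper: true when the strict "bucket files == bucket count"
  // validation should still run. On Tez it is skipped, because Tez bucket
  // map join can handle partitions with missing (unpopulated) bucket files.
  static boolean needsBucketFileCountCheck(HiveConf conf) {
    String engine = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
    return !"tez".equals(engine);
  }

  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    conf.setVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "tez");
    // Prints false: on Tez, checkBucketedTable() bypasses the file-count check.
    System.out.println(needsBucketFileCountCheck(conf));
  }
}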