diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java index 0b7b1a3..479b4d7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java @@ -371,8 +371,10 @@ private boolean checkSortColsAndJoinCols(List sortCols, for (int pos = 0; pos < sortCols.size(); pos++) { Order o = sortCols.get(pos); - if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) { - return false; + if (pos < sortColumnsFirstPartition.size()) { + if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) { + return false; + } } sortColNames.add(o.getCol()); } diff --git ql/src/test/queries/clientpositive/sort_merge_join_desc_8.q ql/src/test/queries/clientpositive/sort_merge_join_desc_8.q new file mode 100644 index 0000000..2ec0849 --- /dev/null +++ ql/src/test/queries/clientpositive/sort_merge_join_desc_8.q @@ -0,0 +1,42 @@ +drop table table_desc1; +drop table table_desc2; +drop table table_desc3; +drop table table_desc4; + +set hive.enforce.sorting = true; + +create table table_desc1(key string, value string) clustered by (key) +sorted by (key DESC) into 1 BUCKETS; +create table table_desc2(key string, value string) clustered by (key) +sorted by (key DESC, value DESC) into 1 BUCKETS; +create table table_desc3(key string, value1 string, value2 string) clustered by (key) +sorted by (key DESC, value1 DESC,value2 DESC) into 1 BUCKETS; +create table table_desc4(key string, value2 string) clustered by (key) +sorted by (key DESC, value2 DESC) into 1 BUCKETS; + +insert overwrite table table_desc1 select key, value from src sort by key DESC; +insert overwrite table table_desc2 select key, value from src sort by key DESC; +insert overwrite table table_desc3 select key, value, concat(value,"_2") as value2 from src sort by key, value, value2 DESC; +insert overwrite table table_desc4 select key, concat(value,"_2") as value2 from src sort by key, value2 DESC; + +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; + +-- columns are sorted by one key in first table, two keys in second table but in same sort order for key. Hence SMB join should pass + +explain +select /*+ mapjoin(b) */ count(*) from table_desc1 a join table_desc2 b +on a.key=b.key where a.key < 10; + +select /*+ mapjoin(b) */ count(*) from table_desc1 a join table_desc2 b +on a.key=b.key where a.key < 10; + +-- columns are sorted by 3 keys(a, b, c) in first table, two keys(a, c) in second table with same sort order. Hence SMB join should not pass + +explain +select /*+ mapjoin(b) */ count(*) from table_desc3 a join table_desc4 b +on a.key=b.key and a.value2=b.value2 where a.key < 10; + +select /*+ mapjoin(b) */ count(*) from table_desc3 a join table_desc4 b +on a.key=b.key and a.value2=b.value2 where a.key < 10; diff --git ql/src/test/results/clientpositive/sort_merge_join_desc_8.q.out ql/src/test/results/clientpositive/sort_merge_join_desc_8.q.out new file mode 100644 index 0000000..d0fcc42 --- /dev/null +++ ql/src/test/results/clientpositive/sort_merge_join_desc_8.q.out @@ -0,0 +1,279 @@ +PREHOOK: query: drop table table_desc1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table table_desc1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table table_desc2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table table_desc2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table table_desc3 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table table_desc3 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table table_desc4 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table table_desc4 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table table_desc1(key string, value string) clustered by (key) +sorted by (key DESC) into 1 BUCKETS +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table table_desc1(key string, value string) clustered by (key) +sorted by (key DESC) into 1 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_desc1 +PREHOOK: query: create table table_desc2(key string, value string) clustered by (key) +sorted by (key DESC, value DESC) into 1 BUCKETS +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table table_desc2(key string, value string) clustered by (key) +sorted by (key DESC, value DESC) into 1 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_desc2 +PREHOOK: query: create table table_desc3(key string, value1 string, value2 string) clustered by (key) +sorted by (key DESC, value1 DESC,value2 DESC) into 1 BUCKETS +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table table_desc3(key string, value1 string, value2 string) clustered by (key) +sorted by (key DESC, value1 DESC,value2 DESC) into 1 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_desc3 +PREHOOK: query: create table table_desc4(key string, value2 string) clustered by (key) +sorted by (key DESC, value2 DESC) into 1 BUCKETS +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table table_desc4(key string, value2 string) clustered by (key) +sorted by (key DESC, value2 DESC) into 1 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_desc4 +PREHOOK: query: insert overwrite table table_desc1 select key, value from src sort by key DESC +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@table_desc1 +POSTHOOK: query: insert overwrite table table_desc1 select key, value from src sort by key DESC +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@table_desc1 +POSTHOOK: Lineage: table_desc1.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: table_desc1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table table_desc2 select key, value from src sort by key DESC +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@table_desc2 +POSTHOOK: query: insert overwrite table table_desc2 select key, value from src sort by key DESC +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@table_desc2 +POSTHOOK: Lineage: table_desc2.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: table_desc2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table table_desc3 select key, value, concat(value,"_2") as value2 from src sort by key, value, value2 DESC +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@table_desc3 +POSTHOOK: query: insert overwrite table table_desc3 select key, value, concat(value,"_2") as value2 from src sort by key, value, value2 DESC +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@table_desc3 +POSTHOOK: Lineage: table_desc3.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: table_desc3.value1 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: table_desc3.value2 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table table_desc4 select key, concat(value,"_2") as value2 from src sort by key, value2 DESC +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@table_desc4 +POSTHOOK: query: insert overwrite table table_desc4 select key, concat(value,"_2") as value2 from src sort by key, value2 DESC +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@table_desc4 +POSTHOOK: Lineage: table_desc4.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: table_desc4.value2 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- columns are sorted by one key in first table, two keys in second table but in same sort order for key. Hence SMB join should pass + +explain +select /*+ mapjoin(b) */ count(*) from table_desc1 a join table_desc2 b +on a.key=b.key where a.key < 10 +PREHOOK: type: QUERY +POSTHOOK: query: -- columns are sorted by one key in first table, two keys in second table but in same sort order for key. Hence SMB join should pass + +explain +select /*+ mapjoin(b) */ count(*) from table_desc1 a join table_desc2 b +on a.key=b.key where a.key < 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and (key < 10)) (type: boolean) + Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Select Operator + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: _col0 (type: bigint) + outputColumnNames: _col0 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select /*+ mapjoin(b) */ count(*) from table_desc1 a join table_desc2 b +on a.key=b.key where a.key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@table_desc1 +PREHOOK: Input: default@table_desc2 +#### A masked pattern was here #### +POSTHOOK: query: select /*+ mapjoin(b) */ count(*) from table_desc1 a join table_desc2 b +on a.key=b.key where a.key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@table_desc1 +POSTHOOK: Input: default@table_desc2 +#### A masked pattern was here #### +22 +PREHOOK: query: -- columns are sorted by 3 keys(a, b, c) in first table, two keys(a, c) in second table with same sort order. Hence SMB join should not pass + +explain +select /*+ mapjoin(b) */ count(*) from table_desc3 a join table_desc4 b +on a.key=b.key and a.value2=b.value2 where a.key < 10 +PREHOOK: type: QUERY +POSTHOOK: query: -- columns are sorted by 3 keys(a, b, c) in first table, two keys(a, c) in second table with same sort order. Hence SMB join should not pass + +explain +select /*+ mapjoin(b) */ count(*) from table_desc3 a join table_desc4 b +on a.key=b.key and a.value2=b.value2 where a.key < 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-3 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + Statistics: Num rows: 500 Data size: 6312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key is not null and value2 is not null) and (key < 10)) (type: boolean) + Statistics: Num rows: 41 Data size: 517 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string), value2 (type: string) + 1 key (type: string), value2 (type: string) + + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 500 Data size: 10218 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key is not null and value2 is not null) and (key < 10)) (type: boolean) + Statistics: Num rows: 41 Data size: 837 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string), value2 (type: string) + 1 key (type: string), value2 (type: string) + Statistics: Num rows: 45 Data size: 920 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 45 Data size: 920 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select /*+ mapjoin(b) */ count(*) from table_desc3 a join table_desc4 b +on a.key=b.key and a.value2=b.value2 where a.key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@table_desc3 +PREHOOK: Input: default@table_desc4 +#### A masked pattern was here #### +POSTHOOK: query: select /*+ mapjoin(b) */ count(*) from table_desc3 a join table_desc4 b +on a.key=b.key and a.value2=b.value2 where a.key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@table_desc3 +POSTHOOK: Input: default@table_desc4 +#### A masked pattern was here #### +22