diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index edea129579..5f2058ebc6 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -3038,7 +3038,7 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "Enable (configurable) deprecated behaviors by setting desired level of backward compatibility.\n" + "Setting to 0.12:\n" + " Maintains division behavior: int / int = double"), - HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ("hive.convert.join.bucket.mapjoin.tez", false, + HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ("hive.convert.join.bucket.mapjoin.tez", true, "Whether joins can be automatically converted to bucket map joins in hive \n" + "when tez is used as the execution engine."), HIVE_TEZ_BMJ_USE_SUBCACHE("hive.tez.bmj.use.subcache", true, diff --git a/ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q b/ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q index b85c4a7aa3..b163498814 100644 --- a/ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q +++ b/ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q @@ -20,6 +20,8 @@ set hive.optimize.bucketmapjoin = true; set hive.optimize.bucketmapjoin.sortedmerge = true; set hive.auto.convert.sortmerge.join.to.mapjoin=false; set hive.auto.convert.sortmerge.join.bigtable.selection.policy = org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeBasedBigTableSelectorForAutoSMJ; +--disable hash joins +set hive.auto.convert.join.noconditionaltask.size=1; -- Since size is being used to find the big table, the order of the tables in the join does not matter explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; diff --git a/ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q b/ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q index 08dbf6cd1d..1abdc2cae8 100644 --- a/ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q +++ b/ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q @@ -21,7 +21,7 @@ set hive.optimize.bucketmapjoin.sortedmerge = true; set hive.auto.convert.sortmerge.join=true; set hive.auto.convert.sortmerge.join.to.mapjoin=false; --disable hash joins -set hive.auto.convert.join.noconditionaltask.size=10; +set hive.auto.convert.join.noconditionaltask.size=1; -- The join is being performed as part of sub-query. It should be converted to a sort-merge join explain diff --git a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_5.q.out b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_5.q.out index e711715aa5..bbe1252b64 100644 --- a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_5.q.out +++ b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_5.q.out @@ -509,14 +509,13 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Map 1 <- Map 3 (BROADCAST_EDGE) Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator @@ -527,56 +526,30 @@ STAGE PLANS: expressions: key (type: string) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE - Map Join Operator - condition map: - Inner Join 0 to 1 - Estimated key counts: Map 3 => 1 - keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - input vertices: - 1 Map 3 - Position of Big Table: 0 - Statistics: Num rows: 1 Data size: 202 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - null sort order: - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Execution mode: llap - LLAP IO: no inputs Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: bucket_big input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: SORTBUCKETCOLSPREFIX TRUE - bucket_count 2 + bucket_count 4 bucket_field_name key column.name.delimiter , columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.bucket_big - numFiles 2 + name default.bucket_small + numFiles 4 numRows 0 rawDataSize 0 - serialization.ddl struct bucket_big { string key, string value} + serialization.ddl struct bucket_small { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 226 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -584,31 +557,30 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: SORTBUCKETCOLSPREFIX TRUE - bucket_count 2 + bucket_count 4 bucket_field_name key column.name.delimiter , columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.bucket_big - numFiles 2 + name default.bucket_small + numFiles 4 numRows 0 rawDataSize 0 - serialization.ddl struct bucket_big { string key, string value} + serialization.ddl struct bucket_small { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 226 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucket_big - name: default.bucket_big + name: default.bucket_small + name: default.bucket_small Truncated Path -> Alias: - /bucket_big [a] - Map 3 + /bucket_small [b] Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator @@ -619,41 +591,51 @@ STAGE PLANS: expressions: key (type: string) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - null sort order: a - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: true + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Position of Big Table: 0 + Statistics: Num rows: 1 Data size: 202 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false Execution mode: llap - LLAP IO: no inputs Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: bucket_small input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: SORTBUCKETCOLSPREFIX TRUE - bucket_count 4 + bucket_count 2 bucket_field_name key column.name.delimiter , columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.bucket_small - numFiles 4 + name default.bucket_big + numFiles 2 numRows 0 rawDataSize 0 - serialization.ddl struct bucket_small { string key, string value} + serialization.ddl struct bucket_big { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 226 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -661,27 +643,27 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: SORTBUCKETCOLSPREFIX TRUE - bucket_count 4 + bucket_count 2 bucket_field_name key column.name.delimiter , columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.bucket_small - numFiles 4 + name default.bucket_big + numFiles 2 numRows 0 rawDataSize 0 - serialization.ddl struct bucket_small { string key, string value} + serialization.ddl struct bucket_big { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 226 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucket_small - name: default.bucket_small + name: default.bucket_big + name: default.bucket_big Truncated Path -> Alias: - /bucket_small [b] + /bucket_big [a] Reducer 2 Execution mode: llap Needs Tagging: false diff --git a/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_5.q.out b/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_5.q.out index 8250eca099..15080445f5 100644 --- a/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_5.q.out +++ b/ql/src/test/results/clientpositive/spark/auto_sortmerge_join_5.q.out @@ -369,95 +369,10 @@ PREHOOK: type: QUERY POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-2 is a root stage - Stage-1 depends on stages: Stage-2 + Stage-1 is a root stage Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-2 - Spark -#### A masked pattern was here #### - Vertices: - Map 3 - Map Operator Tree: - TableScan - alias: b - Statistics: Num rows: 1 Data size: 2260 Basic stats: COMPLETE Column stats: NONE - GatherStats: false - Filter Operator - isSamplingPred: false - predicate: key is not null (type: boolean) - Statistics: Num rows: 1 Data size: 2260 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: key (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 2260 Basic stats: COMPLETE Column stats: NONE - Spark HashTable Sink Operator - keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - Position of Big Table: 0 - Local Work: - Map Reduce Local Work - Bucket Mapjoin Context: - Alias Bucket File Name Mapping: -#### A masked pattern was here #### - Alias Bucket Output File Name Mapping: -#### A masked pattern was here #### - Path -> Alias: -#### A masked pattern was here #### - Path -> Partition: -#### A masked pattern was here #### - Partition - base file name: bucket_small - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - SORTBUCKETCOLSPREFIX TRUE - bucket_count 4 - bucket_field_name key - column.name.delimiter , - columns key,value - columns.comments - columns.types string:string -#### A masked pattern was here #### - name default.bucket_small - numFiles 4 - numRows 0 - rawDataSize 0 - serialization.ddl struct bucket_small { string key, string value} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 226 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - SORTBUCKETCOLSPREFIX TRUE - bucket_count 4 - bucket_field_name key - column.name.delimiter , - columns key,value - columns.comments - columns.types string:string -#### A masked pattern was here #### - name default.bucket_small - numFiles 4 - numRows 0 - rawDataSize 0 - serialization.ddl struct bucket_small { string key, string value} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 226 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucket_small - name: default.bucket_small - Truncated Path -> Alias: - /bucket_small [$hdt$_1:b] - Stage: Stage-1 Spark Edges: @@ -478,14 +393,12 @@ STAGE PLANS: expressions: key (type: string) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 27500 Basic stats: COMPLETE Column stats: NONE - Map Join Operator + Sorted Merge Bucket Map Join Operator condition map: Inner Join 0 to 1 keys: 0 _col0 (type: string) 1 _col0 (type: string) - input vertices: - 1 Map 3 Position of Big Table: 0 Statistics: Num rows: 1 Data size: 30250 Basic stats: COMPLETE Column stats: NONE BucketMapJoin: true @@ -501,13 +414,6 @@ STAGE PLANS: tag: -1 value expressions: _col0 (type: bigint) auto parallelism: false - Local Work: - Map Reduce Local Work - Bucket Mapjoin Context: - Alias Bucket File Name Mapping: -#### A masked pattern was here #### - Alias Bucket Output File Name Mapping: -#### A masked pattern was here #### Path -> Alias: #### A masked pattern was here #### Path -> Partition: