Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1455365)
+++ conf/hive-default.xml.template (working copy)
@@ -810,7 +810,7 @@
 <property>
   <name>hive.auto.convert.join.noconditionaltask</name>
-  <value>false</value>
+  <value>true</value>
   <description>Whether Hive enable the optimization about converting common join into mapjoin based on the input file size. If this paramater is on, and the sum of size for n-1 of the tables/partitions for a n-way join is smaller than the specified size, the join is directly converted to a mapjoin (there is no conditional task).</description>
 </property>
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1455365)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -471,7 +471,7 @@
     HIVESKEWJOIN("hive.optimize.skewjoin", false),
     HIVECONVERTJOIN("hive.auto.convert.join", true),
-    HIVECONVERTJOINNOCONDITIONALTASK("hive.auto.convert.join.noconditionaltask", false),
+    HIVECONVERTJOINNOCONDITIONALTASK("hive.auto.convert.join.noconditionaltask", true),
     HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD("hive.auto.convert.join.noconditionaltask.size", 10000000L),
     HIVESKEWJOINKEY("hive.skewjoin.key", 100000),
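The description above reduces to a size check: for an n-way join, the combined size of the n-1 smaller inputs must stay below hive.auto.convert.join.noconditionaltask.size (10000000 bytes by default, per the HiveConf hunk above). A minimal, self-contained sketch of that arithmetic with hypothetical input sizes follows; it illustrates the rule, not Hive's actual code path.

    // Sketch of the n-1 size rule described in hive-default.xml; sizes are hypothetical.
    public class NoConditionalTaskSizeCheck {
      public static void main(String[] args) {
        long threshold = 10000000L;   // hive.auto.convert.join.noconditionaltask.size (default)
        long[] inputSizes = {250000000L, 4000000L, 3000000L};  // a hypothetical 3-way join

        long largest = 0L;
        long sum = 0L;
        for (long size : inputSizes) {
          sum += size;
          largest = Math.max(largest, size);
        }

        // The biggest input is streamed; the remaining n-1 inputs must fit under the threshold.
        long smallSideTotal = sum - largest;
        boolean convertToMapJoin = smallSideTotal <= threshold;
        System.out.println("unconditional map-join: " + convertToMapJoin);  // true: 7000000 <= 10000000
      }
    }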
Index: ql/src/test/results/clientpositive/join41.q.out
===================================================================
--- ql/src/test/results/clientpositive/join41.q.out (revision 0)
+++ ql/src/test/results/clientpositive/join41.q.out (working copy)
@@ -0,0 +1,205 @@
+PREHOOK: query: create table s1 as select * from src where key = 0
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+POSTHOOK: query: create table s1 as select * from src where key = 0
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@s1
+PREHOOK: query: EXPLAIN
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME s1) src1) (TOK_TABREF (TOK_TABNAME s1) src2) (AND (= (. (TOK_TABLE_OR_COL src1) key) (. (TOK_TABLE_OR_COL src2) key)) (> (. (TOK_TABLE_OR_COL src2) key) 10)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src1
+          TableScan
+            alias: src1
+            Reduce Output Operator
+              key expressions:
+                    expr: key
+                    type: string
+              sort order: +
+              Map-reduce partition columns:
+                    expr: key
+                    type: string
+              tag: 0
+              value expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+        src2
+          TableScan
+            alias: src2
+            Filter Operator
+              predicate:
+                  expr: (key > 10.0)
+                  type: boolean
+              Reduce Output Operator
+                key expressions:
+                      expr: key
+                      type: string
+                sort order: +
+                Map-reduce partition columns:
+                      expr: key
+                      type: string
+                tag: 1
+                value expressions:
+                      expr: key
+                      type: string
+                      expr: value
+                      type: string
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Left Outer Join0 to 1
+          condition expressions:
+            0 {VALUE._col0} {VALUE._col1}
+            1 {VALUE._col0} {VALUE._col1}
+          handleSkewJoin: false
+          outputColumnNames: _col0, _col1, _col4, _col5
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: string
+                  expr: _col4
+                  type: string
+                  expr: _col5
+                  type: string
+            outputColumnNames: _col0, _col1, _col2, _col3
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@s1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@s1
+#### A masked pattern was here ####
+0	val_0	NULL	NULL
+0	val_0	NULL	NULL
+0	val_0	NULL	NULL
+PREHOOK: query: -- Make sure the big table is chosen correctly as part of HIVE-4146
+EXPLAIN
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Make sure the big table is chosen correctly as part of HIVE-4146
+EXPLAIN
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME s1) src1) (TOK_TABREF (TOK_TABNAME s1) src2) (AND (= (. (TOK_TABLE_OR_COL src1) key) (. (TOK_TABLE_OR_COL src2) key)) (> (. (TOK_TABLE_OR_COL src2) key) 10)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src1
+          TableScan
+            alias: src1
+            Reduce Output Operator
+              key expressions:
+                    expr: key
+                    type: string
+              sort order: +
+              Map-reduce partition columns:
+                    expr: key
+                    type: string
+              tag: 0
+              value expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+        src2
+          TableScan
+            alias: src2
+            Filter Operator
+              predicate:
+                  expr: (key > 10.0)
+                  type: boolean
+              Reduce Output Operator
+                key expressions:
+                      expr: key
+                      type: string
+                sort order: +
+                Map-reduce partition columns:
+                      expr: key
+                      type: string
+                tag: 1
+                value expressions:
+                      expr: key
+                      type: string
+                      expr: value
+                      type: string
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Left Outer Join0 to 1
+          condition expressions:
+            0 {VALUE._col0} {VALUE._col1}
+            1 {VALUE._col0} {VALUE._col1}
+          handleSkewJoin: false
+          outputColumnNames: _col0, _col1, _col4, _col5
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: string
+                  expr: _col4
+                  type: string
+                  expr: _col5
+                  type: string
+            outputColumnNames: _col0, _col1, _col2, _col3
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@s1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@s1
+#### A masked pattern was here ####
+0	val_0	NULL	NULL
+0	val_0	NULL	NULL
+0	val_0	NULL	NULL
Index: ql/src/test/queries/clientpositive/join41.q
===================================================================
--- ql/src/test/queries/clientpositive/join41.q (revision 0)
+++ ql/src/test/queries/clientpositive/join41.q (working copy)
@@ -0,0 +1,16 @@
+create table s1 as select * from src where key = 0;
+
+set hive.auto.convert.join.noconditionaltask=false;
+EXPLAIN
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10);
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10);
+
+set hive.auto.convert.join.noconditionaltask=true;
+
+-- Make sure the big table is chosen correctly as part of HIVE-4146
+EXPLAIN
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10);
+SELECT * FROM s1 src1 LEFT OUTER JOIN s1 src2 ON (src1.key = src2.key AND src2.key > 10);
+
+
+
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (revision 1455365)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (working copy)
@@ -329,31 +329,35 @@
           HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
       boolean bigTableFound = false;
-      long largestTableSize = 0;
+      long largestBigTableCandidateSize = 0;
       long sumTableSizes = 0;
       for (String alias : aliasToWork.keySet()) {
+        int tablePosition = getPosition(currWork, joinOp, alias);
+        boolean bigTableCandidate = bigTableCandidates.contains(tablePosition);
         Long size = aliasToSize.get(alias);
         // The size is not available at compile time if the input is a sub-query.
         // If the size of atleast n-1 inputs for a n-way join are available at compile time,
         // and the sum of them is less than the specified threshold, then convert the join
         // into a map-join without the conditional task.
         if ((size == null) || (size > mapJoinSize)) {
-          sumTableSizes += largestTableSize;
-          if (bigTableFound || (sumTableSizes > mapJoinSize)) {
+          sumTableSizes += largestBigTableCandidateSize;
+          if (bigTableFound || (sumTableSizes > mapJoinSize) || !bigTableCandidate) {
             convertJoinMapJoin = false;
             break;
           }
           bigTableFound = true;
-          bigTablePosition = getPosition(currWork, joinOp, alias);
-          largestTableSize = mapJoinSize + 1;
+          bigTablePosition = tablePosition;
+          largestBigTableCandidateSize = mapJoinSize + 1;
         } else {
-          if (size > largestTableSize) {
-            sumTableSizes += largestTableSize;
-            largestTableSize = size;
-            bigTablePosition = getPosition(currWork, joinOp, alias);
-          } else {
+          if (bigTableCandidate && size > largestBigTableCandidateSize) {
+            bigTablePosition = tablePosition;
+            sumTableSizes += largestBigTableCandidateSize;
+            largestBigTableCandidateSize = size;
+          }
+          else {
             sumTableSizes += size;
           }
+
           if (sumTableSizes > mapJoinSize) {
             convertJoinMapJoin = false;
             break;
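The CommonJoinResolver change above makes the loop pick the streamed (big) table only from positions that are valid big-table candidates, so, for example, the inner side of a LEFT OUTER JOIN can no longer be chosen and is instead charged against the in-memory small side. A rough, self-contained sketch of that selection loop follows; the aliases, sizes, and candidate set are hypothetical stand-ins for the real plan structures (currWork, joinOp, aliasToSize), not the actual resolver code.

    // Sketch of candidate-aware big-table selection; inputs are hypothetical.
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.Set;

    public class BigTableCandidateSelection {
      public static void main(String[] args) {
        long mapJoinSize = 10000000L;  // hive.auto.convert.join.noconditionaltask.size
        Map<String, Long> aliasToSize = new LinkedHashMap<String, Long>();
        aliasToSize.put("src1", 300000000L);  // outer side, too large to hash
        aliasToSize.put("src2", 2000000L);    // inner side, small
        // For a LEFT OUTER JOIN only the outer input (position 0) may be streamed as the big table.
        Set<Integer> bigTableCandidates = new HashSet<Integer>(Arrays.asList(0));

        boolean convertJoinMapJoin = true;
        int bigTablePosition = -1;
        long largestBigTableCandidateSize = 0L;
        long sumTableSizes = 0L;
        int position = 0;
        for (Map.Entry<String, Long> entry : aliasToSize.entrySet()) {
          boolean bigTableCandidate = bigTableCandidates.contains(position);
          long size = entry.getValue();
          if (bigTableCandidate && size > largestBigTableCandidateSize) {
            // New best big-table candidate; the previous best now counts toward the hashed side.
            sumTableSizes += largestBigTableCandidateSize;
            largestBigTableCandidateSize = size;
            bigTablePosition = position;
          } else {
            // Non-candidates (e.g. the inner side of an outer join) must fit in memory.
            sumTableSizes += size;
          }
          if (sumTableSizes > mapJoinSize) {
            convertJoinMapJoin = false;
            break;
          }
          position++;
        }
        System.out.println("convert=" + convertJoinMapJoin + ", bigTablePosition=" + bigTablePosition);
      }
    }

With these inputs the loop keeps src1 (position 0) as the streamed table and charges only src2 to the in-memory side, which is the big-table choice the new join41.q test is meant to exercise for the LEFT OUTER JOIN.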