diff --git ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index 12e9334..3cfc4d3 100644
--- ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -360,6 +360,7 @@
   CANNOT_REPLACE_COLUMNS(10243, "Replace columns is not supported for table {0}. SerDe may be incompatible.", true),
   BAD_LOCATION_VALUE(10244, "{0} is not absolute or has no scheme information. Please specify a complete absolute uri with scheme information."),
   UNSUPPORTED_ALTER_TBL_OP(10245, "{0} alter table options is not supported"),
+  INVALID_BIGTABLE_MAPJOIN(10246, "{0} table chosen for streaming is not valid", true),
   SCRIPT_INIT_ERROR(20000, "Unable to initialize custom script."),
   SCRIPT_IO_ERROR(20001, "An error occurred while reading or writing to your custom script. "
@@ -616,8 +617,8 @@ public String format(String reason) {
     return format(new String[]{reason});
   }
   /**
-   * If the message is parametrized, this will fill the parameters with supplied
-   * {@code reasons}, otherwise {@code reasons} are appended at the end of the
+   * If the message is parametrized, this will fill the parameters with supplied
+   * {@code reasons}, otherwise {@code reasons} are appended at the end of the
    * message.
    */
  public String format(String... reasons) {
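The new message is parametrized (note the trailing "true"), so ErrorMsg.format(...) substitutes the offending alias for the {0} placeholder. A minimal sketch of the intended rendering, with the alias value "b" chosen purely for illustration:

    // Illustration only, not part of the patch: formatting the new message.
    String msg = ErrorMsg.INVALID_BIGTABLE_MAPJOIN.format("b");
    // msg: "b table chosen for streaming is not valid"; the "[Error 10246]"
    // prefix seen in the golden files is prepended when the failure is reported.
    throw new SemanticException(msg);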
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
index cc9de54..cd4b3f0 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
@@ -29,6 +29,7 @@
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.Order;
+import org.apache.hadoop.hive.ql.ErrorMsg;
 import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
 import org.apache.hadoop.hive.ql.exec.JoinOperator;
 import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
@@ -50,6 +51,8 @@
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
+import org.apache.hadoop.hive.ql.plan.JoinDesc;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
 import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
@@ -124,8 +127,11 @@
     }
     if (!tableEligibleForBucketedSortMergeJoin) {
       // this is a mapjoin but not suited for a sort merge bucket map join. check outer joins
-      MapJoinProcessor.checkMapJoin(mapJoinOp.getConf().getPosBigTable(),
-        mapJoinOp.getConf().getConds());
+      if (MapJoinProcessor.checkMapJoin(mapJoinOp.getConf().getPosBigTable(),
+          mapJoinOp.getConf().getConds()) < 0) {
+        throw new SemanticException(
+            ErrorMsg.INVALID_BIGTABLE_MAPJOIN.format(mapJoinOp.getConf().getBigTableAlias()));
+      }
       return false;
     }
@@ -470,8 +476,16 @@
     BigTableSelectorForAutoSMJ bigTableMatcher =
       (BigTableSelectorForAutoSMJ) ReflectionUtils.newInstance(bigTableMatcherClass, null);
 
+    JoinDesc joinDesc = joinOp.getConf();
+    JoinCondDesc[] joinCondns = joinDesc.getConds();
+    List<Integer> joinCandidates = MapJoinProcessor.getBigTableCandidates(joinCondns);
+    if (joinCandidates == null) {
+      // This is a full outer join. This can never be a map-join
+      // of any type. So return false.
+      return false;
+    }
     int bigTablePosition =
-      bigTableMatcher.getBigTablePosition(pGraphContext, joinOp);
+      bigTableMatcher.getBigTablePosition(pGraphContext, joinOp, joinCandidates);
     if (bigTablePosition < 0) {
       // contains aliases from sub-query
       return false;
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeBasedBigTableSelectorForAutoSMJ.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeBasedBigTableSelectorForAutoSMJ.java
index 5320143..cdb5e76 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeBasedBigTableSelectorForAutoSMJ.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeBasedBigTableSelectorForAutoSMJ.java
@@ -45,9 +45,10 @@ private static final Log LOG = LogFactory
     .getLog(AvgPartitionSizeBasedBigTableSelectorForAutoSMJ.class.getName());
 
-  public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp)
+  public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp,
+      List<Integer> bigTableCandidates)
     throws SemanticException {
-    int bigTablePos = 0;
+    int bigTablePos = -1;
     long maxSize = -1;
     int numPartitionsCurrentBigTable = 0; // number of partitions for the chosen big table
     HiveConf conf = parseCtx.getConf();
@@ -57,9 +58,16 @@
     getListTopOps(joinOp, topOps);
     int currentPos = 0;
     for (TableScanOperator topOp : topOps) {
+      if (topOp == null) { return -1; }
+
+      if (!bigTableCandidates.contains(currentPos)) {
+        currentPos++;
+        continue;
+      }
+
       int numPartitions = 1; // in case the sizes match, preference is
                              // given to the table with fewer partitions
       Table table = parseCtx.getTopToTable().get(topOp);
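The early return added above leans on the contract of MapJoinProcessor.getBigTableCandidates: only a table whose unmatched rows need not be preserved by an outer join may be streamed, and a FULL OUTER JOIN preserves both sides. A hedged sketch of that contract, assuming the JoinCondDesc(left, right, type) constructor and the JoinDesc join-type constants:

    // Sketch, not part of the patch: expected candidate positions by join type.
    JoinCondDesc[] leftOuter = { new JoinCondDesc(0, 1, JoinDesc.LEFT_OUTER_JOIN) };
    // Only the preserved (left) side may stream, so this should hold position 0 only.
    List<Integer> candidates = MapJoinProcessor.getBigTableCandidates(leftOuter);

    JoinCondDesc[] fullOuter = { new JoinCondDesc(0, 1, JoinDesc.FULL_OUTER_JOIN) };
    // Neither side may stream; this version signals that with a null return,
    // which is exactly what the new early return in canConvertJoinToBucketMapJoin tests.
    List<Integer> none = MapJoinProcessor.getBigTableCandidates(fullOuter);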
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/BigTableSelectorForAutoSMJ.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/BigTableSelectorForAutoSMJ.java
index db5ff0f..1731b75 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BigTableSelectorForAutoSMJ.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BigTableSelectorForAutoSMJ.java
@@ -18,16 +18,19 @@
 package org.apache.hadoop.hive.ql.optimizer;
 
+import java.util.List;
+
 import org.apache.hadoop.hive.ql.exec.JoinOperator;
 import org.apache.hadoop.hive.ql.parse.ParseContext;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 
 /*
- * This is a pluggable policy to chose the candidate map-join table for converting a join to a
- * sort merge join. The policy can decide the big table position. Some of the existing polocies
+ * This is a pluggable policy to choose the candidate map-join table for converting a join to a
+ * sort merge join. The policy can decide the big table position. Some of the existing policies
  * decide the big table based on size or position of the tables.
  */
 public interface BigTableSelectorForAutoSMJ {
-  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp)
+  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp,
+      List<Integer> joinCandidates)
     throws SemanticException;
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftmostBigTableSelectorForAutoSMJ.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftmostBigTableSelectorForAutoSMJ.java
index db3c9e7..67cd8f8 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftmostBigTableSelectorForAutoSMJ.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftmostBigTableSelectorForAutoSMJ.java
@@ -18,6 +18,8 @@
 package org.apache.hadoop.hive.ql.optimizer;
 
+import java.util.List;
+
 import org.apache.hadoop.hive.ql.exec.JoinOperator;
 import org.apache.hadoop.hive.ql.parse.ParseContext;
 
@@ -26,7 +28,8 @@
  * sort merge join. The leftmost table is chosen as the join table.
  */
 public class LeftmostBigTableSelectorForAutoSMJ implements BigTableSelectorForAutoSMJ {
-  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) {
+  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp,
+      List<Integer> bigTableCandidates) {
     return 0;
   }
 }
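With the widened interface, a selector now receives the feasible positions up front instead of rediscovering them. A minimal sketch of a conforming implementation, following the same -1 convention as the size-based selectors in this patch (the class is hypothetical, not part of the change):

    package org.apache.hadoop.hive.ql.optimizer;

    import java.util.List;

    import org.apache.hadoop.hive.ql.exec.JoinOperator;
    import org.apache.hadoop.hive.ql.parse.ParseContext;

    // Hypothetical selector: pick the leftmost *feasible* big table, rather
    // than blindly returning position 0 as LeftmostBigTableSelectorForAutoSMJ does.
    public class FirstCandidateBigTableSelectorForAutoSMJ implements BigTableSelectorForAutoSMJ {
      public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp,
          List<Integer> bigTableCandidates) {
        // Same contract as the other selectors: -1 means no usable big table.
        return bigTableCandidates.isEmpty() ? -1 : bigTableCandidates.get(0);
      }
    }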
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
index cd1b4ad..308c815 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
@@ -328,7 +328,9 @@ public static MapJoinOperator convertMapJoin(
     Byte[] tagOrder = desc.getTagOrder();
 
     if (!noCheckOuterJoin) {
-      checkMapJoin(mapJoinPos, condns);
+      if (checkMapJoin(mapJoinPos, condns) < 0) {
+        throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
+      }
     }
 
     RowResolver oldOutputRS = opParseCtxMap.get(op).getRowResolver();
@@ -622,8 +624,8 @@ public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator o
    * @param condns
    * @return list of big table candidates
    */
-  public static HashSet<Integer> getBigTableCandidates(JoinCondDesc[] condns) {
-    HashSet<Integer> bigTableCandidates = new HashSet<Integer>();
+  public static List<Integer> getBigTableCandidates(JoinCondDesc[] condns) {
+    List<Integer> bigTableCandidates = new ArrayList<Integer>();
 
     boolean seenOuterJoin = false;
     Set<Integer> seenPostitions = new HashSet<Integer>();
@@ -677,13 +679,19 @@ public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator o
     return bigTableCandidates;
   }
 
-  public static void checkMapJoin(int mapJoinPos, JoinCondDesc[] condns) throws SemanticException {
-    HashSet<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(condns);
+  /**
+   * @param mapJoinPos the position of the big table as determined by either hints or auto conversion.
+   * @param condns the join conditions
+   * @return the given map-join position if it is a feasible big table position, -1 otherwise.
+   */
+  public static int checkMapJoin(int mapJoinPos, JoinCondDesc[] condns) {
+    List<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(condns);
 
-    if (bigTableCandidates == null || !bigTableCandidates.contains(mapJoinPos)) {
-      throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
+    // bigTableCandidates can never be null
+    if (!bigTableCandidates.contains(mapJoinPos)) {
+      return -1;
     }
-    return;
+    return mapJoinPos;
   }
 
   private void genSelectPlan(ParseContext pctx, MapJoinOperator input) throws SemanticException {
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/TableSizeBasedBigTableSelectorForAutoSMJ.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/TableSizeBasedBigTableSelectorForAutoSMJ.java
index b882f87..f55e87c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/TableSizeBasedBigTableSelectorForAutoSMJ.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/TableSizeBasedBigTableSelectorForAutoSMJ.java
@@ -38,9 +38,10 @@
  */
 public class TableSizeBasedBigTableSelectorForAutoSMJ extends SizeBasedBigTableSelectorForAutoSMJ
   implements BigTableSelectorForAutoSMJ {
-  public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp)
+  public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp,
+      List<Integer> bigTableCandidates)
     throws SemanticException {
-    int bigTablePos = 0;
+    int bigTablePos = -1;
     long maxSize = -1;
 
     HiveConf conf = parseCtx.getConf();
@@ -49,9 +50,15 @@
     getListTopOps(joinOp, topOps);
     int currentPos = 0;
     for (TableScanOperator topOp : topOps) {
+      if (topOp == null) { return -1; }
+
+      if (!bigTableCandidates.contains(currentPos)) {
+        currentPos++;
+        continue;
+      }
 
       Table table = parseCtx.getTopToTable().get(topOp);
       long currentSize = 0;
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java
index 3071713..f8b537f 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java
@@ -166,7 +166,7 @@ private void findPossibleAutoConvertedJoinOperators() throws SemanticException {
       JoinDesc joinDesc = joinOp.getConf();
       Byte[] order = joinDesc.getTagOrder();
       int numAliases = order.length;
-      HashSet<Integer> bigTableCandidates =
+      List<Integer> bigTableCandidates =
           MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
       if (bigTableCandidates == null) {
         continue;
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java
index e214807..dfc90f7 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java
@@ -23,7 +23,6 @@
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -417,7 +416,7 @@ public static boolean cannotConvert(String bigTableAlias,
     long aliasTotalKnownInputSize =
         getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);
 
-    HashSet<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc
+    List<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc
         .getConds());
 
     // no table could be the big table; there is no need to convert
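Taken together, checkMapJoin now reports infeasibility through its return value, and each caller raises its own error instead of the check throwing a fixed SemanticException. A side-by-side sketch of the two call-site patterns from the hunks above (variable names here are abbreviated placeholders, not the exact identifiers):

    // Hint-driven conversion keeps the original, generic error:
    if (MapJoinProcessor.checkMapJoin(mapJoinPos, condns) < 0) {
      throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
    }

    // The SMB path raises the new alias-specific error instead:
    if (MapJoinProcessor.checkMapJoin(posBigTable, conds) < 0) {
      throw new SemanticException(
          ErrorMsg.INVALID_BIGTABLE_MAPJOIN.format(bigTableAlias));
    }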
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java
index da5115b..50f97c9 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java
@@ -25,7 +25,6 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.common.ObjectPair;
@@ -281,14 +280,9 @@ private boolean isEligibleForOptimization(SMBMapJoinOperator originalSMBJoinOp)
     SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
     Byte[] order = originalSMBJoinDesc.getTagOrder();
     int numAliases = order.length;
-    Set<Integer> bigTableCandidates =
+    List<Integer> bigTableCandidates =
         MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
 
-    // no table could be the big table; there is no need to convert
-    if (bigTableCandidates == null) {
-      return null;
-    }
-
     HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
     Configuration conf = context.getConf();
     try {
diff --git ql/src/test/queries/clientnegative/auto_sortmerge_join_1.q ql/src/test/queries/clientnegative/auto_sortmerge_join_1.q
deleted file mode 100644
index c858254..0000000
--- ql/src/test/queries/clientnegative/auto_sortmerge_join_1.q
+++ /dev/null
@@ -1,25 +0,0 @@
-set hive.enforce.bucketing = true;
-set hive.enforce.sorting = true;
-set hive.exec.reducers.max = 1;
-
-CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
-CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
-
-insert overwrite table tbl1 select * from src where key < 20;
-insert overwrite table tbl2 select * from src where key < 10;
-
-set hive.merge.mapfiles=false;
-set hive.merge.mapredfiles=false;
-set hive.auto.convert.sortmerge.join.to.mapjoin=true;
-set hive.auto.convert.sortmerge.join=true;
-set hive.optimize.bucketmapjoin = true;
-set hive.optimize.bucketmapjoin.sortedmerge = true;
-set hive.auto.convert.join=true;
-
--- Since tbl1 is the bigger table, tbl1 Left Outer Join tbl2 can be performed
-explain
-select count(*) FROM tbl1 a LEFT OUTER JOIN tbl2 b ON a.key = b.key;
-
--- Since tbl1 is the bigger table, tbl1 Right Outer Join tbl2 cannot be performed
-explain
-select count(*) FROM tbl1 a RIGHT OUTER JOIN tbl2 b ON a.key = b.key;
diff --git ql/src/test/queries/clientpositive/auto_sortmerge_join_15.q ql/src/test/queries/clientpositive/auto_sortmerge_join_15.q
new file mode 100644
index 0000000..c7bcae6
--- /dev/null
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_15.q
@@ -0,0 +1,23 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 1;
+
+CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+
+insert overwrite table tbl1 select * from src where key < 20;
+insert overwrite table tbl2 select * from src where key < 10;
+
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set hive.auto.convert.sortmerge.join.to.mapjoin=true;
+set hive.auto.convert.sortmerge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+set hive.auto.convert.join=true;
+
+explain
+select count(*) FROM tbl1 a LEFT OUTER JOIN tbl2 b ON a.key = b.key; + +explain +select count(*) FROM tbl1 a RIGHT OUTER JOIN tbl2 b ON a.key = b.key; diff --git ql/src/test/results/clientnegative/auto_sortmerge_join_1.q.out ql/src/test/results/clientnegative/auto_sortmerge_join_1.q.out deleted file mode 100644 index 0eddb69..0000000 --- ql/src/test/results/clientnegative/auto_sortmerge_join_1.q.out +++ /dev/null @@ -1,184 +0,0 @@ -PREHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS -PREHOOK: type: CREATETABLE -POSTHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: default@tbl1 -PREHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS -PREHOOK: type: CREATETABLE -POSTHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: default@tbl2 -PREHOOK: query: insert overwrite table tbl1 select * from src where key < 20 -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@tbl1 -POSTHOOK: query: insert overwrite table tbl1 select * from src where key < 20 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@tbl1 -POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: insert overwrite table tbl2 select * from src where key < 10 -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@tbl2 -POSTHOOK: query: insert overwrite table tbl2 select * from src where key < 10 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@tbl2 -POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- Since tbl1 is the bigger table, tbl1 Left Outer Join tbl2 can be performed -explain -select count(*) FROM tbl1 a LEFT OUTER JOIN tbl2 b ON a.key = b.key -PREHOOK: type: QUERY -POSTHOOK: query: -- Since tbl1 is the bigger table, tbl1 Left Outer Join tbl2 can be performed -explain -select count(*) FROM tbl1 a LEFT OUTER JOIN tbl2 b ON a.key = b.key -POSTHOOK: type: QUERY -POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) - -STAGE DEPENDENCIES: - Stage-4 is a root stage , consists of Stage-5, Stage-1 - Stage-5 has a backup stage: Stage-1 - Stage-3 depends on stages: Stage-5 - Stage-1 - Stage-0 is a root stage - -STAGE PLANS: - Stage: Stage-4 - Conditional Operator - - Stage: Stage-5 - Map Reduce Local Work - Alias -> Map Local Tables: - b - Fetch Operator - limit: -1 - Alias -> Map Local Operator Tree: - b - TableScan - alias: b - HashTable Sink Operator - condition expressions: - 0 - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - Position of Big Table: 0 - - Stage: Stage-3 - Map Reduce - Alias -> Map Operator Tree: - a - TableScan - alias: a - Map Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - Position of Big Table: 0 - Select Operator - Group By Operator - aggregations: - expr: count() - bucketGroup: false - mode: hash - outputColumnNames: _col0 - Reduce Output Operator - sort order: - tag: -1 - value expressions: - expr: _col0 - type: bigint - Local Work: - Map Reduce Local Work - Reduce Operator Tree: - Group By Operator - aggregations: - expr: count(VALUE._col0) - bucketGroup: false - mode: mergepartial - outputColumnNames: _col0 - Select Operator - expressions: - expr: _col0 - type: bigint - outputColumnNames: _col0 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - - Stage: Stage-1 - Map Reduce - Alias -> Map Operator Tree: - a - TableScan - alias: a - Sorted Merge Bucket Map Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - Position of Big Table: 0 - Select Operator - Group By Operator - aggregations: - expr: count() - bucketGroup: false - mode: hash - outputColumnNames: _col0 - Reduce Output Operator - sort order: - tag: -1 - value expressions: - expr: _col0 - type: bigint - Reduce Operator Tree: - Group By Operator - aggregations: - expr: count(VALUE._col0) - bucketGroup: false - mode: mergepartial - outputColumnNames: _col0 - Select Operator - expressions: - expr: _col0 - type: bigint - outputColumnNames: _col0 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - - Stage: Stage-0 - Fetch Operator - limit: -1 - - -FAILED: SemanticException [Error 10057]: MAPJOIN cannot be performed with OUTER JOIN diff --git ql/src/test/results/clientnegative/smb_bucketmapjoin.q.out ql/src/test/results/clientnegative/smb_bucketmapjoin.q.out index 7a5b8c1..a1035fb 100644 --- ql/src/test/results/clientnegative/smb_bucketmapjoin.q.out +++ ql/src/test/results/clientnegative/smb_bucketmapjoin.q.out @@ -34,4 +34,4 @@ POSTHOOK: Lineage: smb_bucket4_1.key EXPRESSION [(src)src.FieldSchema(name:key, POSTHOOK: Lineage: smb_bucket4_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] POSTHOOK: Lineage: smb_bucket4_2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: smb_bucket4_2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -FAILED: 
SemanticException [Error 10057]: MAPJOIN cannot be performed with OUTER JOIN +FAILED: SemanticException [Error 10246]: b table chosen for streaming is not valid diff --git ql/src/test/results/clientpositive/auto_sortmerge_join_15.q.out ql/src/test/results/clientpositive/auto_sortmerge_join_15.q.out new file mode 100644 index 0000000..5c70b58 --- /dev/null +++ ql/src/test/results/clientpositive/auto_sortmerge_join_15.q.out @@ -0,0 +1,330 @@ +PREHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tbl1 +PREHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tbl2(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tbl2 +PREHOOK: query: insert overwrite table tbl1 select * from src where key < 20 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tbl1 +POSTHOOK: query: insert overwrite table tbl1 select * from src where key < 20 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tbl1 +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table tbl2 select * from src where key < 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tbl2 +POSTHOOK: query: insert overwrite table tbl2 select * from src where key < 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tbl2 +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: explain +select count(*) FROM tbl1 a LEFT OUTER JOIN tbl2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(*) FROM tbl1 a LEFT OUTER JOIN tbl2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage , consists of Stage-5, Stage-1 + Stage-5 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-5 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Conditional Operator + + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Sorted Merge Bucket Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain +select count(*) FROM tbl1 a RIGHT OUTER JOIN tbl2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(*) FROM tbl1 a RIGHT OUTER JOIN tbl2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: tbl1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tbl2.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME tbl1) a) (TOK_TABREF (TOK_TABNAME tbl2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage , consists of Stage-5, Stage-1 + Stage-5 has a backup stage: Stage-1 + Stage-3 depends on stages: Stage-5 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Conditional Operator + + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + Sorted Merge Bucket Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + +
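The two golden plans line up with the candidate semantics described earlier: for tbl1 a LEFT OUTER JOIN tbl2 b only position 0 (alias a) may stream, and for the RIGHT OUTER JOIN only position 1 (alias b), hence the differing "Position of Big Table" values in the two explains. A compact restatement, again assuming the JoinDesc constants and the JoinCondDesc(left, right, type) constructor (a sanity sketch, not a shipped test):

    // Candidate positions implied by the plans above.
    JoinCondDesc[] loj = { new JoinCondDesc(0, 1, JoinDesc.LEFT_OUTER_JOIN) };
    JoinCondDesc[] roj = { new JoinCondDesc(0, 1, JoinDesc.RIGHT_OUTER_JOIN) };
    List<Integer> lojCandidates = MapJoinProcessor.getBigTableCandidates(loj); // expected [0]
    List<Integer> rojCandidates = MapJoinProcessor.getBigTableCandidates(roj); // expected [1]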