diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java index 66b84ff..42b10a8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java @@ -1,9 +1,4 @@ /** - <<<<<<< HEAD - ======= - * Copyright 2010 The Apache Software Foundation - * - >>>>>>> HIVE-1402 [jira] Add parallel ORDER BY to Hive * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -30,30 +25,30 @@ public class OperatorUtils { public static <T> Set<T> findOperators(Operator<?> start, Class<T> clazz) { - return findOperator(start, clazz, new HashSet<T>()); + return findOperators(start, clazz, new HashSet<T>()); } public static <T> T findSingleOperator(Operator<?> start, Class<T> clazz) { - Set<T> found = findOperator(start, clazz, new HashSet<T>()); + Set<T> found = findOperators(start, clazz, new HashSet<T>()); return found.size() == 1 ? found.iterator().next() : null; } public static <T> Set<T> findOperators(Collection<Operator<?>> starts, Class<T> clazz) { Set<T> found = new HashSet<T>(); for (Operator<?> start : starts) { - findOperator(start, clazz, found); + findOperators(start, clazz, found); } return found; } @SuppressWarnings("unchecked") - private static <T> Set<T> findOperator(Operator<?> start, Class<T> clazz, Set<T> found) { + private static <T> Set<T> findOperators(Operator<?> start, Class<T> clazz, Set<T> found) { if (clazz.isInstance(start)) { found.add((T) start); } if (start.getChildOperators() != null) { for (Operator<?> child : start.getChildOperators()) { - findOperator(child, clazz, found); + findOperators(child, clazz, found); } } return found; diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java index f98878c..31fb767 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java @@ -26,6 +26,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.ObjectPair; @@ -35,6 +36,7 @@ import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.JoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; @@ -107,6 +109,65 @@ public CommonJoinTaskDispatcher(PhysicalContext context) { super(context); } + /** + * Calculate the total size of the local tables in localWork. + * @param localWork + * @return the total size of the local tables, or -1 if the total + * size is unknown. + */ + private long calculateLocalTableTotalSize(MapredLocalWork localWork) { + long localTableTotalSize = 0; + for (String alias : localWork.getAliasToWork().keySet()) { + Long tabSize = aliasToSize.get(alias); + if (tabSize == null) { + // If the size of a table is unavailable, the total size is unknown and we + // must assume it is greater than localTableTotalSizeLimit. This implies that + // the merge cannot happen, so we return -1.
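+ // For example (hypothetical sizes): with aliasToSize = {y1=300, y2=250}, a localWork + // covering y1 and y2 totals 550; but if a covered alias, say x1, has no recorded size, + // no trustworthy total can be computed and we fall into this branch.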
+ return -1; + } + localTableTotalSize += tabSize; + } + return localTableTotalSize; + } + + /** + * Check if the total size of local tables will be under + * the limit after we merge localWork1 and localWork2. + * The limit of the total size of local tables is defined by + * HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD. + * @param conf + * @param localWork1 + * @param localWork2 + * @return true if the total size of local tables will be under the limit + * after we merge localWork1 and localWork2; false otherwise. + */ + private boolean isLocalTableTotalSizeUnderLimitAfterMerge( + Configuration conf, + MapredLocalWork localWork1, MapredLocalWork localWork2) { + final long localTableTotalSizeLimit = HiveConf.getLongVar(conf, + HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD); + + final long localWork1TableTotalSize = calculateLocalTableTotalSize(localWork1); + if (localWork1TableTotalSize < 0) { + // The total size of local tables in localWork1 is unknown. + return false; + } + + final long localWork2TableTotalSize = calculateLocalTableTotalSize(localWork2); + if (localWork2TableTotalSize < 0) { + // The total size of local tables in localWork2 is unknown. + return false; + } + + if (localWork1TableTotalSize + localWork2TableTotalSize > localTableTotalSizeLimit) { + // The total size of local tables after we merge localWork1 and localWork2 + // is larger than the limit set by + // HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD. + return false; + } + + return true; + } + // Get the position of the big table for this join operator and the given alias private int getPosition(MapredWork work, Operator<? extends OperatorDesc> joinOp, String alias) { @@ -175,34 +236,11 @@ private void mergeMapJoinTaskWithChildMapJoinTask(MapRedTask task, Configuration return; } - long mapJoinSize = HiveConf.getLongVar(conf, - HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD); - long localTableTotalSize = 0; - for (String alias : localWork.getAliasToWork().keySet()) { - Long tabSize = aliasToSize.get(alias); - if (tabSize == null) { - /* - * if the size is unavailable, we need to assume a size 1 greater than mapJoinSize - * this implies that merge cannot happen so we can return. - */ - return; - } - localTableTotalSize += tabSize; - } - - for (String alias : childLocalWork.getAliasToWork().keySet()) { - Long tabSize = aliasToSize.get(alias); - if (tabSize == null) { - /* - * if the size is unavailable, we need to assume a size 1 greater than mapJoinSize - * this implies that merge cannot happen so we can return. - */ - return; - } - localTableTotalSize += tabSize; - if (localTableTotalSize > mapJoinSize) { - return; - } + if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, localWork, childLocalWork)) { + // The total size of local tables may not be under + // the limit after we merge localWork and childLocalWork. + // Do not merge. + return; } Operator<? extends OperatorDesc> childAliasOp = @@ -290,130 +328,152 @@ private void copyReducerConf(MapRedTask task, MapRedTask childTask) { * A task and its child task has been converted from join to mapjoin. * See if the two tasks can be merged. */ - private void mergeMapJoinTaskWithMapReduceTask(MapRedTask mapJoinTask, Configuration conf) { + private void mergeMapJoinTaskWithMapReduceTask(MapRedTask mapJoinTask, Configuration conf) + throws SemanticException { + // Step 1: Check if mapJoinTask has a single child. + // If so, check if we can merge mapJoinTask into that child. if (mapJoinTask.getChildTasks() == null || mapJoinTask.getChildTasks().size() > 1) { // No child-task to merge, nothing to do or there are more than one // child-tasks in which case we don't want to do anything.
return; } - Task<? extends Serializable> firstChildTask = mapJoinTask.getChildTasks().get(0); - if (!(firstChildTask instanceof MapRedTask)) { - // Nothing to do if it is not a mapreduce task. - return; - } - MapRedTask childTask = (MapRedTask) firstChildTask; - MapredWork mapJoinWork = mapJoinTask.getWork(); - MapredWork childWork = childTask.getWork(); - Operator<? extends OperatorDesc> childReducer = childWork.getReducer(); - if (childReducer == null) { - // Not a MR job, nothing to merge. - return; - } - // Can this be merged - Map<String, Operator<? extends OperatorDesc>> aliasToWork = mapJoinWork.getAliasToWork(); - if (aliasToWork.size() > 1) { - return; - } - Map<String, ArrayList<String>> childPathToAliases = childWork.getPathToAliases(); - if (childPathToAliases.size() > 1) { + Task<? extends Serializable> childTask = mapJoinTask.getChildTasks().get(0); + if (!(childTask instanceof MapRedTask)) { + // Nothing to do if it is not a MapReduce task. return; } - // Locate leaf operator of the map-join task. Start by initializing leaf - // operator to be root operator. - Operator<? extends OperatorDesc> mapJoinLeafOperator = aliasToWork.values().iterator().next(); - while (mapJoinLeafOperator.getChildOperators() != null) { - // Dont perform this optimization for multi-table inserts - if (mapJoinLeafOperator.getChildOperators().size() > 1) { - return; - } - mapJoinLeafOperator = mapJoinLeafOperator.getChildOperators().get(0); - } + MapRedTask childMRTask = (MapRedTask) childTask; + MapredWork mapJoinWork = mapJoinTask.getWork(); + MapredWork childMRWork = childMRTask.getWork(); - assert (mapJoinLeafOperator instanceof FileSinkOperator); - if (!(mapJoinLeafOperator instanceof FileSinkOperator)) { - // Sanity check, shouldn't happen. + Map<String, ArrayList<String>> mapJoinPathToAliases = mapJoinWork.getPathToAliases(); + Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork = + mapJoinWork.getAliasToWork(); + if (mapJoinPathToAliases.size() > 1 || mapJoinAliasToWork.size() > 1) { + // Do not merge if the MapredWork of the MapJoin task has multiple input aliases. return; } - FileSinkOperator mapJoinTaskFileSinkOperator = (FileSinkOperator) mapJoinLeafOperator; - - // The filesink writes to a different directory - String workDir = mapJoinTaskFileSinkOperator.getConf().getDirName(); - if (!childPathToAliases.keySet().iterator().next().equals(workDir)) { + Entry<String, ArrayList<String>> mapJoinPathToAliasesEntry = + mapJoinPathToAliases.entrySet().iterator().next(); + String mapJoinPath = mapJoinPathToAliasesEntry.getKey(); + if (mapJoinPathToAliasesEntry.getValue().size() != 1) { + throw new SemanticException("Expected only 1 alias assigned to path " + mapJoinPath + + ". Found " + mapJoinPathToAliasesEntry.getValue().size() + " aliases which are " + + mapJoinPathToAliasesEntry.getValue().toString()); + } + String mapJoinAlias = mapJoinPathToAliasesEntry.getValue().get(0); + TableScanOperator mapJoinTaskTableScanOperator = + OperatorUtils.findSingleOperator( + mapJoinAliasToWork.get(mapJoinAlias), TableScanOperator.class); + if (mapJoinTaskTableScanOperator == null) { + throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + + " operator as the work associated with alias " + mapJoinAlias + + ". 
Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator."); + } + FileSinkOperator mapJoinTaskFileSinkOperator = + OperatorUtils.findSingleOperator( + mapJoinTaskTableScanOperator, FileSinkOperator.class); + if (mapJoinTaskFileSinkOperator == null) { + throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() + + " operator as the last operator in the MapJoin task."); + } + + // Find the directory that mapJoinTaskFileSinkOperator writes to and the single + // alias that the child MR task uses to scan this directory. + String childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName(); + List<String> childMRAliases = childMRWork.getPathToAliases().get(childMRPath); + if (childMRAliases == null || childMRAliases.size() != 1) { return; } + String childMRAlias = childMRAliases.get(0); MapredLocalWork mapJoinLocalWork = mapJoinWork.getMapLocalWork(); - MapredLocalWork childLocalWork = childWork.getMapLocalWork(); - - // Either of them should not be bucketed + MapredLocalWork childLocalWork = childMRWork.getMapLocalWork(); if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) || (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) { + // Do not merge if either of them is bucketed. return; } - - if (childWork.getAliasToWork().size() > 1) { + // If any alias is already associated with mapJoinPath in childMRWork, we will have a + // shared scan of mapJoinPath once we merge mapJoinTask into childMRTask. + // In this case, we need to check if the total size of local tables stays under the limit. + if (childMRWork.getPathToAliases().containsKey(mapJoinPath) && + !isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)) { + // The total size of local tables may not be under + // the limit after we merge mapJoinLocalWork and childLocalWork. + // Do not merge. return; } - Operator<? extends OperatorDesc> childAliasOp = - childWork.getAliasToWork().values().iterator().next(); - if (mapJoinTaskFileSinkOperator.getParentOperators().size() > 1) { - return; + TableScanOperator childMRTaskTableScanOperator = + OperatorUtils.findSingleOperator( + childMRWork.getAliasToWork().get(childMRAlias), TableScanOperator.class); + if (childMRTaskTableScanOperator == null) { + throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + + " operator as the work associated with alias " + childMRAlias + + ". Found a " + childMRWork.getAliasToWork().get(childMRAlias).getName() + " operator."); } - // remove the unnecessary TableScan - if (childAliasOp instanceof TableScanOperator) { - TableScanOperator tso = (TableScanOperator)childAliasOp; - if (tso.getNumChild() != 1) { - // shouldn't happen - return; - } - childAliasOp = tso.getChildOperators().get(0); - childAliasOp.getParentOperators().remove(tso); + List<Operator<? extends OperatorDesc>> parentsInMapJoinTask = + mapJoinTaskFileSinkOperator.getParentOperators(); + List<Operator<? extends OperatorDesc>> childrenInChildMRTask = + childMRTaskTableScanOperator.getChildOperators(); + if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) { + // Do not merge if we do not know how to connect the two operator trees.
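+ // (For example, the operator tree of a multi-table insert fans out into more than + // one child; the old code skipped multi-table inserts for the same reason.)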
+ return; } - // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the - // top of the second - Operator<? extends OperatorDesc> parentFOp = mapJoinTaskFileSinkOperator - .getParentOperators().get(0); - parentFOp.getChildOperators().remove(mapJoinTaskFileSinkOperator); - parentFOp.getChildOperators().add(childAliasOp); - List<Operator<? extends OperatorDesc>> parentOps = - new ArrayList<Operator<? extends OperatorDesc>>(); - parentOps.add(parentFOp); - childAliasOp.setParentOperators(parentOps); - - mapJoinWork.getAliasToPartnInfo().putAll(childWork.getAliasToPartnInfo()); - for (Map.Entry<String, PartitionDesc> childWorkEntry : childWork.getPathToPartitionInfo() - .entrySet()) { - if (childWork.getAliasToPartnInfo().containsValue(childWorkEntry.getKey())) { - mapJoinWork.getPathToPartitionInfo() - .put(childWorkEntry.getKey(), childWorkEntry.getValue()); + // Step 2: Merge mapJoinTask into the map side of its child. + // Step 2.1: Connect the operator trees of the two MapRedTasks. + Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0); + Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0); + parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask); + childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask); + + // Step 2.2: Replace the corresponding part of childMRWork's map work. + childMRWork.replaceMapWork(childMRPath, childMRAlias, + mapJoinPath, mapJoinAlias, mapJoinTaskTableScanOperator, + mapJoinWork.getPathToPartitionInfo().get(mapJoinPath)); + + // Step 2.3: Merge mapJoinTask's local work into the local work of childMRTask. + if (mapJoinLocalWork != null) { + if (childLocalWork == null) { + childMRWork.setMapLocalWork(mapJoinLocalWork); + } else { + childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork()); + childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork()); } } - // Fill up stuff in local work - if (mapJoinLocalWork != null && childLocalWork != null) { - mapJoinLocalWork.getAliasToFetchWork().putAll(childLocalWork.getAliasToFetchWork()); - mapJoinLocalWork.getAliasToWork().putAll(childLocalWork.getAliasToWork()); - } - - // remove the child task - List<Task<? extends Serializable>> oldChildTasks = childTask.getChildTasks(); - mapJoinTask.setChildTasks(oldChildTasks); - if (oldChildTasks != null) { - for (Task<? extends Serializable> oldChildTask : oldChildTasks) { - oldChildTask.getParentTasks().remove(childTask); - oldChildTask.getParentTasks().add(mapJoinTask); + // Step 2.4: Remove this MapJoin task from the task tree. + List<Task<? extends Serializable>> parentTasks = mapJoinTask.getParentTasks(); + mapJoinTask.setParentTasks(null); + mapJoinTask.setChildTasks(null); + childMRTask.getParentTasks().remove(mapJoinTask); + if (parentTasks != null) { + childMRTask.getParentTasks().addAll(parentTasks); + for (Task<? extends Serializable> parentTask : parentTasks) { + parentTask.getChildTasks().remove(mapJoinTask); + if (!parentTask.getChildTasks().contains(childMRTask)) { + parentTask.getChildTasks().add(childMRTask); + } + } + } else { + if (physicalContext.getRootTasks().contains(mapJoinTask)) { + physicalContext.removeFromRootTask(mapJoinTask); + if (childMRTask.getParentTasks() != null && + childMRTask.getParentTasks().size() == 0 && + !physicalContext.getRootTasks().contains(childMRTask)) { + physicalContext.addToRootTask(childMRTask); + } } } - - // Copy the reducer conf.
- copyReducerConf(mapJoinTask, childTask); + if (childMRTask.getParentTasks().size() == 0) { + childMRTask.setParentTasks(null); + } } public static boolean cannotConvert(String bigTableAlias, diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java index 7cbb1ff..3a4cb57 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java @@ -374,6 +374,46 @@ public void addMapWork(String path, String alias, Operator<?> work, aliasToWork.put(alias, work); } + /** Replace all entries associated with oldPath and oldAlias in pathToAliases, + * aliasToWork, and aliasToPartnInfo with mappings associated with newPath and newAlias. + * At the end, if there is no alias associated with oldPath, we also delete the entries + * associated with oldPath in pathToAliases and pathToPartitionInfo. + * @param oldPath + * @param oldAlias + * @param newPath + * @param newAlias + * @param newWork + * @param newPd + */ + public void replaceMapWork(String oldPath, String oldAlias, + String newPath, String newAlias, Operator<?> newWork, PartitionDesc newPd) { + if (!pathToAliases.containsKey(oldPath)) { + // If oldPath is not in pathToAliases, there is no entry we need to delete from + // pathToAliases, pathToPartitionInfo, aliasToWork, and aliasToPartnInfo. + return; + } + + // Remove old entries. + pathToAliases.get(oldPath).remove(oldAlias); + aliasToWork.remove(oldAlias); + aliasToPartnInfo.remove(oldAlias); + if (pathToAliases.get(oldPath).size() == 0) { + // oldPath will not be used. So, we can delete entries associated with oldPath + // from pathToAliases and pathToPartitionInfo. + pathToAliases.remove(oldPath); + pathToPartitionInfo.remove(oldPath); + } + + // Add new entries. + if (!pathToAliases.containsKey(newPath)) { + // It is the first time we see newPath in this MapredWork. + pathToAliases.put(newPath, new ArrayList<String>()); + pathToPartitionInfo.put(newPath, newPd); + } + pathToAliases.get(newPath).add(newAlias); + aliasToWork.put(newAlias, newWork); + aliasToPartnInfo.put(newAlias, pathToPartitionInfo.get(newPath)); + } + @SuppressWarnings("nls") public String isInvalid() { if ((getNumReduceTasks() >= 1) && (getReducer() == null)) { diff --git ql/src/test/queries/clientpositive/correlationoptimizer7.q ql/src/test/queries/clientpositive/correlationoptimizer7.q index 9b18972..a52f9b5 100644 --- ql/src/test/queries/clientpositive/correlationoptimizer7.q +++ ql/src/test/queries/clientpositive/correlationoptimizer7.q @@ -42,13 +42,9 @@ set hive.optimize.correlation=false; -- Without correlation optimizer, we will have 3 MR jobs. -- The first one is a MapJoin and Aggregation (in the Reduce Phase). -- The second one is another MapJoin. The third one is for ordering. --- With the correlation optimizer, right now, we still have --- 3 MR jobs. The first one is a MapJoin and the map-side aggregation (a map-only job). --- The second one have the reduce-side aggregation and the second join. --- The third one is for ordering. --- Although we have turned on hive.optimize.mapjoin.mapreduce, that optimizer --- can not handle the case that the MR job (the one which a map-only job will be merged in) --- has multiple inputs. We should improve that optimizer. +-- With the correlation optimizer, right now, we have +-- 2 MR jobs. The first one will evaluate the sub-query xx and the join of +-- xx and yy. The second one will do the ORDER BY. 
EXPLAIN SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt diff --git ql/src/test/queries/clientpositive/multiMapJoin2.q ql/src/test/queries/clientpositive/multiMapJoin2.q new file mode 100644 index 0000000..3e0a9a8 --- /dev/null +++ ql/src/test/queries/clientpositive/multiMapJoin2.q @@ -0,0 +1,221 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.join.noconditionaltask=true; +set hive.auto.convert.join.noconditionaltask.size=6000; + +set hive.optimize.mapjoin.mapreduce=false; +-- When hive.optimize.mapjoin.mapreduce=false, we will generate two Map-only jobs +-- and one MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +set hive.optimize.mapjoin.mapreduce=true; +-- When hive.optimize.mapjoin.mapreduce=true, we will generate one MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +set hive.auto.convert.join.noconditionaltask.size=400; +-- It is possible that, after the merge, we have a shared scan of the big table. +-- In this case, multiple MapJoins involving this big table can happen in the +-- same Map task. So, when we find we can have a shared scan of the big table, +-- we need to check the total size of local tables. If this size is +-- larger than the limit that +-- we set through hive.auto.convert.join.noconditionaltask.size (right now, it is +-- 400 bytes), we will not do the merge. +-- For this query, we will merge the MapJoin of x2 and y2 into the MR job +-- for the UNION ALL and ORDER BY. But, the MapJoin of x1 and y1 will not be merged +-- into that MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +set hive.auto.convert.join.noconditionaltask.size=6000; +set hive.optimize.mapjoin.mapreduce=false; +-- When hive.optimize.mapjoin.mapreduce=false, we will use three jobs in total. +-- We will generate one MR job for the GROUP BY +-- on x1, one Map-only job for the MapJoin of x2 and y2, and one MR job +-- for the UNION ALL and ORDER BY. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +set hive.optimize.mapjoin.mapreduce=true; +-- When hive.optimize.mapjoin.mapreduce=true, we will use two jobs. 
+-- We will generate one MR job for the GROUP BY +-- on x1, and one MR job for the MapJoin of x2 and y2, the UNION ALL, and the +-- ORDER BY. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key; + +set hive.optimize.mapjoin.mapreduce=false; +set hive.optimize.correlation=false; +-- When hive.optimize.mapjoin.mapreduce=false and Correlation Optimizer is disabled, +-- we will use 7 jobs. +-- We will generate one Map-only job for the MapJoin of x1 and y1, +-- one Map-only job for the MapJoin of x2 and y2, +-- one MR job for the aggregation in the sub-query tmp1, +-- one MR job for the aggregation in the sub-query tmp2, +-- one MR job for the Join of tmp1 and tmp2, +-- one MR job for aggregation on the result of the Join of tmp1 and tmp2, +-- and one MR job for the ORDER BY. +EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + +set hive.optimize.mapjoin.mapreduce=true; +set hive.optimize.correlation=true; +-- When hive.optimize.mapjoin.mapreduce=true and Correlation Optimizer is enabled, +-- we will use two jobs. The first MR job will evaluate the sub-queries tmp1 and tmp2, +-- the Join of tmp1 and tmp2, and the aggregation on the result of the Join of +-- tmp1 and tmp2. The second job will do the ORDER BY. +EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + +set hive.optimize.mapjoin.mapreduce=false; +set hive.optimize.correlation=false; +-- When hive.optimize.mapjoin.mapreduce=false and Correlation Optimizer is disabled, +-- we will use six jobs. +-- We will generate one MR job to evaluate the sub-query tmp1, +-- one Map-only job for the MapJoin of x2 and y2, +-- one MR job for the aggregation in the sub-query tmp2, +-- one MR job for the Join of tmp1 and tmp2, +-- one MR job for aggregation on the result of the Join of tmp1 and tmp2, +-- and one MR job for the ORDER BY. 
+EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src1 x1 + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src1 x1 + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + +set hive.optimize.mapjoin.mapreduce=true; +set hive.optimize.correlation=true; +-- When hive.optimize.mapjoin.mapreduce=true and Correlation Optimizer is enabled, +-- we will use two jobs. The first MR job will evaluate the sub-queries tmp1 and tmp2, +-- the Join of tmp1 and tmp2, and the aggregation on the result of the Join of +-- tmp1 and tmp2. The second job will do the ORDER BY. +EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src1 x1 + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src1 x1 + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt; + diff --git ql/src/test/results/clientpositive/auto_join33.q.out ql/src/test/results/clientpositive/auto_join33.q.out index 8fc0e84..e86d6c8 100644 --- ql/src/test/results/clientpositive/auto_join33.q.out +++ ql/src/test/results/clientpositive/auto_join33.q.out @@ -25,7 +25,7 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-5 is a root stage - Stage-4 depends on stages: Stage-5 + Stage-2 depends on stages: Stage-5 Stage-0 is a root stage STAGE PLANS: @@ -49,7 +49,7 @@ STAGE PLANS: 1 [Column[name]] Position of Big Table: 1 - Stage: Stage-4 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: v diff --git ql/src/test/results/clientpositive/correlationoptimizer1.q.out ql/src/test/results/clientpositive/correlationoptimizer1.q.out index db3bd78..f6dcc8e 100644 --- ql/src/test/results/clientpositive/correlationoptimizer1.q.out +++ ql/src/test/results/clientpositive/correlationoptimizer1.q.out @@ -564,8 +564,8 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-6 is a root stage - Stage-5 depends on stages: Stage-6 - Stage-3 depends on stages: Stage-5 + Stage-2 depends on stages: Stage-6 + Stage-3 depends on stages: Stage-2 Stage-0 is a root stage STAGE PLANS: @@ -589,7 +589,7 @@ STAGE PLANS: 1 [Column[key]] Position of Big Table: 1 - Stage: Stage-5 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: tmp:y diff --git ql/src/test/results/clientpositive/correlationoptimizer3.q.out ql/src/test/results/clientpositive/correlationoptimizer3.q.out index cebddff..a6691ae 100644 --- ql/src/test/results/clientpositive/correlationoptimizer3.q.out +++ ql/src/test/results/clientpositive/correlationoptimizer3.q.out @@ -598,11 +598,8 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-9 is a root stage - Stage-7 depends on stages: Stage-9 - Stage-2 depends on stages: Stage-7, Stage-8 + Stage-2 depends on stages: Stage-9 Stage-3 depends on stages: Stage-2 - Stage-10 is a root stage - Stage-8 depends on stages: Stage-10 Stage-0 is a root stage STAGE PLANS: @@ -612,6 +609,9 @@ STAGE PLANS: tmp:b:x Fetch Operator limit: -1 + tmp:d:x + 
Fetch Operator + limit: -1 Alias -> Map Local Operator Tree: tmp:b:x TableScan @@ -625,8 +625,20 @@ STAGE PLANS: 0 [Column[key]] 1 [Column[key]] Position of Big Table: 1 + tmp:d:x + TableScan + alias: x + HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 - Stage: Stage-7 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: tmp:b:y @@ -658,44 +670,54 @@ STAGE PLANS: type: string mode: hash outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col1 + type: bigint + tmp:d:y + TableScan + alias: y + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col1 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col1 + type: string Local Work: Map Reduce Local Work - - Stage: Stage-2 - Map Reduce - Alias -> Map Operator Tree: -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 0 - value expressions: - expr: _col1 - type: bigint -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 1 - value expressions: - expr: _col1 - type: string Reduce Operator Tree: Demux Operator Group By Operator @@ -818,60 +840,6 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - Stage: Stage-10 - Map Reduce Local Work - Alias -> Map Local Tables: - tmp:d:x - Fetch Operator - limit: -1 - Alias -> Map Local Operator Tree: - tmp:d:x - TableScan - alias: x - HashTable Sink Operator - condition expressions: - 0 {key} {value} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - Position of Big Table: 1 - - Stage: Stage-8 - Map Reduce - Alias -> Map Operator Tree: - tmp:d:y - TableScan - alias: y - Map Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {key} {value} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - outputColumnNames: _col0, _col1 - Position of Big Table: 1 - Select Operator - expressions: - expr: _col0 - type: string - expr: _col1 - type: string - outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - Local Work: - Map Reduce Local Work - Stage: Stage-0 Fetch Operator limit: -1 @@ -1482,11 +1450,8 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-9 is a root stage - 
Stage-7 depends on stages: Stage-9 - Stage-2 depends on stages: Stage-7, Stage-8 + Stage-2 depends on stages: Stage-9 Stage-3 depends on stages: Stage-2 - Stage-10 is a root stage - Stage-8 depends on stages: Stage-10 Stage-0 is a root stage STAGE PLANS: @@ -1496,6 +1461,9 @@ STAGE PLANS: tmp:b:x Fetch Operator limit: -1 + tmp:d:x + Fetch Operator + limit: -1 Alias -> Map Local Operator Tree: tmp:b:x TableScan @@ -1509,8 +1477,20 @@ STAGE PLANS: 0 [Column[key]] 1 [Column[key]] Position of Big Table: 1 + tmp:d:x + TableScan + alias: x + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 - Stage: Stage-7 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: tmp:b:y @@ -1535,44 +1515,61 @@ STAGE PLANS: expr: _col1 type: string outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col1 + type: string + tmp:d:y + TableScan + alias: y + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + expr: _col1 + type: bigint Local Work: Map Reduce Local Work - - Stage: Stage-2 - Map Reduce - Alias -> Map Operator Tree: -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 0 - value expressions: - expr: _col1 - type: string -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 1 - value expressions: - expr: _col1 - type: bigint Reduce Operator Tree: Demux Operator Mux Operator @@ -1695,67 +1692,6 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - Stage: Stage-10 - Map Reduce Local Work - Alias -> Map Local Tables: - tmp:d:x - Fetch Operator - limit: -1 - Alias -> Map Local Operator Tree: - tmp:d:x - TableScan - alias: x - HashTable Sink Operator - condition expressions: - 0 {key} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - Position of Big Table: 1 - - Stage: Stage-8 - Map Reduce - Alias -> Map Operator Tree: - tmp:d:y - TableScan - alias: y - Map Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {key} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - outputColumnNames: _col0 - Position of Big Table: 1 - Select Operator - expressions: - expr: _col0 - type: string - outputColumnNames: _col0 - Group 
By Operator - aggregations: - expr: count(1) - bucketGroup: false - keys: - expr: _col0 - type: string - mode: hash - outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - Local Work: - Map Reduce Local Work - Stage: Stage-0 Fetch Operator limit: -1 diff --git ql/src/test/results/clientpositive/correlationoptimizer4.q.out ql/src/test/results/clientpositive/correlationoptimizer4.q.out index 285a54f..3605619 100644 --- ql/src/test/results/clientpositive/correlationoptimizer4.q.out +++ ql/src/test/results/clientpositive/correlationoptimizer4.q.out @@ -428,8 +428,8 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-7 is a root stage - Stage-6 depends on stages: Stage-7 - Stage-3 depends on stages: Stage-6 + Stage-2 depends on stages: Stage-7 + Stage-3 depends on stages: Stage-2 Stage-0 is a root stage STAGE PLANS: @@ -472,7 +472,7 @@ STAGE PLANS: 2 [Column[key]] Position of Big Table: 0 - Stage: Stage-6 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: tmp:x diff --git ql/src/test/results/clientpositive/correlationoptimizer6.q.out ql/src/test/results/clientpositive/correlationoptimizer6.q.out index c40a786..0e6a0aa 100644 --- ql/src/test/results/clientpositive/correlationoptimizer6.q.out +++ ql/src/test/results/clientpositive/correlationoptimizer6.q.out @@ -661,21 +661,33 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-9 is a root stage - Stage-7 depends on stages: Stage-9 - Stage-2 depends on stages: Stage-7, Stage-8 + Stage-2 depends on stages: Stage-9 Stage-3 depends on stages: Stage-2 - Stage-10 is a root stage - Stage-8 depends on stages: Stage-10 Stage-0 is a root stage STAGE PLANS: Stage: Stage-9 Map Reduce Local Work Alias -> Map Local Tables: + xx:y + Fetch Operator + limit: -1 yy:y Fetch Operator limit: -1 Alias -> Map Local Operator Tree: + xx:y + TableScan + alias: y + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 yy:y TableScan alias: y @@ -689,9 +701,50 @@ STAGE PLANS: 1 [Column[key]] Position of Big Table: 0 - Stage: Stage-7 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: + xx:x + TableScan + alias: x + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col1 + type: bigint yy:x TableScan alias: x @@ -721,44 +774,20 @@ STAGE PLANS: type: string mode: hash outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + value expressions: + 
expr: _col1 + type: bigint Local Work: Map Reduce Local Work - - Stage: Stage-2 - Map Reduce - Alias -> Map Operator Tree: -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 1 - value expressions: - expr: _col1 - type: bigint -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 0 - value expressions: - expr: _col1 - type: bigint Reduce Operator Tree: Demux Operator Group By Operator @@ -880,67 +909,6 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - Stage: Stage-10 - Map Reduce Local Work - Alias -> Map Local Tables: - xx:y - Fetch Operator - limit: -1 - Alias -> Map Local Operator Tree: - xx:y - TableScan - alias: y - HashTable Sink Operator - condition expressions: - 0 {key} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - Position of Big Table: 0 - - Stage: Stage-8 - Map Reduce - Alias -> Map Operator Tree: - xx:x - TableScan - alias: x - Map Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {key} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - outputColumnNames: _col0 - Position of Big Table: 0 - Select Operator - expressions: - expr: _col0 - type: string - outputColumnNames: _col0 - Group By Operator - aggregations: - expr: count(1) - bucketGroup: false - keys: - expr: _col0 - type: string - mode: hash - outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - Local Work: - Map Reduce Local Work - Stage: Stage-0 Fetch Operator limit: -1 @@ -5122,16 +5090,15 @@ ABSTRACT SYNTAX TREE: (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src1) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key) key) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key)))) xx) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x) (TOK_TABREF (TOK_TABNAME src) y) (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) value) value) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL x) value)))) yy) (= (. (TOK_TABLE_OR_COL xx) key) (. (TOK_TABLE_OR_COL yy) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL xx) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL xx) cnt)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL yy) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL yy) value)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL yy) cnt))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL xx) key)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL xx) cnt)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL yy) key)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL yy) value)) (TOK_TABSORTCOLNAMEASC (. 
(TOK_TABLE_OR_COL yy) cnt))))) STAGE DEPENDENCIES: - Stage-10 is a root stage - Stage-8 depends on stages: Stage-10 - Stage-3 depends on stages: Stage-8, Stage-9 - Stage-4 depends on stages: Stage-3 Stage-11 is a root stage - Stage-9 depends on stages: Stage-11 + Stage-2 depends on stages: Stage-11 + Stage-10 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-10 + Stage-4 depends on stages: Stage-3 Stage-0 is a root stage STAGE PLANS: - Stage: Stage-10 + Stage: Stage-11 Map Reduce Local Work Alias -> Map Local Tables: yy:y @@ -5151,7 +5118,7 @@ STAGE PLANS: 1 [Column[key]] Position of Big Table: 0 - Stage: Stage-8 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: yy:x @@ -5233,6 +5200,26 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Stage: Stage-10 + Map Reduce Local Work + Alias -> Map Local Tables: + xx:y + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + xx:y + TableScan + alias: y + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + Stage: Stage-3 Map Reduce Alias -> Map Operator Tree: @@ -5253,19 +5240,49 @@ STAGE PLANS: type: string expr: _col2 type: bigint -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 0 - value expressions: - expr: _col1 - type: bigint + xx:x + TableScan + alias: x + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col1 + type: bigint + Local Work: + Map Reduce Local Work Reduce Operator Tree: Demux Operator Group By Operator @@ -5379,67 +5396,6 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - Stage: Stage-11 - Map Reduce Local Work - Alias -> Map Local Tables: - xx:y - Fetch Operator - limit: -1 - Alias -> Map Local Operator Tree: - xx:y - TableScan - alias: y - HashTable Sink Operator - condition expressions: - 0 {key} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - Position of Big Table: 0 - - Stage: Stage-9 - Map Reduce - Alias -> Map Operator Tree: - xx:x - TableScan - alias: x - Map Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {key} - 1 - handleSkewJoin: false - keys: - 0 [Column[key]] - 1 [Column[key]] - outputColumnNames: _col0 - Position of Big Table: 0 - Select Operator - expressions: - expr: _col0 - type: string - outputColumnNames: _col0 - Group By Operator - aggregations: - expr: count(1) - bucketGroup: false - keys: - expr: _col0 - type: string - mode: hash - outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - Local Work: - Map Reduce Local Work - Stage: Stage-0 Fetch Operator limit: -1 diff --git ql/src/test/results/clientpositive/correlationoptimizer7.q.out ql/src/test/results/clientpositive/correlationoptimizer7.q.out index ea54431..d343720 100644 --- ql/src/test/results/clientpositive/correlationoptimizer7.q.out +++ ql/src/test/results/clientpositive/correlationoptimizer7.q.out @@ -527,13 +527,9 @@ POSTHOOK: Input: default@src1 PREHOOK: query: -- Without correlation optimizer, we will have 3 MR jobs. -- The first one is a MapJoin and Aggregation (in the Reduce Phase). -- The second one is another MapJoin. The third one is for ordering. --- With the correlation optimizer, right now, we still have --- 3 MR jobs. The first one is a MapJoin and the map-side aggregation (a map-only job). --- The second one have the reduce-side aggregation and the second join. --- The third one is for ordering. --- Although we have turned on hive.optimize.mapjoin.mapreduce, that optimizer --- can not handle the case that the MR job (the one which a map-only job will be merged in) --- has multiple inputs. We should improve that optimizer. +-- With the correlation optimizer, right now, we have +-- 2 MR jobs. The first one will evaluate the sub-query xx and the join of +-- xx and yy. The second one will do the ORDER BY. EXPLAIN SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt @@ -545,13 +541,9 @@ PREHOOK: type: QUERY POSTHOOK: query: -- Without correlation optimizer, we will have 3 MR jobs. -- The first one is a MapJoin and Aggregation (in the Reduce Phase). -- The second one is another MapJoin. The third one is for ordering. --- With the correlation optimizer, right now, we still have --- 3 MR jobs. The first one is a MapJoin and the map-side aggregation (a map-only job). --- The second one have the reduce-side aggregation and the second join. --- The third one is for ordering. --- Although we have turned on hive.optimize.mapjoin.mapreduce, that optimizer --- can not handle the case that the MR job (the one which a map-only job will be merged in) --- has multiple inputs. We should improve that optimizer. +-- With the correlation optimizer, right now, we have +-- 2 MR jobs. The first one will evaluate the sub-query xx and the join of +-- xx and yy. The second one will do the ORDER BY. 
EXPLAIN SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt @@ -565,9 +557,9 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-9 is a root stage - Stage-7 depends on stages: Stage-9 - Stage-8 depends on stages: Stage-7 - Stage-6 depends on stages: Stage-8 + Stage-4 depends on stages: Stage-9 + Stage-8 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-8 Stage-0 is a root stage STAGE PLANS: @@ -591,7 +583,7 @@ STAGE PLANS: 1 [Column[key]] Position of Big Table: 0 - Stage: Stage-7 + Stage: Stage-4 Map Reduce Alias -> Map Operator Tree: xx:x @@ -681,7 +673,7 @@ STAGE PLANS: 1 [Column[key]] Position of Big Table: 0 - Stage: Stage-6 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: $INTNAME @@ -801,8 +793,7 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-6 is a root stage - Stage-5 depends on stages: Stage-6 - Stage-1 depends on stages: Stage-5 + Stage-1 depends on stages: Stage-6 Stage-2 depends on stages: Stage-1 Stage-0 is a root stage @@ -827,7 +818,7 @@ STAGE PLANS: 1 [Column[key]] Position of Big Table: 0 - Stage: Stage-5 + Stage: Stage-1 Map Reduce Alias -> Map Operator Tree: xx:x @@ -859,31 +850,18 @@ STAGE PLANS: type: string mode: hash outputColumnNames: _col0, _col1 - File Output Operator - compressed: false - GlobalTableId: 0 - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - Local Work: - Map Reduce Local Work - - Stage: Stage-1 - Map Reduce - Alias -> Map Operator Tree: -#### A masked pattern was here #### - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: 0 - value expressions: - expr: _col1 - type: bigint + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col1 + type: bigint yy TableScan alias: yy @@ -901,6 +879,8 @@ STAGE PLANS: type: string expr: value type: string + Local Work: + Map Reduce Local Work Reduce Operator Tree: Demux Operator Group By Operator diff --git ql/src/test/results/clientpositive/multiMapJoin1.q.out ql/src/test/results/clientpositive/multiMapJoin1.q.out index 3b3eb3f..222b6dd 100644 --- ql/src/test/results/clientpositive/multiMapJoin1.q.out +++ ql/src/test/results/clientpositive/multiMapJoin1.q.out @@ -510,8 +510,8 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-7 is a root stage - Stage-6 depends on stages: Stage-7 - Stage-0 depends on stages: Stage-6 + Stage-2 depends on stages: Stage-7 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-7 @@ -549,7 +549,7 @@ STAGE PLANS: 1 [Column[value]] Position of Big Table: 0 - Stage: Stage-6 + Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: firstjoin:bigtbl @@ -572,44 +572,44 @@ STAGE PLANS: expr: _col1 type: string outputColumnNames: _col1 - Map Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 {key} - handleSkewJoin: false - keys: - 0 [Column[_col1]] - 1 [Column[value]] + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 {key} + handleSkewJoin: false + keys: + 0 [Column[_col1]] + 1 [Column[value]] + outputColumnNames: _col3 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col3 + type: string outputColumnNames: _col3 - Position of Big Table: 0 - Select Operator - expressions: + Group By Operator + aggregations: + 
expr: count() + bucketGroup: false + keys: expr: _col3 type: string - outputColumnNames: _col3 - Group By Operator - aggregations: - expr: count() - bucketGroup: false - keys: - expr: _col3 + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 type: string - mode: hash - outputColumnNames: _col0, _col1 - Reduce Output Operator - key expressions: - expr: _col0 - type: string - sort order: + - Map-reduce partition columns: - expr: _col0 - type: string - tag: -1 - value expressions: - expr: _col1 - type: bigint + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint Local Work: Map Reduce Local Work Reduce Operator Tree: @@ -1722,7 +1722,7 @@ ABSTRACT SYNTAX TREE: STAGE DEPENDENCIES: Stage-11 is a root stage - Stage-10 depends on stages: Stage-11 + Stage-4 depends on stages: Stage-11 Stage-0 is a root stage STAGE PLANS: @@ -1776,7 +1776,7 @@ STAGE PLANS: 1 [Column[key]] Position of Big Table: 0 - Stage: Stage-10 + Stage: Stage-4 Map Reduce Alias -> Map Operator Tree: secondjoin:firstjoin:bigtbl @@ -1801,47 +1801,47 @@ STAGE PLANS: expr: _col2 type: string outputColumnNames: _col1, _col2 - Map Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {_col1} - 1 - handleSkewJoin: false - keys: - 0 [Column[_col2]] - 1 [Column[value]] + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col1} + 1 + handleSkewJoin: false + keys: + 0 [Column[_col2]] + 1 [Column[value]] + outputColumnNames: _col1 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col1 + type: string outputColumnNames: _col1 - Position of Big Table: 0 - Select Operator - expressions: - expr: _col1 - type: string - outputColumnNames: _col1 - Map Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - handleSkewJoin: false - keys: - 0 [Column[_col1]] - 1 [Column[key]] - Position of Big Table: 0 - Select Operator - Group By Operator - aggregations: - expr: count() - bucketGroup: false - mode: hash - outputColumnNames: _col0 - Reduce Output Operator - sort order: - tag: -1 - value expressions: - expr: _col0 - type: bigint + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[_col1]] + 1 [Column[key]] + Position of Big Table: 0 + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint Local Work: Map Reduce Local Work Reduce Operator Tree: diff --git ql/src/test/results/clientpositive/multiMapJoin2.q.out ql/src/test/results/clientpositive/multiMapJoin2.q.out new file mode 100644 index 0000000..1d757f9 --- /dev/null +++ ql/src/test/results/clientpositive/multiMapJoin2.q.out @@ -0,0 +1,2943 @@ +PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false, we will generate two Map-only jobs +-- and one MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false, we will generate two Map-only jobs +-- and one MR job. 
+EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x1) (TOK_TABREF (TOK_TABNAME src1) y1) (= (. (TOK_TABLE_OR_COL x1) key) (. (TOK_TABLE_OR_COL y1) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key))))) tmp)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp) key))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL tmp) key))))) + +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-6 depends on stages: Stage-8 + Stage-2 depends on stages: Stage-6, Stage-7 + Stage-9 is a root stage + Stage-7 depends on stages: Stage-9 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:tmp-subquery2:y2 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:tmp-subquery2:y2 + TableScan + alias: y2 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:tmp-subquery2:x2 + TableScan + alias: x2 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-9 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery1:tmp-subquery1:y1 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery1:tmp-subquery1:y1 + TableScan + alias: y1 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: 
false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:tmp-subquery1:x1 + TableScan + alias: x1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +#### A masked pattern was here #### +128 +128 +128 +128 +128 +128 +146 +146 +146 +146 +150 +150 +213 +213 +213 +213 +224 +224 +224 +224 +238 +238 +238 +238 +255 +255 +255 +255 +273 +273 +273 +273 +273 +273 +278 +278 +278 +278 +311 +311 +311 +311 +311 +311 +369 +369 +369 +369 +369 +369 +401 +401 +401 +401 +401 +401 +401 +401 +401 +401 +406 +406 +406 +406 +406 +406 +406 +406 +66 +66 +98 +98 +98 +98 +PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true, we will generate one MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true, we will generate one MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x1) (TOK_TABREF (TOK_TABNAME src1) y1) (= (. (TOK_TABLE_OR_COL x1) key) (. (TOK_TABLE_OR_COL y1) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key))))) tmp)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp) key))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. 
(TOK_TABLE_OR_COL tmp) key))))) + +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery1:tmp-subquery1:y1 + Fetch Operator + limit: -1 + null-subquery2:tmp-subquery2:y2 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery1:tmp-subquery1:y1 + TableScan + alias: y1 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + null-subquery2:tmp-subquery2:y2 + TableScan + alias: y2 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:tmp-subquery1:x1 + TableScan + alias: x1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + null-subquery2:tmp-subquery2:x2 + TableScan + alias: x2 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +#### A masked pattern was here #### +128 +128 +128 +128 +128 +128 +146 +146 +146 +146 +150 +150 +213 +213 +213 +213 +224 +224 +224 +224 +238 +238 +238 +238 +255 +255 +255 +255 +273 +273 +273 +273 +273 +273 +278 +278 +278 +278 +311 +311 +311 +311 +311 +311 +369 +369 +369 +369 +369 +369 +401 +401 +401 +401 +401 +401 +401 +401 +401 +401 +406 +406 +406 +406 +406 +406 +406 +406 +66 +66 +98 +98 +98 +98 +PREHOOK: query: -- It is possible that after merge, we can have shared scan for the big 
table. +-- In this case, multiple MapJoins involving this big table can happen in the +-- same Map task. So, when we find we can have shared scan on the big table, +-- we need to check the total size of local tables. If this size is +-- larger than the limit that +-- we set through hive.auto.convert.join.noconditionaltask.size (right now, it is +-- 400 bytes), we will not do the merge. +-- For this query, we will merge the MapJoin of x2 and y2 into the MR job +-- for UNION ALL and ORDER BY. But the MapJoin of x1 and y1 will not be merged +-- into that MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +POSTHOOK: query: -- It is possible that after merge, we can have shared scan for the big table. +-- In this case, multiple MapJoins involving this big table can happen in the +-- same Map task. So, when we find we can have shared scan on the big table, +-- we need to check the total size of local tables. If this size is +-- larger than the limit that +-- we set through hive.auto.convert.join.noconditionaltask.size (right now, it is +-- 400 bytes), we will not do the merge. +-- For this query, we will merge the MapJoin of x2 and y2 into the MR job +-- for UNION ALL and ORDER BY. But the MapJoin of x1 and y1 will not be merged +-- into that MR job. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x1) (TOK_TABREF (TOK_TABNAME src1) y1) (= (. (TOK_TABLE_OR_COL x1) key) (. (TOK_TABLE_OR_COL y1) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key))))) tmp)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp) key))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (.
(TOK_TABLE_OR_COL tmp) key))))) + +STAGE DEPENDENCIES: + Stage-9 is a root stage + Stage-7 depends on stages: Stage-9 + Stage-8 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-8 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-9 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery1:tmp-subquery1:y1 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery1:tmp-subquery1:y1 + TableScan + alias: y1 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:tmp-subquery1:x1 + TableScan + alias: x1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:tmp-subquery2:y2 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:tmp-subquery2:y2 + TableScan + alias: y2 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + null-subquery2:tmp-subquery2:x2 + TableScan + alias: x2 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +POSTHOOK: 
Input: default@src +POSTHOOK: Input: default@src1 +#### A masked pattern was here #### +128 +128 +128 +128 +128 +128 +146 +146 +146 +146 +150 +150 +213 +213 +213 +213 +224 +224 +224 +224 +238 +238 +238 +238 +255 +255 +255 +255 +273 +273 +273 +273 +273 +273 +278 +278 +278 +278 +311 +311 +311 +311 +311 +311 +369 +369 +369 +369 +369 +369 +401 +401 +401 +401 +401 +401 +401 +401 +401 +401 +406 +406 +406 +406 +406 +406 +406 +406 +66 +66 +98 +98 +98 +98 +PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false, we will use three jobs in total. +-- We will generate one MR job for GROUP BY +-- on x1, one Map-only job for the MapJoin of x2 and y2, and one MR job +-- for the UNION ALL and ORDER BY. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false, we will use three jobs in total. +-- We will generate one MR job for GROUP BY +-- on x1, one Map-only job for the MapJoin of x2 and y2, and one MR job +-- for the UNION ALL and ORDER BY. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src1) x1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x1) key)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key))))) tmp)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp) key))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (.
(TOK_TABLE_OR_COL tmp) key))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-2 depends on stages: Stage-4, Stage-5 + Stage-6 is a root stage + Stage-5 depends on stages: Stage-6 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:tmp-subquery1:x1 + TableScan + alias: x1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:tmp-subquery2:y2 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:tmp-subquery2:y2 + TableScan + alias: y2 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:tmp-subquery2:x2 + TableScan + alias: x2 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 
x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +#### A masked pattern was here #### + +128 +128 +128 +128 +146 +146 +146 +150 +150 +213 +213 +213 +224 +224 +224 +238 +238 +238 +255 +255 +255 +273 +273 +273 +273 +278 +278 +278 +311 +311 +311 +311 +369 +369 +369 +369 +401 +401 +401 +401 +401 +401 +406 +406 +406 +406 +406 +66 +66 +98 +98 +98 +PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true, we will use two jobs. +-- We will generate one MR job for GROUP BY +-- on x1, one MR job for the MapJoin of x2 and y2, the UNION ALL, and the +-- ORDER BY. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true, we will use two jobs. +-- We will generate one MR job for GROUP BY +-- on x1, one MR job for the MapJoin of x2 and y2, the UNION ALL, and the +-- ORDER BY. +EXPLAIN +SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src1) x1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x1) key)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key))))) tmp)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp) key))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (.
(TOK_TABLE_OR_COL tmp) key))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-6 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-6 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:tmp-subquery1:x1 + TableScan + alias: x1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-6 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:tmp-subquery2:y2 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:tmp-subquery2:y2 + TableScan + alias: y2 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + null-subquery2:tmp-subquery2:x2 + TableScan + alias: x2 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Union + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + tag: -1 + value expressions: + expr: _col0 + type: string + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT tmp.key +FROM (SELECT x1.key AS key FROM src1 x1 GROUP BY x1.key + UNION ALL + SELECT x2.key AS key FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)) tmp +ORDER BY tmp.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +#### A masked pattern was here #### + +128 +128 +128 +128 +146 +146 +146 +150 +150 +213 +213 +213 +224 +224 +224 +238 +238 +238 +255 +255 +255 +273 
+273 +273 +273 +278 +278 +278 +311 +311 +311 +311 +369 +369 +369 +369 +401 +401 +401 +401 +401 +401 +406 +406 +406 +406 +406 +66 +66 +98 +98 +98 +PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false and Correlation Optimizer is disabled, +-- we will use 7 jobs. +-- We will generate one Map-only job for the MapJoin of x1 and y1, +-- one Map-only job for the Map-Join of x2 and y2, +-- one MR job for the aggregation in the sub-query tmp1, +-- one MR job for the aggregation in the sub-query tmp2, +-- one MR job for the Join of tmp1 and tmp2, +-- one MR job for aggregation on the result of the Join of tmp1 and tmp2, +-- and one MR job for the ORDER BY. +EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +PREHOOK: type: QUERY +POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false and Correlation Optimizer is disabled, +-- we will use 7 jobs. +-- We will generate one Map-only job for the MapJoin of x1 and y1, +-- one Map-only job for the Map-Join of x2 and y2, +-- one MR job for the aggregation in the sub-query tmp1, +-- one MR job for the aggregation in the sub-query tmp2, +-- one MR job for the Join of tmp1 and tmp2, +-- one MR job for aggregation on the result of the Join of tmp1 and tmp2, +-- and one MR job for the ORDER BY. +EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x1) (TOK_TABREF (TOK_TABNAME src1) y1) (= (. (TOK_TABLE_OR_COL x1) key) (. (TOK_TABLE_OR_COL y1) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x1) key)))) tmp1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x2) key)))) tmp2) (= (. (TOK_TABLE_OR_COL tmp1) key) (. (TOK_TABLE_OR_COL tmp2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp1) key) key) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt)) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL tmp1) key)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)) (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL cnt))))) + +STAGE DEPENDENCIES: + Stage-17 is a root stage + Stage-13 depends on stages: Stage-17 + Stage-2 depends on stages: Stage-13 + Stage-12 depends on stages: Stage-2, Stage-8 , consists of Stage-15, Stage-16, Stage-3 + Stage-15 has a backup stage: Stage-3 + Stage-10 depends on stages: Stage-15 + Stage-4 depends on stages: Stage-3, Stage-10, Stage-11 + Stage-5 depends on stages: Stage-4 + Stage-16 has a backup stage: Stage-3 + Stage-11 depends on stages: Stage-16 + Stage-3 + Stage-18 is a root stage + Stage-14 depends on stages: Stage-18 + Stage-8 depends on stages: Stage-14 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-17 + Map Reduce Local Work + Alias -> Map Local Tables: + tmp2:y2 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + tmp2:y2 + TableScan + alias: y2 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-13 + Map Reduce + Alias -> Map Operator Tree: + tmp2:x2 + TableScan + alias: x2 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-12 + Conditional Operator + + Stage: Stage-15 + Map Reduce Local Work + Alias -> Map Local Tables: + $INTNAME + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $INTNAME + HashTable Sink Operator + condition expressions: + 0 {_col0} + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 0 + + Stage: Stage-10 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + 
GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + sort order: ++ + tag: -1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-16 + Map Reduce Local Work + Alias -> Map Local Tables: + $INTNAME1 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $INTNAME1 + HashTable Sink Operator + condition expressions: + 0 {_col0} + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + Position of Big Table: 1 + + Stage: Stage-11 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} + 1 + handleSkewJoin: false + keys: + 0 [Column[_col0]] + 1 [Column[_col0]] + outputColumnNames: _col0 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: string + mode: 
hash + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-18 + Map Reduce Local Work + Alias -> Map Local Tables: + tmp1:y1 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + tmp1:y1 + TableScan + alias: y1 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-14 + Map Reduce + Alias -> Map Operator Tree: + tmp1:x1 + TableScan + alias: x1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +#### A masked pattern was here #### +128 1 +146 1 +150 1 +213 1 +224 1 +238 1 +255 1 +273 1 +278 1 +311 1 +369 1 +401 1 +406 1 +66 1 +98 1 +PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true and Correlation Optimizer is enabled, +-- we will use two jobs. The first MR job will evaluate the sub-queries tmp1 and tmp2, +-- the Join of tmp1 and tmp2, and the aggregation on the result of the Join of +-- tmp1 and tmp2. The second job will do the ORDER BY. 
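+-- (A sketch, stated as an assumption since the .q source is not part of this diff: +-- the two-job case below presumably runs with +-- set hive.optimize.correlation=true; +-- set hive.optimize.mapjoin.mapreduce=true; +-- before issuing the EXPLAIN below.)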
+EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +PREHOOK: type: QUERY +POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true and Correlation Optimizer is enabled, +-- we will use two jobs. The first MR job will evaluate the sub-queries tmp1 and tmp2, +-- the Join of tmp1 and tmp2, and the aggregation on the result of the Join of +-- tmp1 and tmp2. The second job will do the ORDER BY. +EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x1) (TOK_TABREF (TOK_TABNAME src1) y1) (= (. (TOK_TABLE_OR_COL x1) key) (. (TOK_TABLE_OR_COL y1) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x1) key)))) tmp1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x2) key)))) tmp2) (= (. (TOK_TABLE_OR_COL tmp1) key) (. (TOK_TABLE_OR_COL tmp2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp1) key) key) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt)) (TOK_GROUPBY (.
(TOK_TABLE_OR_COL tmp1) key)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)) (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL cnt))))) + +STAGE DEPENDENCIES: + Stage-9 is a root stage + Stage-2 depends on stages: Stage-9 + Stage-3 depends on stages: Stage-2 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-9 + Map Reduce Local Work + Alias -> Map Local Tables: + tmp1:y1 + Fetch Operator + limit: -1 + tmp2:y2 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + tmp1:y1 + TableScan + alias: y1 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + tmp2:y2 + TableScan + alias: y2 + HashTable Sink Operator + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + tmp1:x1 + TableScan + alias: x1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + tmp2:x2 + TableScan + alias: x2 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 1 + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Demux Operator + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Mux Operator + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Mux Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: string + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Mux Operator + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 
0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Mux Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: _col0 + type: string + mode: complete + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + sort order: ++ + tag: -1 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + Reduce Operator Tree: + Extract + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Input: default@src1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src x1 JOIN src1 y1 ON (x1.key = y1.key) + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Input: default@src1 +#### A masked pattern was here #### +128 1 +146 1 +150 1 +213 1 +224 1 +238 1 +255 1 +273 1 +278 1 +311 1 +369 1 +401 1 +406 1 +66 1 +98 1 +PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false and Correlation Optimizer is disabled, +-- we will use six jobs. +-- We will generate one MR job to evaluate the sub-query tmp1, +-- one Map-only job for the MapJoin of x2 and y2, +-- one MR job for the aggregation in the sub-query tmp2, +-- one MR job for the Join of tmp1 and tmp2, +-- one MR job for the aggregation on the result of the Join of tmp1 and tmp2, +-- and one MR job for the ORDER BY. +EXPLAIN +SELECT tmp1.key as key, count(*) as cnt +FROM (SELECT x1.key AS key + FROM src1 x1 + GROUP BY x1.key) tmp1 +JOIN (SELECT x2.key AS key + FROM src x2 JOIN src1 y2 ON (x2.key = y2.key) + GROUP BY x2.key) tmp2 +ON (tmp1.key = tmp2.key) +GROUP BY tmp1.key +ORDER BY key, cnt +PREHOOK: type: QUERY +POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=false and Correlation Optimizer is disabled, +-- we will use six jobs. +-- We will generate one MR job to evaluate the sub-query tmp1, +-- one Map-only job for the MapJoin of x2 and y2, +-- one MR job for the aggregation in the sub-query tmp2, +-- one MR job for the Join of tmp1 and tmp2, +-- one MR job for the aggregation on the result of the Join of tmp1 and tmp2, +-- and one MR job for the ORDER BY. 
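+-- (A sketch, stated as an assumption since the .q source is not part of this diff: +-- the six-job case below presumably runs with +-- set hive.optimize.correlation=false; +-- set hive.optimize.mapjoin.mapreduce=false; +-- before issuing the EXPLAIN below.)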
+EXPLAIN
+SELECT tmp1.key as key, count(*) as cnt
+FROM (SELECT x1.key AS key
+      FROM src1 x1
+      GROUP BY x1.key) tmp1
+JOIN (SELECT x2.key AS key
+      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
+      GROUP BY x2.key) tmp2
+ON (tmp1.key = tmp2.key)
+GROUP BY tmp1.key
+ORDER BY key, cnt
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src1) x1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x1) key)))) tmp1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x2) key)))) tmp2) (= (. (TOK_TABLE_OR_COL tmp1) key) (. (TOK_TABLE_OR_COL tmp2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp1) key) key) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL tmp1) key)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)) (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL cnt)))))
+
+STAGE DEPENDENCIES:
+  Stage-7 is a root stage
+  Stage-10 depends on stages: Stage-2, Stage-7 , consists of Stage-12, Stage-13, Stage-3
+  Stage-12 has a backup stage: Stage-3
+  Stage-8 depends on stages: Stage-12
+  Stage-4 depends on stages: Stage-3, Stage-8, Stage-9
+  Stage-5 depends on stages: Stage-4
+  Stage-13 has a backup stage: Stage-3
+  Stage-9 depends on stages: Stage-13
+  Stage-3
+  Stage-14 is a root stage
+  Stage-11 depends on stages: Stage-14
+  Stage-2 depends on stages: Stage-11
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-7
+    Map Reduce
+      Alias -> Map Operator Tree:
+        tmp1:x1 
+          TableScan
+            alias: x1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: +
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                  tag: -1
+      Reduce Operator Tree:
+        Group By Operator
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+  Stage: Stage-10
+    Conditional Operator
+
+  Stage: Stage-12
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        $INTNAME 
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        $INTNAME 
+            HashTable Sink Operator
+              condition expressions:
+                0 {_col0}
+                1 
+              handleSkewJoin: false
+              keys:
+                0 [Column[_col0]]
+                1 [Column[_col0]]
+              Position of Big Table: 0
+
+  Stage: Stage-8
+    Map Reduce
+      Alias -> Map Operator Tree:
+        $INTNAME1 
+          Map Join Operator
+            condition map:
+                 Inner Join 0 to 1
+            condition expressions:
+              0 {_col0}
+              1 
+            handleSkewJoin: false
+            keys:
+              0 [Column[_col0]]
+              1 [Column[_col0]]
+            outputColumnNames: _col0
+            Position of Big Table: 0
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+              outputColumnNames: _col0
+              Group By Operator
+                aggregations:
+                      expr: count()
+                bucketGroup: false
+                keys:
+                      expr: _col0
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+      Local Work:
+        Map Reduce Local Work
+
+  Stage: Stage-4
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          Reduce Output Operator
+            key expressions:
+                  expr: _col0
+                  type: string
+            sort order: +
+            Map-reduce partition columns:
+                  expr: _col0
+                  type: string
+            tag: -1
+            value expressions:
+                  expr: _col1
+                  type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+  Stage: Stage-5
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          Reduce Output Operator
+            key expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+            sort order: ++
+            tag: -1
+            value expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+      Reduce Operator Tree:
+        Extract
+          File Output Operator
+            compressed: false
+            GlobalTableId: 0
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-13
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        $INTNAME1 
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        $INTNAME1 
+            HashTable Sink Operator
+              condition expressions:
+                0 {_col0}
+                1 
+              handleSkewJoin: false
+              keys:
+                0 [Column[_col0]]
+                1 [Column[_col0]]
+              Position of Big Table: 1
+
+  Stage: Stage-9
+    Map Reduce
+      Alias -> Map Operator Tree:
+        $INTNAME 
+          Map Join Operator
+            condition map:
+                 Inner Join 0 to 1
+            condition expressions:
+              0 {_col0}
+              1 
+            handleSkewJoin: false
+            keys:
+              0 [Column[_col0]]
+              1 [Column[_col0]]
+            outputColumnNames: _col0
+            Position of Big Table: 1
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+              outputColumnNames: _col0
+              Group By Operator
+                aggregations:
+                      expr: count()
+                bucketGroup: false
+                keys:
+                      expr: _col0
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+      Local Work:
+        Map Reduce Local Work
+
+  Stage: Stage-3
+    Map Reduce
+      Alias -> Map Operator Tree:
+        $INTNAME 
+          Reduce Output Operator
+            key expressions:
+                  expr: _col0
+                  type: string
+            sort order: +
+            Map-reduce partition columns:
+                  expr: _col0
+                  type: string
+            tag: 1
+        $INTNAME1 
+          Reduce Output Operator
+            key expressions:
+                  expr: _col0
+                  type: string
+            sort order: +
+            Map-reduce partition columns:
+                  expr: _col0
+                  type: string
+            tag: 0
+            value expressions:
+                  expr: _col0
+                  type: string
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          condition expressions:
+            0 {VALUE._col0}
+            1 
+          handleSkewJoin: false
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+            outputColumnNames: _col0
+            Group By Operator
+              aggregations:
+                    expr: count()
+              bucketGroup: false
+              keys:
+                    expr: _col0
+                    type: string
+              mode: hash
+              outputColumnNames: _col0, _col1
+              File Output Operator
+                compressed: false
+                GlobalTableId: 0
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+  Stage: Stage-14
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        tmp2:y2 
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        tmp2:y2 
+          TableScan
+            alias: y2
+            HashTable Sink Operator
+              condition expressions:
+                0 {key}
+                1 
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              Position of Big Table: 0
+
+  Stage: Stage-11
+    Map Reduce
+      Alias -> Map Operator Tree:
+        tmp2:x2 
+          TableScan
+            alias: x2
+            Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {key}
+                1 
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              outputColumnNames: _col0
+              Position of Big Table: 0
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: string
+                outputColumnNames: _col0
+                Group By Operator
+                  bucketGroup: false
+                  keys:
+                        expr: _col0
+                        type: string
+                  mode: hash
+                  outputColumnNames: _col0
+                  File Output Operator
+                    compressed: false
+                    GlobalTableId: 0
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+      Local Work:
+        Map Reduce Local Work
+
+  Stage: Stage-2
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          Reduce Output Operator
+            key expressions:
+                  expr: _col0
+                  type: string
+            sort order: +
+            Map-reduce partition columns:
+                  expr: _col0
+                  type: string
+            tag: -1
+      Reduce Operator Tree:
+        Group By Operator
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT tmp1.key as key, count(*) as cnt
+FROM (SELECT x1.key AS key
+      FROM src1 x1
+      GROUP BY x1.key) tmp1
+JOIN (SELECT x2.key AS key
+      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
+      GROUP BY x2.key) tmp2
+ON (tmp1.key = tmp2.key)
+GROUP BY tmp1.key
+ORDER BY key, cnt
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Input: default@src1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT tmp1.key as key, count(*) as cnt
+FROM (SELECT x1.key AS key
+      FROM src1 x1
+      GROUP BY x1.key) tmp1
+JOIN (SELECT x2.key AS key
+      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
+      GROUP BY x2.key) tmp2
+ON (tmp1.key = tmp2.key)
+GROUP BY tmp1.key
+ORDER BY key, cnt
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@src1
+#### A masked pattern was here ####
+128	1
+146	1
+150	1
+213	1
+224	1
+238	1
+255	1
+273	1
+278	1
+311	1
+369	1
+401	1
+406	1
+66	1
+98	1
+PREHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true and Correlation Optimizer is enabled,
+-- we will use two jobs. The first MR job will evaluate the sub-queries tmp1 and tmp2,
+-- the Join of tmp1 and tmp2, and the aggregation on the result of the Join of
+-- tmp1 and tmp2. The second job will do the ORDER BY.
+EXPLAIN
+SELECT tmp1.key as key, count(*) as cnt
+FROM (SELECT x1.key AS key
+      FROM src1 x1
+      GROUP BY x1.key) tmp1
+JOIN (SELECT x2.key AS key
+      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
+      GROUP BY x2.key) tmp2
+ON (tmp1.key = tmp2.key)
+GROUP BY tmp1.key
+ORDER BY key, cnt
+PREHOOK: type: QUERY
+POSTHOOK: query: -- When hive.optimize.mapjoin.mapreduce=true and Correlation Optimizer is enabled,
+-- we will use two jobs. The first MR job will evaluate the sub-queries tmp1 and tmp2,
+-- the Join of tmp1 and tmp2, and the aggregation on the result of the Join of
+-- tmp1 and tmp2. The second job will do the ORDER BY.
+EXPLAIN
+SELECT tmp1.key as key, count(*) as cnt
+FROM (SELECT x1.key AS key
+      FROM src1 x1
+      GROUP BY x1.key) tmp1
+JOIN (SELECT x2.key AS key
+      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
+      GROUP BY x2.key) tmp2
+ON (tmp1.key = tmp2.key)
+GROUP BY tmp1.key
+ORDER BY key, cnt
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src1) x1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x1) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x1) key)))) tmp1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME src) x2) (TOK_TABREF (TOK_TABNAME src1) y2) (= (. (TOK_TABLE_OR_COL x2) key) (. (TOK_TABLE_OR_COL y2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x2) key) key)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x2) key)))) tmp2) (= (. (TOK_TABLE_OR_COL tmp1) key) (. (TOK_TABLE_OR_COL tmp2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL tmp1) key) key) (TOK_SELEXPR (TOK_FUNCTIONSTAR count) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL tmp1) key)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)) (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL cnt)))))
+
+STAGE DEPENDENCIES:
+  Stage-7 is a root stage
+  Stage-2 depends on stages: Stage-7
+  Stage-3 depends on stages: Stage-2
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-7
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        tmp2:y2 
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        tmp2:y2 
+          TableScan
+            alias: y2
+            HashTable Sink Operator
+              condition expressions:
+                0 {key}
+                1 
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              Position of Big Table: 0
+
+  Stage: Stage-2
+    Map Reduce
+      Alias -> Map Operator Tree:
+        tmp1:x1 
+          TableScan
+            alias: x1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: +
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                  tag: 0
+        tmp2:x2 
+          TableScan
+            alias: x2
+            Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {key}
+                1 
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              outputColumnNames: _col0
+              Position of Big Table: 0
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: string
+                outputColumnNames: _col0
+                Group By Operator
+                  bucketGroup: false
+                  keys:
+                        expr: _col0
+                        type: string
+                  mode: hash
+                  outputColumnNames: _col0
+                  Reduce Output Operator
+                    key expressions:
+                          expr: _col0
+                          type: string
+                    sort order: +
+                    Map-reduce partition columns:
+                          expr: _col0
+                          type: string
+                    tag: 1
+      Local Work:
+        Map Reduce Local Work
+      Reduce Operator Tree:
+        Demux Operator
+          Group By Operator
+            bucketGroup: false
+            keys:
+                  expr: KEY._col0
+                  type: string
+            mode: mergepartial
+            outputColumnNames: _col0
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+              outputColumnNames: _col0
+              Mux Operator
+                Join Operator
+                  condition map:
+                       Inner Join 0 to 1
+                  condition expressions:
+                    0 {VALUE._col0}
+                    1 
+                  handleSkewJoin: false
+                  outputColumnNames: _col0
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                    outputColumnNames: _col0
+                    Mux Operator
+                      Group By Operator
+                        aggregations:
+                              expr: count()
+                        bucketGroup: false
+                        keys:
+                              expr: _col0
+                              type: string
+                        mode: complete
+                        outputColumnNames: _col0, _col1
+                        Select Operator
+                          expressions:
+                                expr: _col0
+                                type: string
+                                expr: _col1
+                                type: bigint
+                          outputColumnNames: _col0, _col1
+                          File Output Operator
+                            compressed: false
+                            GlobalTableId: 0
+                            table:
+                                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+          Group By Operator
+            bucketGroup: false
+            keys:
+                  expr: KEY._col0
+                  type: string
+            mode: mergepartial
+            outputColumnNames: _col0
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+              outputColumnNames: _col0
+              Mux Operator
+                Join Operator
+                  condition map:
+                       Inner Join 0 to 1
+                  condition expressions:
+                    0 {VALUE._col0}
+                    1 
+                  handleSkewJoin: false
+                  outputColumnNames: _col0
+                  Select Operator
+                    expressions:
+                          expr: _col0
+                          type: string
+                    outputColumnNames: _col0
+                    Mux Operator
+                      Group By Operator
+                        aggregations:
+                              expr: count()
+                        bucketGroup: false
+                        keys:
+                              expr: _col0
+                              type: string
+                        mode: complete
+                        outputColumnNames: _col0, _col1
+                        Select Operator
+                          expressions:
+                                expr: _col0
+                                type: string
+                                expr: _col1
+                                type: bigint
+                          outputColumnNames: _col0, _col1
+                          File Output Operator
+                            compressed: false
+                            GlobalTableId: 0
+                            table:
+                                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+  Stage: Stage-3
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+          Reduce Output Operator
+            key expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+            sort order: ++
+            tag: -1
+            value expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+      Reduce Operator Tree:
+        Extract
+          File Output Operator
+            compressed: false
+            GlobalTableId: 0
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: SELECT tmp1.key as key, count(*) as cnt
+FROM (SELECT x1.key AS key
+      FROM src1 x1
+      GROUP BY x1.key) tmp1
+JOIN (SELECT x2.key AS key
+      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
+      GROUP BY x2.key) tmp2
+ON (tmp1.key = tmp2.key)
+GROUP BY tmp1.key
+ORDER BY key, cnt
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Input: default@src1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT tmp1.key as key, count(*) as cnt
+FROM (SELECT x1.key AS key
+      FROM src1 x1
+      GROUP BY x1.key) tmp1
+JOIN (SELECT x2.key AS key
+      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
+      GROUP BY x2.key) tmp2
+ON (tmp1.key = tmp2.key)
+GROUP BY tmp1.key
+ORDER BY key, cnt
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@src1
+#### A masked pattern was here ####
+128	1
+146	1
+150	1
+213	1
+224	1
+238	1
+255	1
+273	1
+278	1
+311	1
+369	1
+401	1
+406	1
+66	1
+98	1
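
For readers who want to try the two plans above interactively, the following is a minimal sketch of the session setup. Only hive.optimize.mapjoin.mapreduce is named in the test comments; the other flags are assumptions: hive.optimize.correlation is taken to be the Correlation Optimizer switch, and hive.auto.convert.join the setting that produced the map-join conversions shown in the plans.

-- Minimal sketch (HiveQL). Flags other than hive.optimize.mapjoin.mapreduce
-- are assumptions, as noted above.
set hive.auto.convert.join=true;          -- assumed: convert common joins to map joins
set hive.optimize.mapjoin.mapreduce=true; -- merge a map-join task into its follow-on MR task
set hive.optimize.correlation=true;       -- assumed: enables the Demux/Mux plan shown above

EXPLAIN
SELECT tmp1.key as key, count(*) as cnt
FROM (SELECT x1.key AS key
      FROM src1 x1
      GROUP BY x1.key) tmp1
JOIN (SELECT x2.key AS key
      FROM src x2 JOIN src1 y2 ON (x2.key = y2.key)
      GROUP BY x2.key) tmp2
ON (tmp1.key = tmp2.key)
GROUP BY tmp1.key
ORDER BY key, cnt;

With the correlation flag off, the first, multi-stage plan (conditional map-join tasks plus separate aggregation and ORDER BY jobs) is expected; with it on, the single Stage-2 job with the Demux/Mux operator tree plus one ORDER BY job is expected.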