diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 31764d9..062e520 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -686,20 +686,6 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { HIVEALIAS("hive.alias", "", ""), HIVEMAPSIDEAGGREGATE("hive.map.aggr", true, "Whether to use map-side aggregation in Hive Group By queries"), HIVEGROUPBYSKEW("hive.groupby.skewindata", false, "Whether there is skew in data to optimize group by queries"), - HIVE_OPTIMIZE_MULTI_GROUPBY_COMMON_DISTINCTS("hive.optimize.multigroupby.common.distincts", true, - "Whether to optimize a multi-groupby query with the same distinct.\n" + - "Consider a query like:\n" + - "\n" + - " from src\n" + - " insert overwrite table dest1 select col1, count(distinct colx) group by col1\n" + - " insert overwrite table dest2 select col2, count(distinct colx) group by col2;\n" + - "\n" + - "With this parameter set to true, first we spray by the distinct value (colx), and then\n" + - "perform the 2 groups bys. This makes sense if map-side aggregation is turned off. However,\n" + - "with maps-side aggregation, it might be useful in some cases to treat the 2 inserts independently, \n" + - "thereby performing the query above in 2MR jobs instead of 3 (due to spraying by distinct key first).\n" + - "If this parameter is turned off, we don't consider the fact that the distinct key is the same across\n" + - "different MR jobs."), HIVEJOINEMITINTERVAL("hive.join.emit.interval", 1000, "How many rows in the right-most join operand Hive should buffer before emitting the join result."), HIVEJOINCACHESIZE("hive.join.cache.size", 25000, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java index 2e40556..7918194 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java @@ -80,7 +80,7 @@ private transient ObjectInspector[][] aggregationParameterObjectInspectors; private transient ObjectInspector[][] aggregationParameterStandardObjectInspectors; private transient Object[][] aggregationParameterObjects; - + // so aggregationIsDistinct is a boolean array instead of a single number. private transient boolean[] aggregationIsDistinct; // Map from integer tag to distinct aggrs @@ -107,17 +107,8 @@ // Used by hash-based GroupBy: Mode = HASH, PARTIALS private transient HashMap hashAggregations; - // Used by hash distinct aggregations when hashGrpKeyNotRedKey is true - private transient HashSet keysCurrentGroup; - private transient boolean firstRow; private transient boolean hashAggr; - // The reduction is happening on the reducer, and the grouping key and - // reduction keys are different. - // For example: select a, count(distinct b) from T group by a - // The data is sprayed by 'b' and the reducer is grouping it by 'a' - private transient boolean groupKeyIsNotReduceKey; - private transient boolean firstRowInGroup; private transient long numRowsInput; private transient long numRowsHashTbl; private transient int groupbyMapAggrInterval; @@ -133,8 +124,8 @@ private transient boolean groupingSetsPresent; // generates grouping set private transient int groupingSetsPosition; // position of grouping set, generally the last of keys - private transient List groupingSets; // declared grouping set values - private transient FastBitSet[] groupingSetsBitSet; // bitsets acquired from grouping set values + private transient List groupingSets; // declared grouping set values + private transient FastBitSet[] groupingSetsBitSet; // bitsets acquired from grouping set values private transient Text[] newKeysGroupingSets; // for these positions, some variable primitive type (String) is used, so size @@ -362,10 +353,6 @@ protected void initializeOp(Configuration hconf) throws HiveException { numRowsCompareHashAggr = groupbyMapAggrInterval; minReductionHashAggr = HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION); - groupKeyIsNotReduceKey = conf.getGroupKeyNotReductionKey(); - if (groupKeyIsNotReduceKey) { - keysCurrentGroup = new HashSet(); - } } List fieldNames = new ArrayList(conf.getOutputColumnNames()); @@ -375,7 +362,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { outputKeyLength = conf.pruneGroupingSetId() ? keyFields.length - 1 : keyFields.length; // init objectInspectors - ObjectInspector[] objectInspectors = + ObjectInspector[] objectInspectors = new ObjectInspector[outputKeyLength + aggregationEvaluators.length]; for (int i = 0; i < outputKeyLength; i++) { objectInspectors[i] = currentKeyObjectInspectors[i]; @@ -696,19 +683,6 @@ protected void updateAggregations(AggregationBuffer[] aggs, Object row, } } - @Override - public void startGroup() throws HiveException { - firstRowInGroup = true; - super.startGroup(); - } - - @Override - public void endGroup() throws HiveException { - if (groupKeyIsNotReduceKey) { - keysCurrentGroup.clear(); - } - } - private void processKey(Object row, ObjectInspector rowInspector) throws HiveException { if (hashAggr) { @@ -718,8 +692,6 @@ private void processKey(Object row, processAggr(row, rowInspector, newKeys); } - firstRowInGroup = false; - if (countAfterReport != 0 && (countAfterReport % heartbeatInterval) == 0 && (reporter != null)) { reporter.progress(); @@ -732,7 +704,7 @@ public void processOp(Object row, int tag) throws HiveException { firstRow = false; ObjectInspector rowInspector = inputObjInspectors[tag]; // Total number of input rows is needed for hash aggregation only - if (hashAggr && !groupKeyIsNotReduceKey) { + if (hashAggr) { numRowsInput++; // if hash aggregation is not behaving properly, disable it if (numRowsInput == numRowsCompareHashAggr) { @@ -808,15 +780,6 @@ private void processHashAggr(Object row, ObjectInspector rowInspector, numRowsHashTbl++; // new entry in the hash table } - // If the grouping key and the reduction key are different, a set of - // grouping keys for the current reduction key are maintained in - // keysCurrentGroup - // Peek into the set to find out if a new grouping key is seen for the given - // reduction key - if (groupKeyIsNotReduceKey) { - newEntryForHashAggr = keysCurrentGroup.add(newKeys.copyKey()); - } - // Update the aggs updateAggregations(aggs, row, rowInspector, true, newEntryForHashAggr, null); @@ -826,10 +789,7 @@ private void processHashAggr(Object row, ObjectInspector rowInspector, // Based on user-specified parameters, check if the hash table needs to be // flushed. - // If the grouping key is not the same as reduction key, flushing can only - // happen at boundaries - if ((!groupKeyIsNotReduceKey || firstRowInGroup) - && shouldBeFlushed(newKeys)) { + if ( shouldBeFlushed(newKeys)) { flushHashTable(false); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 428d2ab..aca4273 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -1086,10 +1086,6 @@ private boolean validateGroupByOperator(GroupByOperator op, boolean isReduce, bo return false; } LOG.info("Reduce GROUP BY mode is " + desc.getMode().name()); - if (desc.getGroupKeyNotReductionKey()) { - LOG.info("Reduce vector mode not supported when group key is not reduction key"); - return false; - } if (!aggregatorsOutputIsPrimitive(desc.getAggregators(), isReduce)) { LOG.info("Reduce vector mode only supported when aggregate outputs are primitive types"); return false; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 2d5e6cf..2466d78 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -230,7 +230,7 @@ private HashMap opToPartList; private HashMap> topOps; private final HashMap> topSelOps; - private LinkedHashMap, OpParseContext> opParseCtx; + private final LinkedHashMap, OpParseContext> opParseCtx; private List loadTableWork; private List loadFileWork; private final Map joinContext; @@ -4170,8 +4170,6 @@ private void processGroupingSetReduceSinkOperator(RowResolver reduceSinkInputRow * @param genericUDAFEvaluators * The mapping from Aggregation StringTree to the * genericUDAFEvaluator. - * @param distPartAggr - * partial aggregation for distincts * @param groupingSets * list of grouping sets * @param groupingSetsPresent @@ -4184,7 +4182,6 @@ private void processGroupingSetReduceSinkOperator(RowResolver reduceSinkInputRow private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo, GroupByDesc.Mode mode, Map genericUDAFEvaluators, - boolean distPartAgg, List groupingSets, boolean groupingSetsPresent, boolean groupingSetsNeedAdditionalMRJob) throws SemanticException { @@ -4283,8 +4280,7 @@ private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, // Otherwise, we look for b+c. // For distincts, partial aggregation is never performed on the client // side, so always look for the parameters: d+e - boolean partialAggDone = !(distPartAgg || isDistinct); - if (!partialAggDone) { + if (isDistinct) { // 0 is the function name for (int i = 1; i < value.getChildCount(); i++) { ASTNode paraExpr = (ASTNode) value.getChild(i); @@ -4303,7 +4299,6 @@ private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1); - } ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), @@ -4316,9 +4311,7 @@ private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, // this parameter is a constant expr = reduceValue; } - aggParameters.add(expr); - } } else { ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(value); @@ -4334,19 +4327,11 @@ private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, if (isDistinct) { numDistinctUDFs++; } - boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR; + Mode amode = groupByDescModeToUDAFMode(mode, isDistinct); GenericUDAFEvaluator genericUDAFEvaluator = null; - // For distincts, partial aggregations have not been done - if (distPartAgg) { - genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters, - value, isDistinct, isAllColumns); - assert (genericUDAFEvaluator != null); - genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator); - } else { - genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey()); - assert (genericUDAFEvaluator != null); - } + genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey()); + assert (genericUDAFEvaluator != null); GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters); @@ -4370,7 +4355,7 @@ private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, // additional rows corresponding to grouping sets need to be created here. Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild( new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, - distPartAgg, groupByMemoryUsage, memoryThreshold, + groupByMemoryUsage, memoryThreshold, groupingSets, groupingSetsPresent && groupingSetsNeedAdditionalMRJob, groupingSetsPosition, containsDistinctAggr), @@ -5259,64 +5244,6 @@ private Operator genGroupByPlan1ReduceMultiGBY(List dests, QB qb, Operat } /** - * Generate a Multi Group-By plan using a 2 map-reduce jobs. - * - * @param dest - * @param qb - * @param input - * @return - * @throws SemanticException - * - * Generate a Group-By plan using a 2 map-reduce jobs. Spray by the - * distinct key in hope of getting a uniform distribution, and - * compute partial aggregates by the grouping key. Evaluate partial - * aggregates first, and spray by the grouping key to compute actual - * aggregates in the second phase. The aggregation evaluation - * functions are as follows: Partitioning Key: distinct key - * - * Sorting Key: distinct key - * - * Reducer: iterate/terminatePartial (mode = PARTIAL1) - * - * STAGE 2 - * - * Partitioning Key: grouping key - * - * Sorting Key: grouping key - * - * Reducer: merge/terminate (mode = FINAL) - */ - @SuppressWarnings("nls") - private Operator genGroupByPlan2MRMultiGroupBy(String dest, QB qb, - Operator input) throws SemanticException { - - // ////// Generate GroupbyOperator for a map-side partial aggregation - Map genericUDAFEvaluators = - new LinkedHashMap(); - - QBParseInfo parseInfo = qb.getParseInfo(); - - // ////// 2. Generate GroupbyOperator - Operator groupByOperatorInfo = genGroupByPlanGroupByOperator1(parseInfo, - dest, input, GroupByDesc.Mode.HASH, genericUDAFEvaluators, true, - null, false, false); - - int numReducers = -1; - List grpByExprs = getGroupByForClause(parseInfo, dest); - - // ////// 3. Generate ReduceSinkOperator2 - Operator reduceSinkOperatorInfo2 = genGroupByPlanReduceSinkOperator2MR( - parseInfo, dest, groupByOperatorInfo, grpByExprs.size(), numReducers, false); - - // ////// 4. Generate GroupbyOperator2 - Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator2MR(parseInfo, - dest, reduceSinkOperatorInfo2, GroupByDesc.Mode.FINAL, - genericUDAFEvaluators, false); - - return groupByOperatorInfo2; - } - - /** * Generate a Group-By plan using a 2 map-reduce jobs (5 operators will be * inserted): * @@ -5640,7 +5567,7 @@ private Operator genGroupByPlanMapAggrNoSkew(String dest, QB qb, // on the reducer. return genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.MERGEPARTIAL, - genericUDAFEvaluators, false, + genericUDAFEvaluators, groupingSets, groupingSetsPresent, groupingSetsNeedAdditionalMRJob); } else @@ -5653,7 +5580,7 @@ private Operator genGroupByPlanMapAggrNoSkew(String dest, QB qb, Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.PARTIALS, - genericUDAFEvaluators, false, + genericUDAFEvaluators, groupingSets, groupingSetsPresent, groupingSetsNeedAdditionalMRJob); // ////// Generate ReduceSinkOperator2 @@ -5784,7 +5711,7 @@ private Operator genGroupByPlanMapAggr2MR(String dest, QB qb, // ////// Generate GroupbyOperator for a partial aggregation Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.PARTIALS, - genericUDAFEvaluators, false, + genericUDAFEvaluators, groupingSets, groupingSetsPresent, false); int numReducers = -1; @@ -8419,166 +8346,6 @@ private Operator insertSelectAllPlanForGroupBy(Operator input) return output; } - // Return the common distinct expression - // There should be more than 1 destination, with group bys in all of them. - private List getCommonDistinctExprs(QB qb, Operator input) { - QBParseInfo qbp = qb.getParseInfo(); - // If a grouping set aggregation is present, common processing is not possible - if (!qbp.getDestCubes().isEmpty() || !qbp.getDestRollups().isEmpty() - || !qbp.getDestToLateralView().isEmpty()) { - return null; - } - - RowResolver inputRR = opParseCtx.get(input).getRowResolver(); - TreeSet ks = new TreeSet(); - ks.addAll(qbp.getClauseNames()); - - // Go over all the destination tables - if (ks.size() <= 1) { - return null; - } - - List oldList = null; - List oldASTList = null; - - for (String dest : ks) { - // If a filter is present, common processing is not possible - if (qbp.getWhrForClause(dest) != null) { - return null; - } - - if (qbp.getAggregationExprsForClause(dest).size() == 0 - && getGroupByForClause(qbp, dest).size() == 0) { - return null; - } - - // All distinct expressions must be the same - List list = qbp.getDistinctFuncExprsForClause(dest); - if (list.isEmpty()) { - return null; - } - - List currDestList; - try { - currDestList = getDistinctExprs(qbp, dest, inputRR); - } catch (SemanticException e) { - return null; - } - - List currASTList = new ArrayList(); - for (ASTNode value : list) { - // 0 is function name - for (int i = 1; i < value.getChildCount(); i++) { - ASTNode parameter = (ASTNode) value.getChild(i); - currASTList.add(parameter); - } - if (oldList == null) { - oldList = currDestList; - oldASTList = currASTList; - } else { - if (!matchExprLists(oldList, currDestList)) { - return null; - } - } - } - } - - return oldASTList; - } - - private Operator createCommonReduceSink(QB qb, Operator input) - throws SemanticException { - // Go over all the tables and extract the common distinct key - List distExprs = getCommonDistinctExprs(qb, input); - - QBParseInfo qbp = qb.getParseInfo(); - TreeSet ks = new TreeSet(); - ks.addAll(qbp.getClauseNames()); - - // Pass the entire row - RowResolver inputRR = opParseCtx.get(input).getRowResolver(); - RowResolver reduceSinkOutputRowResolver = new RowResolver(); - reduceSinkOutputRowResolver.setIsExprResolver(true); - ArrayList reduceKeys = new ArrayList(); - ArrayList reduceValues = new ArrayList(); - Map colExprMap = new HashMap(); - - // Pre-compute distinct group-by keys and store in reduceKeys - - List outputColumnNames = new ArrayList(); - for (ASTNode distn : distExprs) { - ExprNodeDesc distExpr = genExprNodeDesc(distn, inputRR); - if (reduceSinkOutputRowResolver.getExpression(distn) == null) { - reduceKeys.add(distExpr); - outputColumnNames.add(getColumnInternalName(reduceKeys.size() - 1)); - String field = Utilities.ReduceField.KEY.toString() + "." - + getColumnInternalName(reduceKeys.size() - 1); - ColumnInfo colInfo = new ColumnInfo(field, reduceKeys.get( - reduceKeys.size() - 1).getTypeInfo(), "", false); - reduceSinkOutputRowResolver.putExpression(distn, colInfo); - colExprMap.put(colInfo.getInternalName(), distExpr); - } - } - - // Go over all the grouping keys and aggregations - for (String dest : ks) { - - List grpByExprs = getGroupByForClause(qbp, dest); - for (int i = 0; i < grpByExprs.size(); ++i) { - ASTNode grpbyExpr = grpByExprs.get(i); - - if (reduceSinkOutputRowResolver.getExpression(grpbyExpr) == null) { - ExprNodeDesc grpByExprNode = genExprNodeDesc(grpbyExpr, inputRR); - reduceValues.add(grpByExprNode); - String field = Utilities.ReduceField.VALUE.toString() + "." - + getColumnInternalName(reduceValues.size() - 1); - ColumnInfo colInfo = new ColumnInfo(field, reduceValues.get( - reduceValues.size() - 1).getTypeInfo(), "", false); - reduceSinkOutputRowResolver.putExpression(grpbyExpr, colInfo); - outputColumnNames.add(getColumnInternalName(reduceValues.size() - 1)); - colExprMap.put(field, grpByExprNode); - } - } - - // For each aggregation - HashMap aggregationTrees = qbp - .getAggregationExprsForClause(dest); - assert (aggregationTrees != null); - - for (Map.Entry entry : aggregationTrees.entrySet()) { - ASTNode value = entry.getValue(); - - // 0 is the function name - for (int i = 1; i < value.getChildCount(); i++) { - ASTNode paraExpr = (ASTNode) value.getChild(i); - - if (reduceSinkOutputRowResolver.getExpression(paraExpr) == null) { - ExprNodeDesc paraExprNode = genExprNodeDesc(paraExpr, inputRR); - reduceValues.add(paraExprNode); - String field = Utilities.ReduceField.VALUE.toString() + "." - + getColumnInternalName(reduceValues.size() - 1); - ColumnInfo colInfo = new ColumnInfo(field, reduceValues.get( - reduceValues.size() - 1).getTypeInfo(), "", false); - reduceSinkOutputRowResolver.putExpression(paraExpr, colInfo); - outputColumnNames - .add(getColumnInternalName(reduceValues.size() - 1)); - colExprMap.put(field, paraExprNode); - } - } - } - } - - ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap( - OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, - reduceValues, outputColumnNames, true, -1, reduceKeys.size(), -1, - AcidUtils.Operation.NOT_ACID), - new RowSchema(reduceSinkOutputRowResolver.getColumnInfos()), input), - reduceSinkOutputRowResolver); - - rsOp.setColumnExprMap(colExprMap); - return rsOp; - } - // Groups the clause names into lists so that any two clauses in the same list has the same // group by and distinct keys and no clause appears in more than one list. Returns a list of the // lists of clauses. @@ -8756,157 +8523,114 @@ private Operator genBodyPlan(QB qb, Operator input, Map aliasT TreeSet ks = new TreeSet(qbp.getClauseNames()); Map> inputs = createInputForDests(qb, input, ks); - // For multi-group by with the same distinct, we ignore all user hints - // currently. It doesnt matter whether he has asked to do - // map-side aggregation or not. Map side aggregation is turned off - List commonDistinctExprs = getCommonDistinctExprs(qb, input); - - // Consider a query like: - // - // from src - // insert overwrite table dest1 select col1, count(distinct colx) group by col1 - // insert overwrite table dest2 select col2, count(distinct colx) group by col2; - // - // With HIVE_OPTIMIZE_MULTI_GROUPBY_COMMON_DISTINCTS set to true, first we spray by the distinct - // value (colx), and then perform the 2 groups bys. This makes sense if map-side aggregation is - // turned off. However, with maps-side aggregation, it might be useful in some cases to treat - // the 2 inserts independently, thereby performing the query above in 2MR jobs instead of 3 - // (due to spraying by distinct key first). - boolean optimizeMultiGroupBy = commonDistinctExprs != null && - conf.getBoolVar(HiveConf.ConfVars.HIVE_OPTIMIZE_MULTI_GROUPBY_COMMON_DISTINCTS); Operator curr = input; - // if there is a single distinct, optimize that. Spray initially by the - // distinct key, - // no computation at the mapper. Have multiple group by operators at the - // reducer - and then - // proceed - if (optimizeMultiGroupBy) { - curr = createCommonReduceSink(qb, input); - - RowResolver currRR = opParseCtx.get(curr).getRowResolver(); - // create a forward operator - input = putOpInsertMap(OperatorFactory.getAndMakeChild(new ForwardDesc(), - new RowSchema(currRR.getColumnInfos()), curr), currRR); - - for (String dest : ks) { - curr = input; - curr = genGroupByPlan2MRMultiGroupBy(dest, qb, curr); - curr = genSelectPlan(dest, qb, curr, null); // TODO: we may need to pass "input" here instead of null - Integer limit = qbp.getDestLimit(dest); - if (limit != null) { - curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), true); - qb.getParseInfo().setOuterQueryLimit(limit.intValue()); - } - curr = genFileSinkPlan(dest, qb, curr); - } - } else { - List> commonGroupByDestGroups = null; + List> commonGroupByDestGroups = null; - // If we can put multiple group bys in a single reducer, determine suitable groups of - // expressions, otherwise treat all the expressions as a single group - if (conf.getBoolVar(HiveConf.ConfVars.HIVEMULTIGROUPBYSINGLEREDUCER)) { - try { - commonGroupByDestGroups = getCommonGroupByDestGroups(qb, inputs); - } catch (SemanticException e) { - LOG.error("Failed to group clauses by common spray keys.", e); - } + // If we can put multiple group bys in a single reducer, determine suitable groups of + // expressions, otherwise treat all the expressions as a single group + if (conf.getBoolVar(HiveConf.ConfVars.HIVEMULTIGROUPBYSINGLEREDUCER)) { + try { + commonGroupByDestGroups = getCommonGroupByDestGroups(qb, inputs); + } catch (SemanticException e) { + LOG.error("Failed to group clauses by common spray keys.", e); } + } - if (commonGroupByDestGroups == null) { - commonGroupByDestGroups = new ArrayList>(); - commonGroupByDestGroups.add(new ArrayList(ks)); - } + if (commonGroupByDestGroups == null) { + commonGroupByDestGroups = new ArrayList>(); + commonGroupByDestGroups.add(new ArrayList(ks)); + } - if (!commonGroupByDestGroups.isEmpty()) { + if (!commonGroupByDestGroups.isEmpty()) { - // Iterate over each group of subqueries with the same group by/distinct keys - for (List commonGroupByDestGroup : commonGroupByDestGroups) { - if (commonGroupByDestGroup.isEmpty()) { - continue; - } + // Iterate over each group of subqueries with the same group by/distinct keys + for (List commonGroupByDestGroup : commonGroupByDestGroups) { + if (commonGroupByDestGroup.isEmpty()) { + continue; + } - String firstDest = commonGroupByDestGroup.get(0); - input = inputs.get(firstDest); - - // Constructs a standard group by plan if: - // There is no other subquery with the same group by/distinct keys or - // (There are no aggregations in a representative query for the group and - // There is no group by in that representative query) or - // The data is skewed or - // The conf variable used to control combining group bys into a single reducer is false - if (commonGroupByDestGroup.size() == 1 || - (qbp.getAggregationExprsForClause(firstDest).size() == 0 && - getGroupByForClause(qbp, firstDest).size() == 0) || - conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) || - !conf.getBoolVar(HiveConf.ConfVars.HIVEMULTIGROUPBYSINGLEREDUCER)) { - - // Go over all the destination tables - for (String dest : commonGroupByDestGroup) { - curr = inputs.get(dest); - - if (qbp.getWhrForClause(dest) != null) { - ASTNode whereExpr = qb.getParseInfo().getWhrForClause(dest); - curr = genFilterPlan((ASTNode) whereExpr.getChild(0), qb, curr, aliasToOpInfo, false); + String firstDest = commonGroupByDestGroup.get(0); + input = inputs.get(firstDest); + + // Constructs a standard group by plan if: + // There is no other subquery with the same group by/distinct keys or + // (There are no aggregations in a representative query for the group and + // There is no group by in that representative query) or + // The data is skewed or + // The conf variable used to control combining group bys into a single reducer is false + if (commonGroupByDestGroup.size() == 1 || + (qbp.getAggregationExprsForClause(firstDest).size() == 0 && + getGroupByForClause(qbp, firstDest).size() == 0) || + conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) || + !conf.getBoolVar(HiveConf.ConfVars.HIVEMULTIGROUPBYSINGLEREDUCER)) { + + // Go over all the destination tables + for (String dest : commonGroupByDestGroup) { + curr = inputs.get(dest); + + if (qbp.getWhrForClause(dest) != null) { + ASTNode whereExpr = qb.getParseInfo().getWhrForClause(dest); + curr = genFilterPlan((ASTNode) whereExpr.getChild(0), qb, curr, aliasToOpInfo, false); + } + // Preserve operator before the GBY - we'll use it to resolve '*' + Operator gbySource = curr; + + if (qbp.getAggregationExprsForClause(dest).size() != 0 + || getGroupByForClause(qbp, dest).size() > 0) { + // multiple distincts is not supported with skew in data + if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) && + qbp.getDistinctFuncExprsForClause(dest).size() > 1) { + throw new SemanticException(ErrorMsg.UNSUPPORTED_MULTIPLE_DISTINCTS. + getMsg()); } - // Preserve operator before the GBY - we'll use it to resolve '*' - Operator gbySource = curr; - - if (qbp.getAggregationExprsForClause(dest).size() != 0 - || getGroupByForClause(qbp, dest).size() > 0) { - // multiple distincts is not supported with skew in data - if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) && - qbp.getDistinctFuncExprsForClause(dest).size() > 1) { - throw new SemanticException(ErrorMsg.UNSUPPORTED_MULTIPLE_DISTINCTS. - getMsg()); + // insert a select operator here used by the ColumnPruner to reduce + // the data to shuffle + curr = insertSelectAllPlanForGroupBy(curr); + // Check and transform group by *. This will only happen for select distinct *. + // Here the "genSelectPlan" is being leveraged. + // The main benefits are (1) remove virtual columns that should + // not be included in the group by; (2) add the fully qualified column names to unParseTranslator + // so that view is supported. The drawback is that an additional SEL op is added. If it is + // not necessary, it will be removed by NonBlockingOpDeDupProc Optimizer because it will match + // SEL%SEL% rule. + ASTNode selExprList = qbp.getSelForClause(dest); + if (selExprList.getToken().getType() == HiveParser.TOK_SELECTDI + && selExprList.getChildCount() == 1 && selExprList.getChild(0).getChildCount() == 1) { + ASTNode node = (ASTNode) selExprList.getChild(0).getChild(0); + if (node.getToken().getType() == HiveParser.TOK_ALLCOLREF) { + curr = genSelectPlan(dest, qb, curr, curr); + RowResolver rr = opParseCtx.get(curr).getRowResolver(); + qbp.setSelExprForClause(dest, SemanticAnalyzer.genSelectDIAST(rr)); } - // insert a select operator here used by the ColumnPruner to reduce - // the data to shuffle - curr = insertSelectAllPlanForGroupBy(curr); - // Check and transform group by *. This will only happen for select distinct *. - // Here the "genSelectPlan" is being leveraged. - // The main benefits are (1) remove virtual columns that should - // not be included in the group by; (2) add the fully qualified column names to unParseTranslator - // so that view is supported. The drawback is that an additional SEL op is added. If it is - // not necessary, it will be removed by NonBlockingOpDeDupProc Optimizer because it will match - // SEL%SEL% rule. - ASTNode selExprList = qbp.getSelForClause(dest); - if (selExprList.getToken().getType() == HiveParser.TOK_SELECTDI - && selExprList.getChildCount() == 1 && selExprList.getChild(0).getChildCount() == 1) { - ASTNode node = (ASTNode) selExprList.getChild(0).getChild(0); - if (node.getToken().getType() == HiveParser.TOK_ALLCOLREF) { - curr = genSelectPlan(dest, qb, curr, curr); - RowResolver rr = opParseCtx.get(curr).getRowResolver(); - qbp.setSelExprForClause(dest, SemanticAnalyzer.genSelectDIAST(rr)); - } - } - if (conf.getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) { - if (!conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { - curr = genGroupByPlanMapAggrNoSkew(dest, qb, curr); - } else { - curr = genGroupByPlanMapAggr2MR(dest, qb, curr); - } - } else if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { - curr = genGroupByPlan2MR(dest, qb, curr); + } + if (conf.getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) { + if (!conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { + curr = genGroupByPlanMapAggrNoSkew(dest, qb, curr); } else { - curr = genGroupByPlan1MR(dest, qb, curr); + curr = genGroupByPlanMapAggr2MR(dest, qb, curr); } + } else if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { + curr = genGroupByPlan2MR(dest, qb, curr); + } else { + curr = genGroupByPlan1MR(dest, qb, curr); } - if (LOG.isDebugEnabled()) { - LOG.debug("RR before GB " + opParseCtx.get(gbySource).getRowResolver() - + " after GB " + opParseCtx.get(curr).getRowResolver()); - } - - curr = genPostGroupByBodyPlan(curr, dest, qb, aliasToOpInfo, gbySource); } - } else { - curr = genGroupByPlan1ReduceMultiGBY(commonGroupByDestGroup, qb, input, aliasToOpInfo); + if (LOG.isDebugEnabled()) { + LOG.debug("RR before GB " + opParseCtx.get(gbySource).getRowResolver() + + " after GB " + opParseCtx.get(curr).getRowResolver()); + } + + curr = genPostGroupByBodyPlan(curr, dest, qb, aliasToOpInfo, gbySource); } + } else { + curr = genGroupByPlan1ReduceMultiGBY(commonGroupByDestGroup, qb, input, aliasToOpInfo); } } } + if (LOG.isDebugEnabled()) { LOG.debug("Created Body Plan for Query Block " + qb.getId()); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java index 8804258..d6aad9f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java @@ -54,7 +54,6 @@ }; private Mode mode; - private boolean groupKeyNotReductionKey; // no hash aggregations for group by private boolean bucketGroup; @@ -82,14 +81,13 @@ public GroupByDesc( final ArrayList outputColumnNames, final ArrayList keys, final ArrayList aggregators, - final boolean groupKeyNotReductionKey, final float groupByMemoryUsage, final float memoryThreshold, final List listGroupingSets, final boolean groupingSetsPresent, final int groupingSetsPosition, final boolean isDistinct) { - this(mode, outputColumnNames, keys, aggregators, groupKeyNotReductionKey, + this(mode, outputColumnNames, keys, aggregators, false, groupByMemoryUsage, memoryThreshold, listGroupingSets, groupingSetsPresent, groupingSetsPosition, isDistinct); } @@ -99,7 +97,6 @@ public GroupByDesc( final ArrayList outputColumnNames, final ArrayList keys, final ArrayList aggregators, - final boolean groupKeyNotReductionKey, final boolean bucketGroup, final float groupByMemoryUsage, final float memoryThreshold, @@ -112,7 +109,6 @@ public GroupByDesc( this.outputColumnNames = outputColumnNames; this.keys = keys; this.aggregators = aggregators; - this.groupKeyNotReductionKey = groupKeyNotReductionKey; this.bucketGroup = bucketGroup; this.groupByMemoryUsage = groupByMemoryUsage; this.memoryThreshold = memoryThreshold; @@ -180,7 +176,7 @@ public void setKeys(final ArrayList keys) { @Explain(displayName = "pruneGroupingSetId", displayOnlyOnTrue = true) public boolean pruneGroupingSetId() { - return groupingSetPosition >= 0 && + return groupingSetPosition >= 0 && outputColumnNames.size() != keys.size() + aggregators.size(); } @@ -229,15 +225,7 @@ public boolean isAggregate() { } return false; } - - public boolean getGroupKeyNotReductionKey() { - return groupKeyNotReductionKey; - } - - public void setGroupKeyNotReductionKey(final boolean groupKeyNotReductionKey) { - this.groupKeyNotReductionKey = groupKeyNotReductionKey; - } - + @Explain(displayName = "bucketGroup", displayOnlyOnTrue = true) public boolean getBucketGroup() { return bucketGroup; diff --git a/ql/src/test/queries/clientpositive/groupby10.q b/ql/src/test/queries/clientpositive/groupby10.q index 8b1fb52..5e78831 100644 --- a/ql/src/test/queries/clientpositive/groupby10.q +++ b/ql/src/test/queries/clientpositive/groupby10.q @@ -36,7 +36,9 @@ INSERT OVERWRITE TABLE dest2 SELECT INPUT.key, sum(substr(INPUT.value,5)), sum(d SELECT * from dest1; SELECT * from dest2; +set hive.groupby.skewindata=false; -- HIVE-3852 Multi-groupby optimization fails when same distinct column is used twice or more + EXPLAIN FROM INPUT INSERT OVERWRITE TABLE dest1 SELECT INPUT.key, sum(distinct substr(INPUT.value,5)), count(distinct substr(INPUT.value,5)) GROUP BY INPUT.key diff --git a/ql/src/test/queries/clientpositive/groupby_multi_insert_common_distinct.q b/ql/src/test/queries/clientpositive/groupby_multi_insert_common_distinct.q index b009a8b..c877814 100644 --- a/ql/src/test/queries/clientpositive/groupby_multi_insert_common_distinct.q +++ b/ql/src/test/queries/clientpositive/groupby_multi_insert_common_distinct.q @@ -18,17 +18,3 @@ insert overwrite table dest2 select key+key, count(distinct value) group by key+ select * from dest1 where key < 10; select * from dest2 where key < 20 order by key limit 10; -set hive.optimize.multigroupby.common.distincts=false; - --- no need to spray by distinct key first -explain -from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key; - -from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key; - -select * from dest1 where key < 10; -select * from dest2 where key < 20 order by key limit 10; diff --git a/ql/src/test/results/clientpositive/groupby10.q.out b/ql/src/test/results/clientpositive/groupby10.q.out index a575089..15969ad 100644 --- a/ql/src/test/results/clientpositive/groupby10.q.out +++ b/ql/src/test/results/clientpositive/groupby10.q.out @@ -46,12 +46,11 @@ INSERT OVERWRITE TABLE dest2 SELECT INPUT.key, sum(substr(INPUT.value,5)), sum(d POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -60,55 +59,30 @@ STAGE PLANS: TableScan alias: input Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: int) - Reduce Operator Tree: - Forward - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: sum(KEY._col0), sum(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -134,24 +108,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: double) Reduce Operator Tree: Group By Operator - aggregations: sum(VALUE._col0), sum(VALUE._col1) + aggregations: sum(KEY._col1:0._col0), sum(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -177,7 +150,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM INPUT @@ -268,12 +241,11 @@ INSERT OVERWRITE TABLE dest2 SELECT INPUT.key, sum(substr(INPUT.value,5)), sum(d POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -282,55 +254,30 @@ STAGE PLANS: TableScan alias: input Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: int) - Reduce Operator Tree: - Forward - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: sum(KEY._col0), sum(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -356,24 +303,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: double) Reduce Operator Tree: Group By Operator - aggregations: sum(VALUE._col0), sum(VALUE._col1) + aggregations: sum(KEY._col1:0._col0), sum(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -399,7 +345,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM INPUT @@ -479,12 +425,14 @@ POSTHOOK: Input: default@dest2 86 86 86 98 98 98 PREHOOK: query: -- HIVE-3852 Multi-groupby optimization fails when same distinct column is used twice or more + EXPLAIN FROM INPUT INSERT OVERWRITE TABLE dest1 SELECT INPUT.key, sum(distinct substr(INPUT.value,5)), count(distinct substr(INPUT.value,5)) GROUP BY INPUT.key INSERT OVERWRITE TABLE dest2 SELECT INPUT.key, sum(distinct substr(INPUT.value,5)), avg(distinct substr(INPUT.value,5)) GROUP BY INPUT.key PREHOOK: type: QUERY POSTHOOK: query: -- HIVE-3852 Multi-groupby optimization fails when same distinct column is used twice or more + EXPLAIN FROM INPUT INSERT OVERWRITE TABLE dest1 SELECT INPUT.key, sum(distinct substr(INPUT.value,5)), count(distinct substr(INPUT.value,5)) GROUP BY INPUT.key @@ -492,12 +440,10 @@ INSERT OVERWRITE TABLE dest2 SELECT INPUT.key, sum(distinct substr(INPUT.value,5 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -506,69 +452,54 @@ STAGE PLANS: TableScan alias: input Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: int) + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Forward Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: sum(DISTINCT KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash + aggregations: sum(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0) + keys: KEY._col0 (type: int) + mode: complete outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 Group By Operator - aggregations: sum(DISTINCT KEY._col0), avg(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: bigint) - Reduce Operator Tree: - Group By Operator - aggregations: sum(VALUE._col0), count(VALUE._col1) - keys: KEY._col0 (type: int) - mode: final - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) + aggregations: sum(DISTINCT KEY._col1:0._col0), avg(DISTINCT KEY._col1:1._col0) + keys: KEY._col0 (type: int) + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 Stage: Stage-0 Move Operator @@ -580,39 +511,9 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: struct) - Reduce Operator Tree: - Group By Operator - aggregations: sum(VALUE._col0), avg(VALUE._col1) - keys: KEY._col0 (type: int) - mode: final - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - Stage: Stage-1 Move Operator tables: @@ -623,7 +524,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-4 Stats-Aggr Operator PREHOOK: query: FROM INPUT @@ -642,10 +543,10 @@ POSTHOOK: Output: default@dest1 POSTHOOK: Output: default@dest2 POSTHOOK: Lineage: dest1.key SIMPLE [(input)input.FieldSchema(name:key, type:int, comment:null), ] POSTHOOK: Lineage: dest1.val1 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] -POSTHOOK: Lineage: dest1.val2 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest1.val2 EXPRESSION [(input)input.null, ] POSTHOOK: Lineage: dest2.key SIMPLE [(input)input.FieldSchema(name:key, type:int, comment:null), ] POSTHOOK: Lineage: dest2.val1 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] -POSTHOOK: Lineage: dest2.val2 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val2 EXPRESSION [(input)input.null, ] PREHOOK: query: SELECT * from dest1 PREHOOK: type: QUERY PREHOOK: Input: default@dest1 diff --git a/ql/src/test/results/clientpositive/groupby11.q.out b/ql/src/test/results/clientpositive/groupby11.q.out index 322140e..c55ebbf 100644 --- a/ql/src/test/results/clientpositive/groupby11.q.out +++ b/ql/src/test/results/clientpositive/groupby11.q.out @@ -34,12 +34,11 @@ INSERT OVERWRITE TABLE dest2 partition(ds='111') POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -48,55 +47,30 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string), substr(value, 5) (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Reduce Output Operator + key expressions: value (type: string), key (type: string) + sort order: ++ + Map-reduce partition columns: value (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -124,24 +98,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + key expressions: substr(value, 5) (type: string), key (type: string) + sort order: ++ + Map-reduce partition columns: substr(value, 5) (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -169,7 +142,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM src diff --git a/ql/src/test/results/clientpositive/groupby8.q.out b/ql/src/test/results/clientpositive/groupby8.q.out index 68d6e47..ea34021 100644 --- a/ql/src/test/results/clientpositive/groupby8.q.out +++ b/ql/src/test/results/clientpositive/groupby8.q.out @@ -30,12 +30,11 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, COUNT(DISTINCT SUBSTR(SRC.value,5)) POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -44,55 +43,30 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -118,24 +92,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -161,7 +134,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM SRC @@ -828,12 +801,11 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, COUNT(DISTINCT SUBSTR(SRC.value,5)) POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -842,55 +814,30 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -916,24 +863,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -959,7 +905,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM SRC diff --git a/ql/src/test/results/clientpositive/groupby8_map.q.out b/ql/src/test/results/clientpositive/groupby8_map.q.out index 37301f7..db472e9 100644 --- a/ql/src/test/results/clientpositive/groupby8_map.q.out +++ b/ql/src/test/results/clientpositive/groupby8_map.q.out @@ -30,12 +30,10 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, COUNT(DISTINCT SUBSTR(SRC.value,5)) POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -44,69 +42,54 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Forward Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 Stage: Stage-0 Move Operator @@ -118,39 +101,9 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - Stage: Stage-1 Move Operator tables: @@ -161,7 +114,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-4 Stats-Aggr Operator PREHOOK: query: FROM SRC diff --git a/ql/src/test/results/clientpositive/groupby8_map_skew.q.out b/ql/src/test/results/clientpositive/groupby8_map_skew.q.out index 37301f7..5160158 100644 --- a/ql/src/test/results/clientpositive/groupby8_map_skew.q.out +++ b/ql/src/test/results/clientpositive/groupby8_map_skew.q.out @@ -30,12 +30,11 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, COUNT(DISTINCT SUBSTR(SRC.value,5)) POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -44,55 +43,42 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -118,24 +104,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -161,7 +146,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM SRC diff --git a/ql/src/test/results/clientpositive/groupby8_noskew.q.out b/ql/src/test/results/clientpositive/groupby8_noskew.q.out index 37301f7..db472e9 100644 --- a/ql/src/test/results/clientpositive/groupby8_noskew.q.out +++ b/ql/src/test/results/clientpositive/groupby8_noskew.q.out @@ -30,12 +30,10 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, COUNT(DISTINCT SUBSTR(SRC.value,5)) POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-2 + Stage-4 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -44,69 +42,54 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Forward Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 Stage: Stage-0 Move Operator @@ -118,39 +101,9 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - Stage: Stage-1 Move Operator tables: @@ -161,7 +114,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-4 Stats-Aggr Operator PREHOOK: query: FROM SRC diff --git a/ql/src/test/results/clientpositive/groupby9.q.out b/ql/src/test/results/clientpositive/groupby9.q.out index e96a22f..9c5b3b3 100644 --- a/ql/src/test/results/clientpositive/groupby9.q.out +++ b/ql/src/test/results/clientpositive/groupby9.q.out @@ -30,12 +30,11 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, SRC.value, COUNT(DISTINCT SUBSTR(SR POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -44,55 +43,42 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string), VALUE._col1 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), value (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -118,24 +104,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -161,7 +146,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM SRC @@ -829,12 +814,11 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, SRC.value, COUNT(DISTINCT SUBSTR(SR POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -843,55 +827,42 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: string), VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: value (type: string), key (type: string) + outputColumnNames: value, key Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: value (type: string), key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -917,24 +888,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -960,7 +930,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM SRC @@ -1628,12 +1598,11 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, SRC.value, COUNT(DISTINCT SUBSTR(SR POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -1642,55 +1611,42 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string), VALUE._col1 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), value (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -1716,24 +1672,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -1759,7 +1714,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM SRC @@ -3213,12 +3168,11 @@ INSERT OVERWRITE TABLE DEST2 SELECT SRC.key, SRC.value, COUNT(DISTINCT SUBSTR(SR POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-2 + Stage-1 depends on stages: Stage-4 + Stage-5 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -3227,55 +3181,42 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: string), VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: value (type: string), key (type: string) + outputColumnNames: value, key Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: value (type: string), key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -3301,24 +3242,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-4 + Stage: Stage-3 Stats-Aggr Operator - Stage: Stage-5 + Stage: Stage-4 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -3344,7 +3284,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-6 + Stage: Stage-5 Stats-Aggr Operator PREHOOK: query: FROM SRC diff --git a/ql/src/test/results/clientpositive/groupby_multi_insert_common_distinct.q.out b/ql/src/test/results/clientpositive/groupby_multi_insert_common_distinct.q.out index c6b5edc..c78cfe5 100644 --- a/ql/src/test/results/clientpositive/groupby_multi_insert_common_distinct.q.out +++ b/ql/src/test/results/clientpositive/groupby_multi_insert_common_distinct.q.out @@ -30,200 +30,6 @@ insert overwrite table dest2 select key+key, count(distinct value) group by key+ POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-3 - Stage-4 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-5 - Stage-6 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - alias: src - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: value (type: string) - sort order: + - Map-reduce partition columns: value (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), (key + key) (type: double) - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: double) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-3 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), UDFToInteger(_col1) (type: int) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 - - Stage: Stage-0 - Move Operator - tables: - replace: true - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 - - Stage: Stage-4 - Stats-Aggr Operator - - Stage: Stage-5 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: double) - sort order: + - Map-reduce partition columns: _col0 (type: double) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: double) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), UDFToInteger(_col1) (type: int) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - - Stage: Stage-1 - Move Operator - tables: - replace: true - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - - Stage: Stage-6 - Stats-Aggr Operator - -PREHOOK: query: from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@dest1 -PREHOOK: Output: default@dest2 -POSTHOOK: query: from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@dest1 -POSTHOOK: Output: default@dest2 -POSTHOOK: Lineage: dest1.cnt EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] -POSTHOOK: Lineage: dest1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: dest2.cnt EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] -POSTHOOK: Lineage: dest2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -PREHOOK: query: select * from dest1 where key < 10 -PREHOOK: type: QUERY -PREHOOK: Input: default@dest1 -#### A masked pattern was here #### -POSTHOOK: query: select * from dest1 where key < 10 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@dest1 -#### A masked pattern was here #### -0 1 -2 1 -4 1 -5 1 -8 1 -9 1 -PREHOOK: query: select * from dest2 where key < 20 order by key limit 10 -PREHOOK: type: QUERY -PREHOOK: Input: default@dest2 -#### A masked pattern was here #### -POSTHOOK: query: select * from dest2 where key < 20 order by key limit 10 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@dest2 -#### A masked pattern was here #### -0 1 -10 1 -16 1 -18 1 -4 1 -8 1 -PREHOOK: query: -- no need to spray by distinct key first -explain -from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -PREHOOK: type: QUERY -POSTHOOK: query: -- no need to spray by distinct key first -explain -from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -POSTHOOK: type: QUERY -STAGE DEPENDENCIES: - Stage-2 is a root stage Stage-0 depends on stages: Stage-2 Stage-3 depends on stages: Stage-0 Stage-4 depends on stages: Stage-2 diff --git a/ql/src/test/results/clientpositive/spark/groupby10.q.out b/ql/src/test/results/clientpositive/spark/groupby10.q.out index 2bae6ae..d7b3abe 100644 --- a/ql/src/test/results/clientpositive/spark/groupby10.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby10.q.out @@ -55,29 +55,44 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: input Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: int) - Reducer 3 + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: input + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -92,12 +107,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: sum(VALUE._col0), sum(VALUE._col1) + aggregations: sum(KEY._col1:0._col0), sum(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -112,38 +127,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: sum(KEY._col0), sum(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: double) Stage: Stage-0 Move Operator @@ -268,29 +251,44 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: input Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: int) - Reducer 3 + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: input + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -305,12 +303,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: sum(VALUE._col0), sum(VALUE._col1) + aggregations: sum(KEY._col1:0._col0), sum(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -325,38 +323,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: sum(KEY._col0), sum(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: double) Stage: Stage-0 Move Operator @@ -461,12 +427,14 @@ POSTHOOK: Input: default@dest2 86 86 86 98 98 98 PREHOOK: query: -- HIVE-3852 Multi-groupby optimization fails when same distinct column is used twice or more + EXPLAIN FROM INPUT INSERT OVERWRITE TABLE dest1 SELECT INPUT.key, sum(distinct substr(INPUT.value,5)), count(distinct substr(INPUT.value,5)) GROUP BY INPUT.key INSERT OVERWRITE TABLE dest2 SELECT INPUT.key, sum(distinct substr(INPUT.value,5)), avg(distinct substr(INPUT.value,5)) GROUP BY INPUT.key PREHOOK: type: QUERY POSTHOOK: query: -- HIVE-3852 Multi-groupby optimization fails when same distinct column is used twice or more + EXPLAIN FROM INPUT INSERT OVERWRITE TABLE dest1 SELECT INPUT.key, sum(distinct substr(INPUT.value,5)), count(distinct substr(INPUT.value,5)) GROUP BY INPUT.key @@ -483,10 +451,7 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: Map 1 @@ -494,84 +459,55 @@ STAGE PLANS: TableScan alias: input Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: int) - Reducer 3 - Reduce Operator Tree: - Group By Operator - aggregations: sum(VALUE._col0), count(VALUE._col1) - keys: KEY._col0 (type: int) - mode: final - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 - Reducer 4 - Reduce Operator Tree: - Group By Operator - aggregations: sum(VALUE._col0), avg(VALUE._col1) - keys: KEY._col0 (type: int) - mode: final - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - Reducer 5 + Reduce Output Operator + key expressions: key (type: int), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int) + Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Forward Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: sum(DISTINCT KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash + aggregations: sum(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0) + keys: KEY._col0 (type: int) + mode: complete outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 Group By Operator - aggregations: sum(DISTINCT KEY._col0), avg(DISTINCT KEY._col0) - keys: VALUE._col0 (type: int) - mode: hash + aggregations: sum(DISTINCT KEY._col1:0._col0), avg(DISTINCT KEY._col1:1._col0) + keys: KEY._col0 (type: int) + mode: complete outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 2 Data size: 280 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: double), _col2 (type: struct) + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), UDFToInteger(_col1) (type: int), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 140 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 Stage: Stage-0 Move Operator @@ -615,10 +551,10 @@ POSTHOOK: Output: default@dest1 POSTHOOK: Output: default@dest2 POSTHOOK: Lineage: dest1.key SIMPLE [(input)input.FieldSchema(name:key, type:int, comment:null), ] POSTHOOK: Lineage: dest1.val1 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] -POSTHOOK: Lineage: dest1.val2 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest1.val2 EXPRESSION [(input)input.null, ] POSTHOOK: Lineage: dest2.key SIMPLE [(input)input.FieldSchema(name:key, type:int, comment:null), ] POSTHOOK: Lineage: dest2.val1 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] -POSTHOOK: Lineage: dest2.val2 EXPRESSION [(input)input.FieldSchema(name:value, type:string, comment:null), ] +POSTHOOK: Lineage: dest2.val2 EXPRESSION [(input)input.null, ] PREHOOK: query: SELECT * from dest1 PREHOOK: type: QUERY PREHOOK: Input: default@dest1 diff --git a/ql/src/test/results/clientpositive/spark/groupby11.q.out b/ql/src/test/results/clientpositive/spark/groupby11.q.out index 378c166..ccacda4 100644 --- a/ql/src/test/results/clientpositive/spark/groupby11.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby11.q.out @@ -43,29 +43,44 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string), substr(value, 5) (type: string) - Reducer 3 + Reduce Output Operator + key expressions: value (type: string), key (type: string) + sort order: ++ + Map-reduce partition columns: value (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: substr(value, 5) (type: string), key (type: string) + sort order: ++ + Map-reduce partition columns: substr(value, 5) (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -80,12 +95,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(VALUE._col1) + aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -100,38 +115,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(KEY._col0), count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint), _col2 (type: bigint) Stage: Stage-0 Move Operator diff --git a/ql/src/test/results/clientpositive/spark/groupby8.q.out b/ql/src/test/results/clientpositive/spark/groupby8.q.out index 53ecd96..307395f 100644 --- a/ql/src/test/results/clientpositive/spark/groupby8.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby8.q.out @@ -39,29 +39,44 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reducer 3 + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -76,12 +91,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -96,38 +111,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) Stage: Stage-0 Move Operator @@ -828,29 +811,44 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reducer 3 + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -865,12 +863,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -885,38 +883,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) Stage: Stage-0 Move Operator diff --git a/ql/src/test/results/clientpositive/spark/groupby8_map.q.out b/ql/src/test/results/clientpositive/spark/groupby8_map.q.out index 0517b72..a1de269 100644 --- a/ql/src/test/results/clientpositive/spark/groupby8_map.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby8_map.q.out @@ -39,10 +39,7 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 31) - Reducer 6 <- Map 1 (SORT, 31) - Reducer 3 <- Reducer 5 (GROUP, 31) - Reducer 4 <- Reducer 6 (GROUP, 31) + Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 31) #### A masked pattern was here #### Vertices: Map 1 @@ -50,84 +47,55 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reducer 3 - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 - Reducer 4 - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - Reducer 5 + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Forward Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 Stage: Stage-0 Move Operator diff --git a/ql/src/test/results/clientpositive/spark/groupby8_map_skew.q.out b/ql/src/test/results/clientpositive/spark/groupby8_map_skew.q.out index 0517b72..ba04a57 100644 --- a/ql/src/test/results/clientpositive/spark/groupby8_map_skew.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby8_map_skew.q.out @@ -39,29 +39,56 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 31) - Reducer 6 <- Map 1 (SORT, 31) - Reducer 3 <- Reducer 5 (GROUP, 31) - Reducer 4 <- Reducer 6 (GROUP, 31) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 31) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 31) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reducer 3 + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -76,12 +103,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -96,38 +123,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) Stage: Stage-0 Move Operator diff --git a/ql/src/test/results/clientpositive/spark/groupby8_noskew.q.out b/ql/src/test/results/clientpositive/spark/groupby8_noskew.q.out index 0517b72..a1de269 100644 --- a/ql/src/test/results/clientpositive/spark/groupby8_noskew.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby8_noskew.q.out @@ -39,10 +39,7 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 31) - Reducer 6 <- Map 1 (SORT, 31) - Reducer 3 <- Reducer 5 (GROUP, 31) - Reducer 4 <- Reducer 6 (GROUP, 31) + Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 31) #### A masked pattern was here #### Vertices: Map 1 @@ -50,84 +47,55 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Reducer 3 - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 - Reducer 4 - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - Reducer 5 + Reduce Output Operator + key expressions: key (type: string), substr(value, 5) (type: string) + sort order: ++ + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Forward Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash + aggregations: count(DISTINCT KEY._col1:0._col0) + keys: KEY._col0 (type: string) + mode: complete outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest2 Stage: Stage-0 Move Operator diff --git a/ql/src/test/results/clientpositive/spark/groupby9.q.out b/ql/src/test/results/clientpositive/spark/groupby9.q.out index 9b5095a..b24afa3 100644 --- a/ql/src/test/results/clientpositive/spark/groupby9.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby9.q.out @@ -39,29 +39,56 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reducer 3 + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), value (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -76,12 +103,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -96,38 +123,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string), VALUE._col1 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ - Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Stage: Stage-0 Move Operator @@ -829,29 +824,56 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reducer 3 + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: value (type: string), key (type: string) + outputColumnNames: value, key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: value (type: string), key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -866,12 +888,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -886,38 +908,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: string), VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ - Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Stage: Stage-0 Move Operator @@ -1619,29 +1609,56 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reducer 3 + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), value (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -1656,12 +1673,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -1676,38 +1693,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string), VALUE._col1 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ - Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Stage: Stage-0 Move Operator @@ -3196,29 +3181,56 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) + Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: substr(value, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(value, 5) (type: string) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) - Reducer 3 + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map 5 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: value (type: string), key (type: string) + outputColumnNames: value, key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(DISTINCT substr(value, 5)) + keys: value (type: string), key (type: string), substr(value, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -3233,12 +3245,12 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -3253,38 +3265,6 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: string), VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ - Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col2 (type: bigint) Stage: Stage-0 Move Operator diff --git a/ql/src/test/results/clientpositive/spark/groupby_multi_insert_common_distinct.q.out b/ql/src/test/results/clientpositive/spark/groupby_multi_insert_common_distinct.q.out index c2766f8..0241cd2 100644 --- a/ql/src/test/results/clientpositive/spark/groupby_multi_insert_common_distinct.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby_multi_insert_common_distinct.q.out @@ -39,191 +39,6 @@ STAGE PLANS: Stage: Stage-2 Spark Edges: - Reducer 5 <- Map 1 (SORT, 2) - Reducer 6 <- Map 1 (SORT, 2) - Reducer 3 <- Reducer 5 (GROUP, 2) - Reducer 4 <- Reducer 6 (GROUP, 2) -#### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: src - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: value (type: string) - sort order: + - Map-reduce partition columns: value (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), (key + key) (type: double) - Reducer 3 - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: string) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), UDFToInteger(_col1) (type: int) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 - Reducer 4 - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: double) - mode: final - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), UDFToInteger(_col1) (type: int) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - Reducer 5 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - Reducer 6 - Reduce Operator Tree: - Forward - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col1 (type: double) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: double) - sort order: + - Map-reduce partition columns: _col0 (type: double) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) - - Stage: Stage-0 - Move Operator - tables: - replace: true - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1 - - Stage: Stage-3 - Stats-Aggr Operator - - Stage: Stage-1 - Move Operator - tables: - replace: true - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest2 - - Stage: Stage-4 - Stats-Aggr Operator - -PREHOOK: query: from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@dest1 -PREHOOK: Output: default@dest2 -POSTHOOK: query: from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@dest1 -POSTHOOK: Output: default@dest2 -POSTHOOK: Lineage: dest1.cnt EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] -POSTHOOK: Lineage: dest1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: dest2.cnt EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] -POSTHOOK: Lineage: dest2.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] -PREHOOK: query: select * from dest1 where key < 10 -PREHOOK: type: QUERY -PREHOOK: Input: default@dest1 -#### A masked pattern was here #### -POSTHOOK: query: select * from dest1 where key < 10 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@dest1 -#### A masked pattern was here #### -0 1 -2 1 -4 1 -5 1 -8 1 -9 1 -PREHOOK: query: select * from dest2 where key < 20 order by key limit 10 -PREHOOK: type: QUERY -PREHOOK: Input: default@dest2 -#### A masked pattern was here #### -POSTHOOK: query: select * from dest2 where key < 20 order by key limit 10 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@dest2 -#### A masked pattern was here #### -0 1 -10 1 -16 1 -18 1 -4 1 -8 1 -PREHOOK: query: -- no need to spray by distinct key first -explain -from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -PREHOOK: type: QUERY -POSTHOOK: query: -- no need to spray by distinct key first -explain -from src -insert overwrite table dest1 select key, count(distinct value) group by key -insert overwrite table dest2 select key+key, count(distinct value) group by key+key -POSTHOOK: type: QUERY -STAGE DEPENDENCIES: - Stage-2 is a root stage - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 - Stage-1 depends on stages: Stage-2 - Stage-4 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-2 - Spark - Edges: Reducer 2 <- Map 4 (GROUP PARTITION-LEVEL SORT, 2) Reducer 3 <- Map 5 (GROUP PARTITION-LEVEL SORT, 2) #### A masked pattern was here #### diff --git a/ql/src/test/results/clientpositive/union17.q.out b/ql/src/test/results/clientpositive/union17.q.out index 75d2253..e8f7202 100644 --- a/ql/src/test/results/clientpositive/union17.q.out +++ b/ql/src/test/results/clientpositive/union17.q.out @@ -37,12 +37,11 @@ POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage Stage-3 depends on stages: Stage-2 - Stage-4 depends on stages: Stage-3 - Stage-0 depends on stages: Stage-4 - Stage-5 depends on stages: Stage-0 - Stage-6 depends on stages: Stage-3 - Stage-1 depends on stages: Stage-6 - Stage-7 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-3 + Stage-4 depends on stages: Stage-0 + Stage-5 depends on stages: Stage-3 + Stage-1 depends on stages: Stage-5 + Stage-6 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-2 @@ -85,12 +84,29 @@ STAGE PLANS: TableScan Union Statistics: Num rows: 501 Data size: 5584 Basic stats: COMPLETE Column stats: PARTIAL - Reduce Output Operator - key expressions: substr(_col1, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(_col1, 5) (type: string) - Statistics: Num rows: 501 Data size: 5584 Basic stats: COMPLETE Column stats: PARTIAL - value expressions: _col0 (type: string), _col1 (type: string) + Group By Operator + aggregations: count(DISTINCT substr(_col1, 5)) + keys: _col0 (type: string), substr(_col1, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 250 Data size: 48000 Basic stats: COMPLETE Column stats: PARTIAL + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 250 Data size: 48000 Basic stats: COMPLETE Column stats: PARTIAL + Group By Operator + aggregations: count(DISTINCT substr(_col1, 5)) + keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 250 Data size: 48000 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe TableScan alias: s2 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE @@ -100,55 +116,34 @@ STAGE PLANS: Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Union Statistics: Num rows: 501 Data size: 5584 Basic stats: COMPLETE Column stats: PARTIAL - Reduce Output Operator - key expressions: substr(_col1, 5) (type: string) - sort order: + - Map-reduce partition columns: substr(_col1, 5) (type: string) - Statistics: Num rows: 501 Data size: 5584 Basic stats: COMPLETE Column stats: PARTIAL - value expressions: _col0 (type: string), _col1 (type: string) - Reduce Operator Tree: - Forward - Statistics: Num rows: 501 Data size: 5584 Basic stats: COMPLETE Column stats: PARTIAL - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Group By Operator - aggregations: count(DISTINCT KEY._col0) - keys: VALUE._col0 (type: string), VALUE._col1 (type: string) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-4 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(DISTINCT substr(_col1, 5)) + keys: _col0 (type: string), substr(_col1, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 250 Data size: 48000 Basic stats: COMPLETE Column stats: PARTIAL + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 250 Data size: 48000 Basic stats: COMPLETE Column stats: PARTIAL + Group By Operator + aggregations: count(DISTINCT substr(_col1, 5)) + keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 250 Data size: 48000 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator @@ -170,24 +165,23 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest1 - Stage: Stage-5 + Stage: Stage-4 Stats-Aggr Operator - Stage: Stage-6 + Stage: Stage-5 Map Reduce Map Operator Tree: TableScan Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL - value expressions: _col2 (type: bigint) + Statistics: Num rows: 250 Data size: 48000 Basic stats: COMPLETE Column stats: PARTIAL Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0) + aggregations: count(DISTINCT KEY._col2:0._col0) keys: KEY._col0 (type: string), KEY._col1 (type: string) - mode: final + mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator @@ -209,7 +203,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.dest2 - Stage: Stage-7 + Stage: Stage-6 Stats-Aggr Operator PREHOOK: query: FROM (select 'tst1' as key, cast(count(1) as string) as value from src s1