From 089de91b699e817ab0b68cf35eeaa909e215d532 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Thu, 30 Apr 2015 18:33:09 -0700 Subject: [PATCH] HIVE-10568 : Select count(distinct()) can have more optimal execution plan --- .../rules/HiveExpandDistinctAggregatesRule.java | 276 +++++++++++++++++++++ .../hadoop/hive/ql/parse/CalcitePlanner.java | 12 +- .../clientpositive/tez/limit_pushdown.q.out | 31 +-- ql/src/test/results/clientpositive/tez/mrr.q.out | 52 ++-- .../clientpositive/tez/vector_count_distinct.q.out | 28 ++- .../clientpositive/tez/vectorization_limit.q.out | 31 +-- .../tez/vectorized_distinct_gby.q.out | 51 ++-- 7 files changed, 410 insertions(+), 71 deletions(-) create mode 100644 ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java new file mode 100644 index 0000000..d1ff819 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java @@ -0,0 +1,276 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Aggregate; +import org.apache.calcite.rel.core.AggregateCall; +import org.apache.calcite.rel.core.RelFactories; +import org.apache.calcite.rel.metadata.RelColumnOrigin; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.calcite.util.Pair; +import org.apache.calcite.util.Util; +import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Planner rule that expands distinct aggregates + * (such as {@code COUNT(DISTINCT x)}) from a + * {@link org.apache.calcite.rel.core.Aggregate}. + * + *

How this is done depends upon the arguments to the function. If all + * functions have the same argument + * (e.g. {@code COUNT(DISTINCT x), SUM(DISTINCT x)} both have the argument + * {@code x}) then one extra {@link org.apache.calcite.rel.core.Aggregate} is + * sufficient. + * + *

If there are multiple arguments + * (e.g. {@code COUNT(DISTINCT x), COUNT(DISTINCT y)}) + * the rule creates separate {@code Aggregate}s and combines using a + * {@link org.apache.calcite.rel.core.Join}. + */ + +// Stripped down version of org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule +// This is adapted for Hive, but should eventually be deleted from Hive and make use of above. + +public final class HiveExpandDistinctAggregatesRule extends RelOptRule { + //~ Static fields/initializers --------------------------------------------- + + /** The default instance of the rule; operates only on logical expressions. */ + public static final HiveExpandDistinctAggregatesRule INSTANCE = + new HiveExpandDistinctAggregatesRule(HiveAggregate.class, + HiveProject.DEFAULT_PROJECT_FACTORY); + + private static RelFactories.ProjectFactory projFactory; + + //~ Constructors ----------------------------------------------------------- + + public HiveExpandDistinctAggregatesRule( + Class clazz,RelFactories.ProjectFactory projectFactory) { + super(operand(clazz, any())); + projFactory = projectFactory; + } + + //~ Methods ---------------------------------------------------------------- + + @Override + public void onMatch(RelOptRuleCall call) { + final Aggregate aggregate = call.rel(0); + if (!aggregate.containsDistinctCall()) { + return; + } + + // Find all of the agg expressions. We use a LinkedHashSet to ensure + // determinism. + int nonDistinctCount = 0; + Set> argListSets = new LinkedHashSet>(); + for (AggregateCall aggCall : aggregate.getAggCallList()) { + if (!aggCall.isDistinct()) { + ++nonDistinctCount; + continue; + } + ArrayList argList = new ArrayList(); + for (Integer arg : aggCall.getArgList()) { + argList.add(arg); + Set colOrigs = RelMetadataQuery.getColumnOrigins(aggregate, arg); + if (null != colOrigs) { + for (RelColumnOrigin colOrig : colOrigs) { + RelOptHiveTable hiveTbl = (RelOptHiveTable)colOrig.getOriginTable(); + if(hiveTbl.getPartColInfoMap().containsKey(colOrig.getOriginColumnOrdinal())) { + // Encountered partitioning column, this will be better handled by MetadataOnly optimizer. + return; + } + } + } + } + argListSets.add(argList); + } + Util.permAssert(argListSets.size() > 0, "containsDistinctCall lied"); + + // If all of the agg expressions are distinct and have the same + // arguments then we can use a more efficient form. + if ((nonDistinctCount == 0) && (argListSets.size() == 1)) { + RelNode converted = + convertMonopole( + aggregate, + argListSets.iterator().next()); + call.transformTo(converted); + return; + } + } + + /** + * Converts an aggregate relational expression that contains just one + * distinct aggregate function (or perhaps several over the same arguments) + * and no non-distinct aggregate functions. + */ + private RelNode convertMonopole( + Aggregate aggregate, + List argList) { + // For example, + // SELECT deptno, COUNT(DISTINCT sal), SUM(DISTINCT sal) + // FROM emp + // GROUP BY deptno + // + // becomes + // + // SELECT deptno, COUNT(distinct_sal), SUM(distinct_sal) + // FROM ( + // SELECT DISTINCT deptno, sal AS distinct_sal + // FROM EMP GROUP BY deptno) + // GROUP BY deptno + + // Project the columns of the GROUP BY plus the arguments + // to the agg function. + Map sourceOf = new HashMap(); + final Aggregate distinct = + createSelectDistinct(aggregate, argList, sourceOf); + + // Create an aggregate on top, with the new aggregate list. + final List newAggCalls = + Lists.newArrayList(aggregate.getAggCallList()); + rewriteAggCalls(newAggCalls, argList, sourceOf); + final int cardinality = aggregate.getGroupSet().cardinality(); + return aggregate.copy(aggregate.getTraitSet(), distinct, + aggregate.indicator, ImmutableBitSet.range(cardinality), null, + newAggCalls); + } + + private static void rewriteAggCalls( + List newAggCalls, + List argList, + Map sourceOf) { + // Rewrite the agg calls. Each distinct agg becomes a non-distinct call + // to the corresponding field from the right; for example, + // "COUNT(DISTINCT e.sal)" becomes "COUNT(distinct_e.sal)". + for (int i = 0; i < newAggCalls.size(); i++) { + final AggregateCall aggCall = newAggCalls.get(i); + + // Ignore agg calls which are not distinct or have the wrong set + // arguments. If we're rewriting aggs whose args are {sal}, we will + // rewrite COUNT(DISTINCT sal) and SUM(DISTINCT sal) but ignore + // COUNT(DISTINCT gender) or SUM(sal). + if (!aggCall.isDistinct()) { + continue; + } + if (!aggCall.getArgList().equals(argList)) { + continue; + } + + // Re-map arguments. + final int argCount = aggCall.getArgList().size(); + final List newArgs = new ArrayList(argCount); + for (int j = 0; j < argCount; j++) { + final Integer arg = aggCall.getArgList().get(j); + newArgs.add(sourceOf.get(arg)); + } + final AggregateCall newAggCall = + new AggregateCall( + aggCall.getAggregation(), + false, + newArgs, + aggCall.getType(), + aggCall.getName()); + newAggCalls.set(i, newAggCall); + } + } + + /** + * Given an {@link org.apache.calcite.rel.logical.LogicalAggregate} + * and the ordinals of the arguments to a + * particular call to an aggregate function, creates a 'select distinct' + * relational expression which projects the group columns and those + * arguments but nothing else. + * + *

For example, given + * + *

+ *
select f0, count(distinct f1), count(distinct f2)
+   * from t group by f0
+ *
+ * + * and the arglist + * + *
{2}
+ * + * returns + * + *
+ *
select distinct f0, f2 from t
+ *
+ * + * ' + * + *

The sourceOf map is populated with the source of each + * column; in this case sourceOf.get(0) = 0, and sourceOf.get(1) = 2.

+ * + * @param aggregate Aggregate relational expression + * @param argList Ordinals of columns to make distinct + * @param sourceOf Out parameter, is populated with a map of where each + * output field came from + * @return Aggregate relational expression which projects the required + * columns + */ + private static Aggregate createSelectDistinct( + Aggregate aggregate, + List argList, + Map sourceOf) { + final List> projects = + new ArrayList>(); + final RelNode child = aggregate.getInput(); + final List childFields = + child.getRowType().getFieldList(); + for (int i : aggregate.getGroupSet()) { + sourceOf.put(i, projects.size()); + projects.add(RexInputRef.of2(i, childFields)); + } + for (Integer arg : argList) { + if (sourceOf.get(arg) != null) { + continue; + } + sourceOf.put(arg, projects.size()); + projects.add(RexInputRef.of2(arg, childFields)); + } + final RelNode project = + projFactory.createProject(child, Pair.left(projects), Pair.right(projects)); + + // Get the distinct values of the GROUP BY fields and the arguments + // to the agg functions. + return aggregate.copy(aggregate.getTraitSet(), project, false, + ImmutableBitSet.range(projects.size()), + null, ImmutableList.of()); + } +} + +// End AggregateExpandDistinctAggregatesRule.java \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 49ad6ad..2814083 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -137,6 +137,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveExpandDistinctAggregatesRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterProjectTransposeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterSetOpTransposeRule; @@ -266,7 +267,7 @@ Operator genOPTree(ASTNode ast, PlannerContext plannerCtx) throws SemanticExcept LOG.info("CBO Succeeded; optimized logical plan."); this.ctx.setCboInfo("Plan optimized by CBO."); LOG.debug(newAST.dump()); - } + } } catch (Exception e) { boolean isMissingStats = noColsMissingStats.get() > 0; if (isMissingStats) { @@ -900,6 +901,15 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv // Partition Pruning; otherwise Expression evaluation may try to execute // corelated sub query. + //0. Distinct aggregate rewrite + // Run this optimization early, since it is expanding the operator pipeline. + if (conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) { + // Its not clear, if this rewrite is always performant on MR, since extra map phase + // introduced for 2nd MR job may offset gains of this multi-stage aggregation. + // We need a cost model for MR to enable this on MR. + basePlan = hepPlan(basePlan, true, mdProvider, HiveExpandDistinctAggregatesRule.INSTANCE); + } + // 1. Push Down Semi Joins basePlan = hepPlan(basePlan, true, mdProvider, SemiJoinJoinTransposeRule.INSTANCE, SemiJoinFilterTransposeRule.INSTANCE, SemiJoinProjectTransposeRule.INSTANCE); diff --git a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out index 7038b4d..2a41aae 100644 --- a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out +++ b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out @@ -476,35 +476,38 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: tinyint), _col1 (type: double) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: tinyint), _col1 (type: double) sort order: ++ Map-reduce partition columns: _col0 (type: tinyint) Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE - TopN Hash Memory Usage: 0.3 Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: tinyint) + keys: KEY._col0 (type: tinyint), KEY._col1 (type: double) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 20 - Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: tinyint) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 20 Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/tez/mrr.q.out b/ql/src/test/results/clientpositive/tez/mrr.q.out index ac5ac5c..d90b27f 100644 --- a/ql/src/test/results/clientpositive/tez/mrr.q.out +++ b/ql/src/test/results/clientpositive/tez/mrr.q.out @@ -452,10 +452,9 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: string), _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) @@ -465,25 +464,30 @@ STAGE PLANS: Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col1 (type: bigint) - sort order: + - Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: string) + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: string) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: bigint) + sort order: + + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string) Reducer 4 Reduce Operator Tree: Select Operator expressions: VALUE._col0 (type: string), KEY.reducesinkkey0 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -855,10 +859,9 @@ STAGE PLANS: Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE HybridGraceHashJoin: true Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: string), _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) @@ -885,25 +888,30 @@ STAGE PLANS: Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col1 (type: bigint) - sort order: + - Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: string) + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: string) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: bigint) + sort order: + + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string) Reducer 3 Reduce Operator Tree: Select Operator expressions: VALUE._col0 (type: string), KEY.reducesinkkey0 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 137 Data size: 1455 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 68 Data size: 722 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat diff --git a/ql/src/test/results/clientpositive/tez/vector_count_distinct.q.out b/ql/src/test/results/clientpositive/tez/vector_count_distinct.q.out index f1da471..e6d34ff 100644 --- a/ql/src/test/results/clientpositive/tez/vector_count_distinct.q.out +++ b/ql/src/test/results/clientpositive/tez/vector_count_distinct.q.out @@ -1248,6 +1248,7 @@ STAGE PLANS: Tez Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1260,30 +1261,47 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col0) keys: _col0 (type: int) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + keys: KEY._col0 (type: int) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1000 Data size: 1752000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/tez/vectorization_limit.q.out b/ql/src/test/results/clientpositive/tez/vectorization_limit.q.out index d815938..1c5b51f 100644 --- a/ql/src/test/results/clientpositive/tez/vectorization_limit.q.out +++ b/ql/src/test/results/clientpositive/tez/vectorization_limit.q.out @@ -345,36 +345,39 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: tinyint), _col1 (type: double) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: tinyint), _col1 (type: double) sort order: ++ Map-reduce partition columns: _col0 (type: tinyint) Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE - TopN Hash Memory Usage: 0.3 Execution mode: vectorized Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: tinyint) + keys: KEY._col0 (type: tinyint), KEY._col1 (type: double) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 20 - Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: tinyint) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 20 Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/tez/vectorized_distinct_gby.q.out b/ql/src/test/results/clientpositive/tez/vectorized_distinct_gby.q.out index 90c9934..cf95c8a 100644 --- a/ql/src/test/results/clientpositive/tez/vectorized_distinct_gby.q.out +++ b/ql/src/test/results/clientpositive/tez/vectorized_distinct_gby.q.out @@ -41,31 +41,35 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: sum(DISTINCT _col0), count(DISTINCT _col0) - bucketGroup: true keys: _col0 (type: int) - mode: hash - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE + mode: final + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col0), count(_col0) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint) Execution mode: vectorized Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: sum(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0) + aggregations: sum(VALUE._col0), count(VALUE._col1) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized Stage: Stage-0 Fetch Operator @@ -95,6 +99,7 @@ STAGE PLANS: Tez Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -107,20 +112,36 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: sum(DISTINCT _col0), count(DISTINCT _col0), avg(DISTINCT _col0), std(DISTINCT _col0) keys: _col0 (type: int) mode: hash - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + outputColumnNames: _col0 Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: sum(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0), avg(DISTINCT KEY._col0:2._col0), std(DISTINCT KEY._col0:3._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col0), count(_col0), avg(_col0), std(_col0) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: struct), _col3 (type: struct) + Reducer 3 + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), count(VALUE._col1), avg(VALUE._col2), std(VALUE._col3) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3 Statistics: Num rows: 1 Data size: 32 Basic stats: COMPLETE Column stats: NONE -- 1.7.12.4 (Apple Git-37)