diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 3672c7afef..4f01bcab45 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -485,6 +485,7 @@ minillaplocal.query.files=\ colstats_date_min_max.q,\ compare_double_bigint_2.q,\ constprog_dpp.q,\ + constraints_optimization.q,\ current_date_timestamp.q,\ correlationoptimizer1.q,\ correlationoptimizer2.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java index 42e60de6a8..99bbad5439 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java @@ -167,6 +167,8 @@ public Expression getExpression(Class clazz) { throw new UnsupportedOperationException(); } + public List getKeys() { return keys; } + @Override public RelOptTable extend(List extendedFields) { throw new UnsupportedOperationException(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java index 5857f730a8..7ddf8214a6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java @@ -24,15 +24,19 @@ import java.util.Map; import java.util.Set; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; import org.apache.calcite.adapter.druid.DruidQuery; import org.apache.calcite.linq4j.Ord; import org.apache.calcite.plan.RelOptTable; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Aggregate; +import org.apache.calcite.rel.core.AggregateCall; import org.apache.calcite.rel.core.CorrelationId; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.TableScan; +import org.apache.calcite.rel.metadata.RelMetadataQuery; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexBuilder; @@ -309,9 +313,52 @@ else if(rexNode instanceof RexCall return false; } } + + private ImmutableBitSet generateNewGroupset(Aggregate aggregate, ImmutableBitSet fieldsUsed) { + + ImmutableBitSet originalGroupSet = aggregate.getGroupSet(); + + if (aggregate.getGroupSets().size() > 1 || aggregate.getIndicatorCount() > 0 + || fieldsUsed.contains(originalGroupSet)) { + // if there is grouping sets, indicator or all the group keys are being used we do no need to proceed further + return originalGroupSet; + } + + final RelNode input = aggregate.getInput(); + RelMetadataQuery mq = aggregate.getCluster().getMetadataQuery(); + + // there is at least one gb key not being used, we need to find all unique keys to get to the column + // which isn't unique key + final Set uniqueKeys = mq.getUniqueKeys(input, false); + if (uniqueKeys == null || uniqueKeys.isEmpty()) { + return originalGroupSet; + } + + // we have set of unique key, get to the key which is same as group by key + ImmutableBitSet groupByUniqueKey = null; + + for (ImmutableBitSet key : uniqueKeys) { + if (aggregate.getGroupSet().contains(key)) { + groupByUniqueKey = key; + break; + } + } + + if (groupByUniqueKey == null) { + 
// group by keys do not represent unique keys + return originalGroupSet; + } + + // we know group by key contains primary key and there is at least one column in group by which is not being used + // if that column is not part of key it should be removed + ImmutableBitSet nonKeyColumns = aggregate.getGroupSet().except(groupByUniqueKey); + ImmutableBitSet columnsToRemove = nonKeyColumns.except(fieldsUsed); + ImmutableBitSet newGroupSet = aggregate.getGroupSet().except(columnsToRemove); + + return newGroupSet; + } + /** - * Variant of {@link #trimFields(Aggregate, ImmutableBitSet, Set)} for - * {@link org.apache.calcite.rel.logical.LogicalAggregate}. * This method replaces group by 'constant key' with group by true (boolean) * if and only if * group by doesn't have grouping sets @@ -323,49 +370,180 @@ else if(rexNode instanceof RexCall * * This is mainly done so that hive is able to push down queries with * group by 'constant key with type not supported by druid' into druid + * */ - public TrimResult trimFields(Aggregate aggregate, ImmutableBitSet fieldsUsed, - Set extraFields) { - - Aggregate newAggregate = aggregate; - if (!(aggregate.getIndicatorCount() > 0) - && !(aggregate.getGroupSet().isEmpty()) - && !fieldsUsed.contains(aggregate.getGroupSet())) { - final RelNode input = aggregate.getInput(); - final RelDataType rowType = input.getRowType(); - RexBuilder rexBuilder = aggregate.getCluster().getRexBuilder(); - final List newProjects = new ArrayList<>(); - - final List inputExprs = input.getChildExps(); - if(inputExprs == null || inputExprs.isEmpty()) { - return super.trimFields(newAggregate, fieldsUsed, extraFields); + + private Aggregate rewriteGBConstantKeys(Aggregate aggregate, ImmutableBitSet fieldsUsed, + Set extraFields) { + if ((aggregate.getIndicatorCount() > 0) + || (aggregate.getGroupSet().isEmpty()) + || fieldsUsed.contains(aggregate.getGroupSet())) { + return aggregate; + } + + final RelNode input = aggregate.getInput(); + + + final RelDataType rowType = input.getRowType(); + RexBuilder rexBuilder = aggregate.getCluster().getRexBuilder(); + final List newProjects = new ArrayList<>(); + + final List inputExprs = input.getChildExps(); + if (inputExprs == null || inputExprs.isEmpty()) { + return aggregate; + } + + boolean allConstants = true; + for (int key : aggregate.getGroupSet()) { + // getChildExprs on Join could return less number of expressions than there are coming out of join + if (inputExprs.size() <= key || !isRexLiteral(inputExprs.get(key))) { + allConstants = false; + break; } + } - boolean allConstants = true; - for(int key : aggregate.getGroupSet()) { - // getChildExprs on Join could return less number of expressions than there are coming out of join - if(inputExprs.size() <= key || !isRexLiteral(inputExprs.get(key))){ - allConstants = false; - break; + if (allConstants) { + for (int i = 0; i < rowType.getFieldCount(); i++) { + if (aggregate.getGroupSet().get(i)) { + newProjects.add(rexBuilder.makeLiteral(true)); + } else { + newProjects.add(rexBuilder.makeInputRef(input, i)); } } + relBuilder.push(input); + relBuilder.project(newProjects); + Aggregate newAggregate = new HiveAggregate(aggregate.getCluster(), aggregate.getTraitSet(), relBuilder.build(), + aggregate.getGroupSet(), null, aggregate.getAggCallList()); + return newAggregate; + } + return aggregate; + } - if (allConstants) { - for (int i = 0; i < rowType.getFieldCount(); i++) { - if (aggregate.getGroupSet().get(i)) { - newProjects.add(rexBuilder.makeLiteral(true)); - } else { - 
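The bit-set arithmetic that generateNewGroupset() performs once a unique key is found inside the group-by columns can be exercised in isolation. The standalone sketch below uses invented column positions (a unique key on column 0, unused grouped columns 1 and 2); it only illustrates the reduction and is not part of the patch.

import org.apache.calcite.util.ImmutableBitSet;

public class GroupSetReductionSketch {
  public static void main(String[] args) {
    // Hypothetical shape: GROUP BY columns {0, 1, 2}, a RELY primary key on {0},
    // and only column 0 referenced by the consumer of the Aggregate.
    ImmutableBitSet groupSet = ImmutableBitSet.of(0, 1, 2);
    ImmutableBitSet groupByUniqueKey = ImmutableBitSet.of(0);
    ImmutableBitSet fieldsUsed = ImmutableBitSet.of(0);

    // Same steps as the tail of generateNewGroupset(): drop grouped columns that
    // are neither part of the unique key nor used downstream.
    ImmutableBitSet nonKeyColumns = groupSet.except(groupByUniqueKey);   // {1, 2}
    ImmutableBitSet columnsToRemove = nonKeyColumns.except(fieldsUsed);  // {1, 2}
    ImmutableBitSet newGroupSet = groupSet.except(columnsToRemove);      // {0}

    System.out.println(newGroupSet); // prints {0}
  }
}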
newProjects.add(rexBuilder.makeInputRef(input, i)); - } - } - relBuilder.push(input); - relBuilder.project(newProjects); - newAggregate = new HiveAggregate(aggregate.getCluster(), aggregate.getTraitSet(), relBuilder.build(), - aggregate.getGroupSet(), null, aggregate.getAggCallList()); + @Override + public TrimResult trimFields(Aggregate aggregate, ImmutableBitSet fieldsUsed, Set extraFields) { + // Fields: + // + // | sys fields | group fields | indicator fields | agg functions | + // + // Two kinds of trimming: + // + // 1. If agg rel has system fields but none of these are used, create an + // agg rel with no system fields. + // + // 2. If aggregate functions are not used, remove them. + // + // But group and indicator fields stay, even if they are not used. + + aggregate = rewriteGBConstantKeys(aggregate, fieldsUsed, extraFields); + + final RelDataType rowType = aggregate.getRowType(); + + // Compute which input fields are used. + // 1. group fields are always used + final ImmutableBitSet.Builder inputFieldsUsed = + aggregate.getGroupSet().rebuild(); + // 2. agg functions + for (AggregateCall aggCall : aggregate.getAggCallList()) { + for (int i : aggCall.getArgList()) { + inputFieldsUsed.set(i); + } + if (aggCall.filterArg >= 0) { + inputFieldsUsed.set(aggCall.filterArg); + } + } + + // Create input with trimmed columns. + final RelNode input = aggregate.getInput(); + final Set inputExtraFields = Collections.emptySet(); + final TrimResult trimResult = + trimChild(aggregate, input, inputFieldsUsed.build(), inputExtraFields); + final RelNode newInput = trimResult.left; + final Mapping inputMapping = trimResult.right; + + ImmutableBitSet updatedGroupSet = generateNewGroupset(aggregate, fieldsUsed); + final int updatedGroupCount = updatedGroupSet.cardinality(); + fieldsUsed = + fieldsUsed.union(updatedGroupSet); + + // If the input is unchanged, and we need to project all columns, + // there's nothing to do. + if (input == newInput + && fieldsUsed.equals(ImmutableBitSet.range(rowType.getFieldCount()))) { + return result(aggregate, + Mappings.createIdentity(rowType.getFieldCount())); + } + + // update the group by keys based on inputMapping + ImmutableBitSet newGroupSet = + Mappings.apply(inputMapping, updatedGroupSet); + + // Which agg calls are used by our consumer? + int originalGroupCount = aggregate.getGroupSet().cardinality(); + int j = originalGroupCount; + int usedAggCallCount = 0; + for (int i = 0; i < aggregate.getAggCallList().size(); i++) { + if (fieldsUsed.get(j++)) { + ++usedAggCallCount; } } - return super.trimFields(newAggregate, fieldsUsed, extraFields); + + // Offset due to the number of system fields having changed. + Mapping mapping = + Mappings.create( + MappingType.INVERSE_SURJECTION, + rowType.getFieldCount(), + updatedGroupCount + usedAggCallCount); + + + // if group keys were reduced, it means we didn't have grouping therefore + // we don't need to transform group sets + ImmutableList newGroupSets = null; + if(!updatedGroupSet.equals(aggregate.getGroupSet())) { + newGroupSets = ImmutableList.of(newGroupSet); + } else { + newGroupSets = ImmutableList.copyOf( + Iterables.transform(aggregate.getGroupSets(), + input1 -> Mappings.apply(inputMapping, input1))); + } + + // Populate mapping of where to find the fields. System, group key and + // indicator fields first. + int gbKeyIdx = 0; + for (j = 0; j < originalGroupCount; j++) { + if(fieldsUsed.get(j)) { + mapping.set(j, gbKeyIdx); + gbKeyIdx++; + } + } + + // Now create new agg calls, and populate mapping for them. 
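The output-field remapping built in the rewritten trimFields(Aggregate, ...) uses an INVERSE_SURJECTION mapping sized by the reduced group count plus the surviving aggregate calls. The sketch below is a hypothetical, self-contained illustration of that pattern (the field counts 5 and 2 are made up), not code from the patch.

import org.apache.calcite.util.mapping.Mapping;
import org.apache.calcite.util.mapping.MappingType;
import org.apache.calcite.util.mapping.Mappings;

public class AggregateTrimMappingSketch {
  public static void main(String[] args) {
    // Hypothetical original Aggregate: 3 group columns followed by 2 aggregate
    // calls (5 output fields). After trimming, only group column 0 and the second
    // aggregate call survive, so the new Aggregate emits 2 fields.
    Mapping mapping = Mappings.create(MappingType.INVERSE_SURJECTION, 5, 2);
    mapping.set(0, 0); // surviving group key stays at position 0
    mapping.set(4, 1); // surviving agg call lands right after the group keys
    System.out.println(mapping); // sources 1, 2 and 3 have no target
  }
}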
+ relBuilder.push(newInput); + final List newAggCallList = new ArrayList<>(); + j = originalGroupCount; // because lookup in fieldsUsed is done using original group count + for (AggregateCall aggCall : aggregate.getAggCallList()) { + if (fieldsUsed.get(j)) { + final ImmutableList args = + relBuilder.fields( + Mappings.apply2(inputMapping, aggCall.getArgList())); + final RexNode filterArg = aggCall.filterArg < 0 ? null + : relBuilder.field(Mappings.apply(inputMapping, aggCall.filterArg)); + RelBuilder.AggCall newAggCall = + relBuilder.aggregateCall(aggCall.getAggregation(), + aggCall.isDistinct(), aggCall.isApproximate(), + filterArg, aggCall.name, args); + mapping.set(j, updatedGroupCount + newAggCallList.size()); + newAggCallList.add(newAggCall); + } + ++j; + } + + final RelBuilder.GroupKey groupKey = + relBuilder.groupKey(newGroupSet, newGroupSets); + relBuilder.aggregate(groupKey, newAggCallList); + + return result(relBuilder.build(), mapping); } + /** * Variant of {@link #trimFields(RelNode, ImmutableBitSet, Set)} for * {@link org.apache.calcite.rel.logical.LogicalProject}. diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java index 1ca1937ed9..e2881e733b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java @@ -329,7 +329,7 @@ private static double pkSelectivity(Join joinRel, RelMetadataQuery mq, boolean l private static boolean isKey(ImmutableBitSet c, RelNode rel, RelMetadataQuery mq) { boolean isKey = false; - Set keys = mq.getUniqueKeys(rel); + Set keys = mq.getUniqueKeys(rel, true); if (keys != null) { for (ImmutableBitSet key : keys) { if (key.equals(c)) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java index 3bf62c535c..c2c2854543 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java @@ -40,6 +40,7 @@ import org.apache.calcite.util.BitSets; import org.apache.calcite.util.BuiltInMethod; import org.apache.calcite.util.ImmutableBitSet; +import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.plan.ColStatistics; @@ -54,76 +55,11 @@ return BuiltInMetadata.UniqueKeys.DEF; } - /* - * Infer Uniquenes if: - rowCount(col) = ndv(col) - TBD for numerics: max(col) - * - min(col) = rowCount(col) - * - * Why are we intercepting Project and not TableScan? Because if we - * have a method for TableScan, it will not know which columns to check for. - * Inferring Uniqueness for all columns is very expensive right now. The flip - * side of doing this is, it only works post Field Trimming. 
- */ - public Set getUniqueKeys(Project rel, RelMetadataQuery mq, boolean ignoreNulls) { - - HiveTableScan tScan = getTableScan(rel.getInput(), false); - - if (tScan == null) { - // If HiveTableScan is not found, e.g., not sequence of Project and - // Filter operators, execute the original getUniqueKeys method - - // LogicalProject maps a set of rows to a different set; - // Without knowledge of the mapping function(whether it - // preserves uniqueness), it is only safe to derive uniqueness - // info from the child of a project when the mapping is f(a) => a. - // - // Further more, the unique bitset coming from the child needs - // to be mapped to match the output of the project. - final Map mapInToOutPos = new HashMap<>(); - final List projExprs = rel.getProjects(); - final Set projUniqueKeySet = new HashSet<>(); - - // Build an input to output position map. - for (int i = 0; i < projExprs.size(); i++) { - RexNode projExpr = projExprs.get(i); - if (projExpr instanceof RexInputRef) { - mapInToOutPos.put(((RexInputRef) projExpr).getIndex(), i); - } - } - - if (mapInToOutPos.isEmpty()) { - // if there's no RexInputRef in the projected expressions - // return empty set. - return projUniqueKeySet; - } - - Set childUniqueKeySet = - mq.getUniqueKeys(rel.getInput(), ignoreNulls); - - if (childUniqueKeySet != null) { - // Now add to the projUniqueKeySet the child keys that are fully - // projected. - for (ImmutableBitSet colMask : childUniqueKeySet) { - ImmutableBitSet.Builder tmpMask = ImmutableBitSet.builder(); - boolean completeKeyProjected = true; - for (int bit : colMask) { - if (mapInToOutPos.containsKey(bit)) { - tmpMask.set(mapInToOutPos.get(bit)); - } else { - // Skip the child unique key if part of it is not - // projected. - completeKeyProjected = false; - break; - } - } - if (completeKeyProjected) { - projUniqueKeySet.add(tmpMask.build()); - } - } - } - - return projUniqueKeySet; - } + //Infer Uniquenes if: - rowCount(col) = ndv(col) - TBD for numerics: max(col) + // - min(col) = rowCount(col) + private Set generateKeysUsingStatsEstimation(Project rel, RelMetadataQuery mq, + HiveTableScan tScan) { Map posMap = new HashMap(); int projectPos = 0; int colStatsPos = 0; @@ -140,7 +76,7 @@ double numRows = mq.getRowCount(tScan); List colStats = tScan.getColStat(BitSets - .toList(projectedCols)); + .toList(projectedCols)); Set keys = new HashSet(); colStatsPos = 0; @@ -149,14 +85,14 @@ if (cStat.getCountDistint() >= numRows) { isKey = true; } - if ( !isKey && cStat.getRange() != null && - cStat.getRange().maxValue != null && + if (!isKey && cStat.getRange() != null && + cStat.getRange().maxValue != null && cStat.getRange().minValue != null) { - double r = cStat.getRange().maxValue.doubleValue() - + double r = cStat.getRange().maxValue.doubleValue() - cStat.getRange().minValue.doubleValue() + 1; isKey = (Math.abs(numRows - r) < RelOptUtil.EPSILON); } - if ( isKey ) { + if (isKey) { ImmutableBitSet key = ImmutableBitSet.of(posMap.get(colStatsPos)); keys.add(key); } @@ -164,6 +100,91 @@ } return keys; + + } + + /* + * This API is being used for two separate things (unfortunately). Original use is to infer uniquness based + * on statistics and the second use is to infer based on key constraints. This is distinguished using ignoreNulls + * param. 
If true we go with stats estimation and if false we go with constraint based (used in rel trimmer to + * eliminate group by keys) + * + */ + public Set getUniqueKeys(Project rel, RelMetadataQuery mq, boolean ignoreNulls) { + + if (ignoreNulls) { //called by iskey of Hiverelmdrowcount + HiveTableScan tScan = getTableScan(rel.getInput(), false); + if (tScan != null) { + return generateKeysUsingStatsEstimation(rel, mq, tScan); + } + } + + // If HiveTableScan is not found, e.g., not sequence of Project and + // Filter operators, execute the original getUniqueKeys method + + // LogicalProject maps a set of rows to a different set; + // Without knowledge of the mapping function(whether it + // preserves uniqueness), it is only safe to derive uniqueness + // info from the child of a project when the mapping is f(a) => a. + // + // Further more, the unique bitset coming from the child needs + // to be mapped to match the output of the project. + final Map mapInToOutPos = new HashMap<>(); + final List projExprs = rel.getProjects(); + final Set projUniqueKeySet = new HashSet<>(); + + // Build an input to output position map. + for (int i = 0; i < projExprs.size(); i++) { + RexNode projExpr = projExprs.get(i); + if (projExpr instanceof RexInputRef) { + mapInToOutPos.put(((RexInputRef) projExpr).getIndex(), i); + } + } + + if (mapInToOutPos.isEmpty()) { + // if there's no RexInputRef in the projected expressions + // return empty set. + return projUniqueKeySet; + } + + Set childUniqueKeySet = + mq.getUniqueKeys(rel.getInput(), ignoreNulls); + + if (childUniqueKeySet != null) { + // Now add to the projUniqueKeySet the child keys that are fully + // projected. + for (ImmutableBitSet colMask : childUniqueKeySet) { + ImmutableBitSet.Builder tmpMask = ImmutableBitSet.builder(); + boolean completeKeyProjected = true; + for (int bit : colMask) { + if (mapInToOutPos.containsKey(bit)) { + tmpMask.set(mapInToOutPos.get(bit)); + } else { + // Skip the child unique key if part of it is not + // projected. 
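When ignoreNulls is true, the statistics-driven path reduces to two checks per projected column: either the distinct-value count covers every row, or a numeric column forms a dense range of exactly rowCount values. A minimal restatement with invented numbers (the helper name and inputs below are hypothetical, not from the patch):

import org.apache.calcite.plan.RelOptUtil;

public class UniqueFromStatsSketch {
  // Mirrors the two tests in generateKeysUsingStatsEstimation().
  static boolean isUniqueColumn(double numRows, double ndv, Double min, Double max) {
    if (ndv >= numRows) {
      return true;                       // NDV covers every row
    }
    if (min != null && max != null) {
      double range = max - min + 1;      // dense numeric range check
      return Math.abs(numRows - range) < RelOptUtil.EPSILON;
    }
    return false;
  }

  public static void main(String[] args) {
    System.out.println(isUniqueColumn(100, 100, null, null));  // true
    System.out.println(isUniqueColumn(100, 80, 1.0, 100.0));   // true
    System.out.println(isUniqueColumn(100, 80, 1.0, 50.0));    // false
  }
}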
+ completeKeyProjected = false; + break; + } + } + if (completeKeyProjected) { + projUniqueKeySet.add(tmpMask.build()); + } + } + } + return projUniqueKeySet; + } + + public Set getUniqueKeys(HiveTableScan rel, RelMetadataQuery mq, + boolean ignoreNulls) { + //TODO: ignoreNulls need to be taken into account + RelOptHiveTable tbl = (RelOptHiveTable) rel.getTable(); + List keyList = tbl.getKeys(); + if (keyList != null) { + //TODO: this operation is expensive, RelOptHiveTable should be updated to keep set instead of list + Set keySet = new HashSet<>(keyList); + return keySet; + } + return null; } /* diff --git a/ql/src/test/queries/clientpositive/constraints_optimization.q b/ql/src/test/queries/clientpositive/constraints_optimization.q new file mode 100644 index 0000000000..0ee461d5f8 --- /dev/null +++ b/ql/src/test/queries/clientpositive/constraints_optimization.q @@ -0,0 +1,111 @@ +set hive.strict.checks.cartesian.product=false; + +CREATE TABLE `customer_removal_n0`( + `c_custkey` bigint, + `c_name` string, + `c_address` string, + `c_city` string, + `c_nation` string, + `c_region` string, + `c_phone` string, + `c_mktsegment` string, + primary key (`c_custkey`) disable rely); + +CREATE TABLE `dates_removal_n0`( + `d_datekey` bigint, + `d_id` bigint, + `d_date` string, + `d_dayofweek` string, + `d_month` string, + `d_year` int, + `d_yearmonthnum` int, + `d_yearmonth` string, + `d_daynuminweek` int, + `d_daynuminmonth` int, + `d_daynuminyear` int, + `d_monthnuminyear` int, + `d_weeknuminyear` int, + `d_sellingseason` string, + `d_lastdayinweekfl` int, + `d_lastdayinmonthfl` int, + `d_holidayfl` int , + `d_weekdayfl`int, + primary key (`d_datekey`, `d_id`) disable rely); + + -- group by key has single primary key + EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey; + + -- mix of primary + non-primary keys + EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey, c_nation; + + -- multiple keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_id; + + -- multiple keys + non-keys + different order + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_datekey, d_sellingseason + order by d_datekey limit 10; + + -- multiple keys in different order and mixed with non-keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey, + d_sellingseason order by d_datekey limit 10; + + -- same as above but with aggregate + EXPLAIN SELECT count(d_datekey) from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey, + d_sellingseason order by d_datekey limit 10; + + -- join + insert into dates_removal_n0(d_datekey, d_id) values(3, 0); + insert into dates_removal_n0(d_datekey, d_id) values(3, 1); + insert into customer_removal_n0 (c_custkey) values(3); + + EXPLAIN SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id; + SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id; + + -- group by keys are not primary keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_sellingseason + order by d_datekey limit 10; + + -- negative + -- with aggregate function + EXPLAIN SELECT count(c_custkey) from customer_removal_n0 where c_nation IN ('USA', 'INDIA') + group by 
c_custkey, c_nation; + + DROP TABLE customer_removal_n0; + DROP TABLE dates_removal_n0; + + -- group by reduction optimization + create table dest_g21 (key1 int, value1 double, primary key(key1) disable rely); + insert into dest_g21 values(1, 2), (2,2), (3, 1), (4,4), (5, null), (6, null); + + -- value1 will removed because it is unused, then whole group by will be removed because key1 is unique + explain select key1 from dest_g21 group by key1, value1; + select key1 from dest_g21 group by key1, value1; + -- same query but with filter + explain select key1 from dest_g21 where value1 > 1 group by key1, value1; + select key1 from dest_g21 where value1 > 1 group by key1, value1; + + -- only value1 will be removed because there is aggregate call + explain select count(key1) from dest_g21 group by key1, value1; + select count(key1) from dest_g21 group by key1, value1; + + explain select count(key1) from dest_g21 where value1 > 1 group by key1, value1; + select count(key1) from dest_g21 where value1 > 1 group by key1, value1; + + -- t1.key is unique even after join therefore group by = group by (t1.key) + explain select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1; + select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1; + + explain select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1; + select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1; + + -- both aggregate and one of the key1 should be removed + explain select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub; + select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub; + + -- one of the aggregate will be removed and one of the key1 will be removed + explain select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub; + select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub; + + DROP table dest_g21; + + diff --git a/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out b/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out new file mode 100644 index 0000000000..7f02040ee5 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out @@ -0,0 +1,1286 @@ +PREHOOK: query: CREATE TABLE `customer_removal_n0`( + `c_custkey` bigint, + `c_name` string, + `c_address` string, + `c_city` string, + `c_nation` string, + `c_region` string, + `c_phone` string, + `c_mktsegment` string, + primary key (`c_custkey`) disable rely) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer_removal_n0 +POSTHOOK: query: CREATE TABLE `customer_removal_n0`( + `c_custkey` bigint, + `c_name` string, + `c_address` string, + `c_city` string, + `c_nation` string, + `c_region` string, + `c_phone` string, + `c_mktsegment` string, + primary key (`c_custkey`) disable rely) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer_removal_n0 +PREHOOK: query: CREATE TABLE `dates_removal_n0`( + `d_datekey` bigint, + `d_id` bigint, + `d_date` string, + `d_dayofweek` string, + `d_month` string, + `d_year` int, + `d_yearmonthnum` int, + 
`d_yearmonth` string, + `d_daynuminweek` int, + `d_daynuminmonth` int, + `d_daynuminyear` int, + `d_monthnuminyear` int, + `d_weeknuminyear` int, + `d_sellingseason` string, + `d_lastdayinweekfl` int, + `d_lastdayinmonthfl` int, + `d_holidayfl` int , + `d_weekdayfl`int, + primary key (`d_datekey`, `d_id`) disable rely) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dates_removal_n0 +POSTHOOK: query: CREATE TABLE `dates_removal_n0`( + `d_datekey` bigint, + `d_id` bigint, + `d_date` string, + `d_dayofweek` string, + `d_month` string, + `d_year` int, + `d_yearmonthnum` int, + `d_yearmonth` string, + `d_daynuminweek` int, + `d_daynuminmonth` int, + `d_daynuminyear` int, + `d_monthnuminyear` int, + `d_weeknuminyear` int, + `d_sellingseason` string, + `d_lastdayinweekfl` int, + `d_lastdayinmonthfl` int, + `d_holidayfl` int , + `d_weekdayfl`int, + primary key (`d_datekey`, `d_id`) disable rely) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dates_removal_n0 +PREHOOK: query: -- group by key has single primary key + EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey +PREHOOK: type: QUERY +POSTHOOK: query: -- group by key has single primary key + EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: customer_removal_n0 + filterExpr: (c_nation) IN ('USA', 'INDIA') (type: boolean) + Filter Operator + predicate: (c_nation) IN ('USA', 'INDIA') (type: boolean) + Select Operator + expressions: c_custkey (type: bigint) + outputColumnNames: _col0 + ListSink + +PREHOOK: query: -- mix of primary + non-primary keys + EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey, c_nation +PREHOOK: type: QUERY +POSTHOOK: query: -- mix of primary + non-primary keys + EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey, c_nation +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: customer_removal_n0 + filterExpr: (c_nation) IN ('USA', 'INDIA') (type: boolean) + Filter Operator + predicate: (c_nation) IN ('USA', 'INDIA') (type: boolean) + Select Operator + expressions: c_custkey (type: bigint) + outputColumnNames: _col0 + ListSink + +PREHOOK: query: -- multiple keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_id +PREHOOK: type: QUERY +POSTHOOK: query: -- multiple keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_id +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: dates_removal_n0 + filterExpr: (d_year) IN (1985, 2004) (type: boolean) + Filter Operator + predicate: (d_year) IN (1985, 2004) (type: boolean) + Select Operator + expressions: d_datekey (type: bigint) + outputColumnNames: _col0 + ListSink + +PREHOOK: query: -- multiple keys + non-keys + different order + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_datekey, d_sellingseason + order by d_datekey limit 10 +PREHOOK: 
type: QUERY +POSTHOOK: query: -- multiple keys + non-keys + different order + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_datekey, d_sellingseason + order by d_datekey limit 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dates_removal_n0 + filterExpr: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: d_datekey (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + Execution mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink + +PREHOOK: query: -- multiple keys in different order and mixed with non-keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey, + d_sellingseason order by d_datekey limit 10 +PREHOOK: type: QUERY +POSTHOOK: query: -- multiple keys in different order and mixed with non-keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey, + d_sellingseason order by d_datekey limit 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dates_removal_n0 + filterExpr: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: d_datekey (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + Execution 
mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink + +PREHOOK: query: -- same as above but with aggregate + EXPLAIN SELECT count(d_datekey) from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey, + d_sellingseason order by d_datekey limit 10 +PREHOOK: type: QUERY +POSTHOOK: query: -- same as above but with aggregate + EXPLAIN SELECT count(d_datekey) from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey, + d_sellingseason order by d_datekey limit 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dates_removal_n0 + filterExpr: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: d_datekey (type: bigint), d_id (type: bigint) + outputColumnNames: d_datekey, d_id + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Top N Key Operator + sort order: ++ + keys: d_datekey (type: bigint), d_id (type: bigint) + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + top n: 10 + Group By Operator + aggregations: count() + keys: d_datekey (type: bigint), d_id (type: bigint) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: bigint), _col1 (type: bigint) + sort order: ++ + Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint) + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col2 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: bigint), KEY._col1 (type: bigint) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: bigint) + sort 
order: + + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- join + insert into dates_removal_n0(d_datekey, d_id) values(3, 0) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@dates_removal_n0 +POSTHOOK: query: -- join + insert into dates_removal_n0(d_datekey, d_id) values(3, 0) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@dates_removal_n0 +POSTHOOK: Lineage: dates_removal_n0.d_date SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_datekey SCRIPT [] +POSTHOOK: Lineage: dates_removal_n0.d_daynuminmonth SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_daynuminweek SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_daynuminyear SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_dayofweek SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_holidayfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_id SCRIPT [] +POSTHOOK: Lineage: dates_removal_n0.d_lastdayinmonthfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_lastdayinweekfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_month SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_monthnuminyear SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_sellingseason SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_weekdayfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_weeknuminyear SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_year SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_yearmonth SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_yearmonthnum SIMPLE [] +PREHOOK: query: insert into dates_removal_n0(d_datekey, d_id) values(3, 1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@dates_removal_n0 +POSTHOOK: query: insert into dates_removal_n0(d_datekey, d_id) values(3, 1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@dates_removal_n0 +POSTHOOK: Lineage: dates_removal_n0.d_date SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_datekey SCRIPT [] +POSTHOOK: Lineage: dates_removal_n0.d_daynuminmonth SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_daynuminweek SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_daynuminyear SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_dayofweek SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_holidayfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_id SCRIPT [] +POSTHOOK: Lineage: dates_removal_n0.d_lastdayinmonthfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_lastdayinweekfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_month SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_monthnuminyear SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_sellingseason SIMPLE 
[] +POSTHOOK: Lineage: dates_removal_n0.d_weekdayfl SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_weeknuminyear SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_year SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_yearmonth SIMPLE [] +POSTHOOK: Lineage: dates_removal_n0.d_yearmonthnum SIMPLE [] +PREHOOK: query: insert into customer_removal_n0 (c_custkey) values(3) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@customer_removal_n0 +POSTHOOK: query: insert into customer_removal_n0 (c_custkey) values(3) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@customer_removal_n0 +POSTHOOK: Lineage: customer_removal_n0.c_address SIMPLE [] +POSTHOOK: Lineage: customer_removal_n0.c_city SIMPLE [] +POSTHOOK: Lineage: customer_removal_n0.c_custkey SCRIPT [] +POSTHOOK: Lineage: customer_removal_n0.c_mktsegment SIMPLE [] +POSTHOOK: Lineage: customer_removal_n0.c_name SIMPLE [] +POSTHOOK: Lineage: customer_removal_n0.c_nation SIMPLE [] +POSTHOOK: Lineage: customer_removal_n0.c_phone SIMPLE [] +POSTHOOK: Lineage: customer_removal_n0.c_region SIMPLE [] +PREHOOK: query: EXPLAIN SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dates_removal_n0 + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_datekey (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: customer_removal_n0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: c_custkey (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_removal_n0 +PREHOOK: Input: default@dates_removal_n0 +#### A masked pattern was here #### +POSTHOOK: query: SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_removal_n0 +POSTHOOK: Input: default@dates_removal_n0 +#### A masked pattern was here #### +3 +3 +PREHOOK: query: -- group by keys are not primary keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_sellingseason + order by d_datekey limit 10 +PREHOOK: type: QUERY +POSTHOOK: query: -- group by keys are not primary keys + EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_sellingseason + order by d_datekey limit 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dates_removal_n0 + filterExpr: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 2 Data size: 104 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (d_year) IN (1985, 2004) (type: boolean) + Statistics: Num rows: 1 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_datekey (type: bigint), d_sellingseason (type: string) + outputColumnNames: d_datekey, d_sellingseason + Statistics: Num rows: 1 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + Top N Key Operator + sort order: ++ + keys: d_datekey (type: bigint), d_sellingseason (type: string) + Statistics: Num rows: 1 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + top n: 10 + Group By Operator + keys: d_datekey (type: bigint), d_sellingseason (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: bigint), _col1 (type: string) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Execution mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: bigint), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 
(type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 10 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink + +PREHOOK: query: -- negative + -- with aggregate function + EXPLAIN SELECT count(c_custkey) from customer_removal_n0 where c_nation IN ('USA', 'INDIA') + group by c_custkey, c_nation +PREHOOK: type: QUERY +POSTHOOK: query: -- negative + -- with aggregate function + EXPLAIN SELECT count(c_custkey) from customer_removal_n0 where c_nation IN ('USA', 'INDIA') + group by c_custkey, c_nation +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: customer_removal_n0 + filterExpr: (c_nation) IN ('USA', 'INDIA') (type: boolean) + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (c_nation) IN ('USA', 'INDIA') (type: boolean) + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: c_custkey (type: bigint) + outputColumnNames: c_custkey + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + keys: c_custkey (type: bigint) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: bigint) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: DROP TABLE customer_removal_n0 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@customer_removal_n0 +PREHOOK: Output: default@customer_removal_n0 +POSTHOOK: query: DROP TABLE customer_removal_n0 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: 
default@customer_removal_n0 +POSTHOOK: Output: default@customer_removal_n0 +PREHOOK: query: DROP TABLE dates_removal_n0 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@dates_removal_n0 +PREHOOK: Output: default@dates_removal_n0 +POSTHOOK: query: DROP TABLE dates_removal_n0 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@dates_removal_n0 +POSTHOOK: Output: default@dates_removal_n0 +PREHOOK: query: -- group by reduction optimization + create table dest_g21 (key1 int, value1 double, primary key(key1) disable rely) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dest_g21 +POSTHOOK: query: -- group by reduction optimization + create table dest_g21 (key1 int, value1 double, primary key(key1) disable rely) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dest_g21 +PREHOOK: query: insert into dest_g21 values(1, 2), (2,2), (3, 1), (4,4), (5, null), (6, null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@dest_g21 +POSTHOOK: query: insert into dest_g21 values(1, 2), (2,2), (3, 1), (4,4), (5, null), (6, null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@dest_g21 +POSTHOOK: Lineage: dest_g21.key1 SCRIPT [] +POSTHOOK: Lineage: dest_g21.value1 SCRIPT [] +PREHOOK: query: -- value1 will removed because it is unused, then whole group by will be removed because key1 is unique + explain select key1 from dest_g21 group by key1, value1 +PREHOOK: type: QUERY +POSTHOOK: query: -- value1 will removed because it is unused, then whole group by will be removed because key1 is unique + explain select key1 from dest_g21 group by key1, value1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: dest_g21 + Select Operator + expressions: key1 (type: int) + outputColumnNames: _col0 + ListSink + +PREHOOK: query: select key1 from dest_g21 group by key1, value1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_g21 +#### A masked pattern was here #### +POSTHOOK: query: select key1 from dest_g21 group by key1, value1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_g21 +#### A masked pattern was here #### +1 +2 +3 +4 +5 +6 +PREHOOK: query: -- same query but with filter + explain select key1 from dest_g21 where value1 > 1 group by key1, value1 +PREHOOK: type: QUERY +POSTHOOK: query: -- same query but with filter + explain select key1 from dest_g21 where value1 > 1 group by key1, value1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: dest_g21 + filterExpr: (value1 > 1.0D) (type: boolean) + Filter Operator + predicate: (value1 > 1.0D) (type: boolean) + Select Operator + expressions: key1 (type: int) + outputColumnNames: _col0 + ListSink + +PREHOOK: query: select key1 from dest_g21 where value1 > 1 group by key1, value1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_g21 +#### A masked pattern was here #### +POSTHOOK: query: select key1 from dest_g21 where value1 > 1 group by key1, value1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_g21 +#### A masked pattern was here #### +1 +2 +4 +PREHOOK: query: -- only value1 will be removed because there is aggregate call + explain select count(key1) from dest_g21 group by key1, value1 +PREHOOK: type: QUERY +POSTHOOK: query: -- only value1 will be 
removed because there is aggregate call + explain select count(key1) from dest_g21 group by key1, value1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dest_g21 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key1 (type: int) + outputColumnNames: key1 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + keys: key1 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: no inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(key1) from dest_g21 group by key1, value1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest_g21 +#### A masked pattern was here #### +POSTHOOK: query: select count(key1) from dest_g21 group by key1, value1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest_g21 +#### A masked pattern was here #### +1 +1 +1 +1 +1 +1 +PREHOOK: query: explain select count(key1) from dest_g21 where value1 > 1 group by key1, value1 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(key1) from dest_g21 where value1 > 1 group by key1, value1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dest_g21 + filterExpr: (value1 > 1.0D) (type: boolean) + Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (value1 > 1.0D) (type: boolean) + Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key1 (type: int) + outputColumnNames: key1 + Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + keys: key1 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num 
+PREHOOK: query: explain select count(key1) from dest_g21 where value1 > 1 group by key1, value1
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(key1) from dest_g21 where value1 > 1 group by key1, value1
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+          Map Operator Tree:
+            TableScan
+              alias: dest_g21
+              filterExpr: (value1 > 1.0D) (type: boolean)
+              Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+              Filter Operator
+                predicate: (value1 > 1.0D) (type: boolean)
+                Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: key1 (type: int)
+                  outputColumnNames: key1
+                  Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    aggregations: count()
+                    keys: key1 (type: int)
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: int)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: int)
+                      Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: _col1 (type: bigint)
+          Execution mode: vectorized, llap
+          LLAP IO: no inputs
+        Reducer 2
+          Execution mode: vectorized, llap
+          Reduce Operator Tree:
+            Group By Operator
+              aggregations: count(VALUE._col0)
+              keys: KEY._col0 (type: int)
+              mode: mergepartial
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: _col1 (type: bigint)
+                outputColumnNames: _col0
+                Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(key1) from dest_g21 where value1 > 1 group by key1, value1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+POSTHOOK: query: select count(key1) from dest_g21 where value1 > 1 group by key1, value1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+1
+1
+1
+PREHOOK: query: -- t1.key is unique even after join therefore group by = group by (t1.key)
+ explain select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- t1.key is unique even after join therefore group by = group by (t1.key)
+ explain select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+          Map Operator Tree:
+            TableScan
+              alias: t1
+              Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: key1 (type: int)
+                outputColumnNames: _col0
+                Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: int)
+                  Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+          Execution mode: vectorized, llap
+          LLAP IO: no inputs
+        Map 3
+          Map Operator Tree:
+            TableScan
+              alias: t2
+              filterExpr: (value1 > 2.0D) (type: boolean)
+              Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+              Filter Operator
+                predicate: (value1 > 2.0D) (type: boolean)
+                Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: key1 (type: int)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int)
+                    sort order: +
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+          Execution mode: vectorized, llap
+          LLAP IO: no inputs
+        Reducer 2
+          Execution mode: llap
+          Reduce Operator Tree:
+            Merge Join Operator
+              condition map:
+                Inner Join 0 to 1
+              keys:
+                0 _col0 (type: int)
+                1 _col0 (type: int)
+              outputColumnNames: _col0
+              Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+4
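The join plan above contains no Group By Operator at all: the inner equi-join on t1.key1 = t2.key1 preserves the uniqueness of t1.key1 in the join output, so the group-by is again redundant. An illustrative hand-written equivalent (sketch only, not Hive output):

  -- sketch: uniqueness of key1 survives the inner equi-join, so no aggregation is needed
  select t1.key1
  from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1
  where t2.value1 > 2;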
+PREHOOK: query: explain select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+          Map Operator Tree:
+            TableScan
+              alias: t1
+              Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: key1 (type: int)
+                outputColumnNames: _col0
+                Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: int)
+                  Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+          Execution mode: vectorized, llap
+          LLAP IO: no inputs
+        Map 4
+          Map Operator Tree:
+            TableScan
+              alias: t2
+              filterExpr: (value1 > 2.0D) (type: boolean)
+              Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+              Filter Operator
+                predicate: (value1 > 2.0D) (type: boolean)
+                Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: key1 (type: int)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int)
+                    sort order: +
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+          Execution mode: vectorized, llap
+          LLAP IO: no inputs
+        Reducer 2
+          Execution mode: llap
+          Reduce Operator Tree:
+            Merge Join Operator
+              condition map:
+                Inner Join 0 to 1
+              keys:
+                0 _col0 (type: int)
+                1 _col0 (type: int)
+              outputColumnNames: _col0
+              Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+              Group By Operator
+                aggregations: count()
+                keys: _col0 (type: int)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: int)
+                  Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col1 (type: bigint)
+        Reducer 3
+          Execution mode: vectorized, llap
+          Reduce Operator Tree:
+            Group By Operator
+              aggregations: count(VALUE._col0)
+              keys: KEY._col0 (type: int)
+              mode: mergepartial
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: _col1 (type: bigint)
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+POSTHOOK: query: select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+1
+PREHOOK: query: -- both aggregate and one of the key1 should be removed
+ explain select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub
+PREHOOK: type: QUERY
+POSTHOOK: query: -- both aggregate and one of the key1 should be removed
+ explain select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        TableScan
+          alias: dest_g21
+          filterExpr: (value1 < 4.5D) (type: boolean)
+          Filter Operator
+            predicate: (value1 < 4.5D) (type: boolean)
+            Select Operator
+              expressions: key1 (type: int)
+              outputColumnNames: _col0
+              ListSink
+
+PREHOOK: query: select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+POSTHOOK: query: select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+1
+2
+3
+4
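In the subquery case just above, the outer query reads only key1, so count(key1) is dead output and is pruned by the field trimmer together with value1 from the group key; with key1 unique, the whole aggregate then disappears and the plan reduces to scan plus filter. An illustrative equivalent (sketch, not Hive output):

  -- sketch: what the Fetch-only plan above computes
  select key1 from dest_g21 where value1 < 4.5;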
+PREHOOK: query: -- one of the aggregate will be removed and one of the key1 will be removed
+ explain select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub
+PREHOOK: type: QUERY
+POSTHOOK: query: -- one of the aggregate will be removed and one of the key1 will be removed
+ explain select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+          Map Operator Tree:
+            TableScan
+              alias: dest_g21
+              filterExpr: (value1 < 4.5D) (type: boolean)
+              Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+              Filter Operator
+                predicate: (value1 < 4.5D) (type: boolean)
+                Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: key1 (type: int)
+                  outputColumnNames: key1
+                  Statistics: Num rows: 6 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    aggregations: sum(key1)
+                    keys: key1 (type: int)
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: int)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: int)
+                      Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: _col1 (type: bigint)
+          Execution mode: vectorized, llap
+          LLAP IO: no inputs
+        Reducer 2
+          Execution mode: vectorized, llap
+          Reduce Operator Tree:
+            Group By Operator
+              aggregations: sum(VALUE._col0)
+              keys: KEY._col0 (type: int)
+              mode: mergepartial
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE Column stats: COMPLETE
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+POSTHOOK: query: select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest_g21
+#### A masked pattern was here ####
+1 1
+3 3
+4 4
+2 2
+PREHOOK: query: DROP table dest_g21
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@dest_g21
+PREHOOK: Output: default@dest_g21
+POSTHOOK: query: DROP table dest_g21
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@dest_g21
+POSTHOOK: Output: default@dest_g21
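Worth noting for reviewers: every rewrite in this golden file depends on the constraint being declared RELY. Presumably, with a primary key that is only DISABLE NOVALIDATE and not RELY, the optimizer would not trust uniqueness and the group-by would be kept as written. A hypothetical probe (the table name dest_g22 is illustrative and not part of this test):

  create table dest_g22 (key1 int, value1 double, primary key(key1) disable novalidate);
  explain select key1 from dest_g22 group by key1, value1;  -- expect the Group By Operator to remain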