diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlVarianceAggFunction.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlVarianceAggFunction.java new file mode 100644 index 0000000000..9298e51ebf --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlVarianceAggFunction.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.functions; + +import org.apache.calcite.sql.SqlAggFunction; +import org.apache.calcite.sql.SqlFunctionCategory; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.type.SqlOperandTypeChecker; +import org.apache.calcite.sql.type.SqlOperandTypeInference; +import org.apache.calcite.sql.type.SqlReturnTypeInference; + +/** + * Aggregation function to represent: stddev_pop, stddev_samp, var_pop, var_samp. + */ +public class HiveSqlVarianceAggFunction extends SqlAggFunction { + + public HiveSqlVarianceAggFunction(String name, SqlKind kind, SqlReturnTypeInference returnTypeInference, + SqlOperandTypeInference operandTypeInference, SqlOperandTypeChecker operandTypeChecker) { + super(name, null, kind, returnTypeInference, operandTypeInference, + operandTypeChecker, SqlFunctionCategory.NUMERIC, false, false); + assert kind == SqlKind.STDDEV_POP || kind == SqlKind.STDDEV_SAMP || + kind == SqlKind.VAR_POP || kind == SqlKind.VAR_SAMP; + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateReduceFunctionsRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateReduceFunctionsRule.java new file mode 100644 index 0000000000..e00317f2ac --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateReduceFunctionsRule.java @@ -0,0 +1,615 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import org.apache.calcite.plan.RelOptCluster; +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Aggregate; +import org.apache.calcite.rel.core.AggregateCall; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rel.type.RelDataTypeFactory; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexLiteral; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlAggFunction; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.fun.SqlStdOperatorTable; +import org.apache.calcite.sql.type.SqlTypeUtil; +import org.apache.calcite.tools.RelBuilder; +import org.apache.calcite.util.CompositeList; +import org.apache.calcite.util.ImmutableIntList; +import org.apache.calcite.util.Util; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories; +import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlCountAggFunction; +import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlSumAggFunction; +import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlSumEmptyIsZeroAggFunction; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * This rule is a copy of {@link org.apache.calcite.rel.rules.AggregateReduceFunctionsRule} + * that regenerates Hive specific aggregate operators. + * + * TODO: When CALCITE-2216 is completed, we should be able to remove much of this code and + * just override the relevant methods. + * + * Planner rule that reduces aggregate functions in + * {@link org.apache.calcite.rel.core.Aggregate}s to simpler forms. + * + *

+ * <p>Rewrites:
+ * <ul>
+ * <li>AVG(x) &rarr; SUM(x) / COUNT(x)
+ * <li>SUM(x) &rarr; CASE COUNT(x) WHEN 0 THEN NULL ELSE SUM0(x) END
+ * <li>STDDEV_POP(x) &rarr; SQRT((SUM(x * x) - SUM(x) * SUM(x) / COUNT(x))
+ *     / COUNT(x))
+ * <li>STDDEV_SAMP(x) &rarr; SQRT((SUM(x * x) - SUM(x) * SUM(x) / COUNT(x))
+ *     / CASE COUNT(x) WHEN 1 THEN NULL ELSE COUNT(x) - 1 END)
+ * <li>VAR_POP(x) &rarr; (SUM(x * x) - SUM(x) * SUM(x) / COUNT(x)) / COUNT(x)
+ * <li>VAR_SAMP(x) &rarr; (SUM(x * x) - SUM(x) * SUM(x) / COUNT(x))
+ *     / CASE COUNT(x) WHEN 1 THEN NULL ELSE COUNT(x) - 1 END
+ * </ul>
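+ *
+ * <p>Illustrative example (the query itself is hypothetical; table
+ * {@code t(id, px, y, x)} is the one created by the tests added in this
+ * patch): with this rule enabled,
+ *
+ * <blockquote><pre>SELECT px, VAR_POP(x) FROM t GROUP BY px</pre></blockquote>
+ *
+ * is planned as if it had been written
+ *
+ * <blockquote><pre>
+ * SELECT px, (SUM(x * x) - SUM(x) * SUM(x) / COUNT(x)) / COUNT(x)
+ * FROM t GROUP BY px
+ * </pre></blockquote>
+ *
+ * so that only the simpler aggregates (COUNT, SUM, SUM0) remain in the plan.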

+ */
+public class HiveAggregateReduceFunctionsRule extends RelOptRule {
+  //~ Static fields/initializers ---------------------------------------------
+
+  /** The singleton. */
+  public static final HiveAggregateReduceFunctionsRule INSTANCE =
+      new HiveAggregateReduceFunctionsRule();
+
+  //~ Constructors -----------------------------------------------------------
+
+  /** Creates a HiveAggregateReduceFunctionsRule. */
+  public HiveAggregateReduceFunctionsRule() {
+    super(operand(HiveAggregate.class, any()),
+        HiveRelFactories.HIVE_BUILDER, null);
+  }
+
+  //~ Methods ----------------------------------------------------------------
+
+  @Override public boolean matches(RelOptRuleCall call) {
+    if (!super.matches(call)) {
+      return false;
+    }
+    Aggregate oldAggRel = (Aggregate) call.rels[0];
+    return containsAvgStddevVarCall(oldAggRel.getAggCallList());
+  }
+
+  @Override public void onMatch(RelOptRuleCall ruleCall) {
+    Aggregate oldAggRel = (Aggregate) ruleCall.rels[0];
+    reduceAggs(ruleCall, oldAggRel);
+  }
+
+  /**
+   * Returns whether any of the aggregates are calls to AVG, STDDEV_*, VAR_*.
+   *
+   * @param aggCallList List of aggregate calls
+   */
+  private boolean containsAvgStddevVarCall(List<AggregateCall> aggCallList) {
+    for (AggregateCall call : aggCallList) {
+      if (isReducible(call.getAggregation().getKind())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Returns whether the aggregate call is a reducible function.
+   */
+  private boolean isReducible(final SqlKind kind) {
+    if (SqlKind.AVG_AGG_FUNCTIONS.contains(kind)) {
+      return true;
+    }
+    switch (kind) {
+    case SUM:
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Reduces all calls to AVG, STDDEV_POP, STDDEV_SAMP, VAR_POP, VAR_SAMP in
+   * the aggregates list to simpler forms.
+   *
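+   * <p>For example, STDDEV_POP(x) and VAR_SAMP(x) over the same column both
+   * need SUM(x * x), SUM(x) and COUNT(x); {@link #lookupOrAdd} adds the
+   * shared {@code x * x} input expression to the input project only once.
+   *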

+   * <p>It handles newly generated common subexpressions since this was done
+   * at the sql2rel stage.
+   */
+  private void reduceAggs(
+      RelOptRuleCall ruleCall,
+      Aggregate oldAggRel) {
+    RexBuilder rexBuilder = oldAggRel.getCluster().getRexBuilder();
+
+    List<AggregateCall> oldCalls = oldAggRel.getAggCallList();
+    final int groupCount = oldAggRel.getGroupCount();
+    final int indicatorCount = oldAggRel.getIndicatorCount();
+
+    final List<AggregateCall> newCalls = Lists.newArrayList();
+    final Map<AggregateCall, RexNode> aggCallMapping = Maps.newHashMap();
+
+    final List<RexNode> projList = Lists.newArrayList();
+
+    // pass through group key (+ indicators if present)
+    for (int i = 0; i < groupCount + indicatorCount; ++i) {
+      projList.add(
+          rexBuilder.makeInputRef(
+              getFieldType(oldAggRel, i),
+              i));
+    }
+
+    // List of input expressions. If a particular aggregate needs more, it
+    // will add an expression to the end, and we will create an extra
+    // project.
+    final RelBuilder relBuilder = ruleCall.builder();
+    relBuilder.push(oldAggRel.getInput());
+    final List<RexNode> inputExprs = new ArrayList<>(relBuilder.fields());
+
+    // create new agg function calls and rest of project list together
+    for (AggregateCall oldCall : oldCalls) {
+      projList.add(
+          reduceAgg(
+              oldAggRel, oldCall, newCalls, aggCallMapping, inputExprs));
+    }
+
+    final int extraArgCount =
+        inputExprs.size() - relBuilder.peek().getRowType().getFieldCount();
+    if (extraArgCount > 0) {
+      relBuilder.project(inputExprs,
+          CompositeList.of(
+              relBuilder.peek().getRowType().getFieldNames(),
+              Collections.nCopies(extraArgCount, null)));
+    }
+    newAggregateRel(relBuilder, oldAggRel, newCalls);
+    relBuilder.project(projList, oldAggRel.getRowType().getFieldNames());
+    ruleCall.transformTo(relBuilder.build());
+  }
+
+  private RexNode reduceAgg(
+      Aggregate oldAggRel,
+      AggregateCall oldCall,
+      List<AggregateCall> newCalls,
+      Map<AggregateCall, RexNode> aggCallMapping,
+      List<RexNode> inputExprs) {
+    final SqlKind kind = oldCall.getAggregation().getKind();
+    if (isReducible(kind)) {
+      switch (kind) {
+      case SUM:
+        // replace original SUM(x) with
+        // case COUNT(x) when 0 then null else SUM0(x) end
+        return reduceSum(oldAggRel, oldCall, newCalls, aggCallMapping);
+      case AVG:
+        // replace original AVG(x) with SUM(x) / COUNT(x)
+        return reduceAvg(oldAggRel, oldCall, newCalls, aggCallMapping, inputExprs);
+      case STDDEV_POP:
+        // replace original STDDEV_POP(x) with
+        //   SQRT(
+        //     (SUM(x * x) - SUM(x) * SUM(x) / COUNT(x))
+        //     / COUNT(x))
+        return reduceStddev(oldAggRel, oldCall, true, true, newCalls,
+            aggCallMapping, inputExprs);
+      case STDDEV_SAMP:
+        // replace original STDDEV_SAMP(x) with
+        //   SQRT(
+        //     (SUM(x * x) - SUM(x) * SUM(x) / COUNT(x))
+        //     / CASE COUNT(x) WHEN 1 THEN NULL ELSE COUNT(x) - 1 END)
+        return reduceStddev(oldAggRel, oldCall, false, true, newCalls,
+            aggCallMapping, inputExprs);
+      case VAR_POP:
+        // replace original VAR_POP(x) with
+        //   (SUM(x * x) - SUM(x) * SUM(x) / COUNT(x))
+        //   / COUNT(x)
+        return reduceStddev(oldAggRel, oldCall, true, false, newCalls,
+            aggCallMapping, inputExprs);
+      case VAR_SAMP:
+        // replace original VAR_SAMP(x) with
+        //   (SUM(x * x) - SUM(x) * SUM(x) / COUNT(x))
+        //   / CASE COUNT(x) WHEN 1 THEN NULL ELSE COUNT(x) - 1 END
+        return reduceStddev(oldAggRel, oldCall, false, false, newCalls,
+            aggCallMapping, inputExprs);
+      default:
+        throw Util.unexpected(kind);
+      }
+    } else {
+      // anything else: preserve original call
+      RexBuilder rexBuilder = oldAggRel.getCluster().getRexBuilder();
+      final int nGroups = oldAggRel.getGroupCount();
+      List<RelDataType> oldArgTypes =
+          SqlTypeUtil.projectTypes(
+              oldAggRel.getInput().getRowType(),
+              oldCall.getArgList());
+      return rexBuilder.addAggCall(oldCall,
+          nGroups,
+          oldAggRel.indicator,
+          newCalls,
+          aggCallMapping,
+          oldArgTypes);
+    }
+  }
+
+  private AggregateCall createAggregateCallWithBinding(
+      RelDataTypeFactory typeFactory,
+      SqlAggFunction aggFunction,
+      RelDataType operandType,
+      Aggregate oldAggRel,
+      AggregateCall oldCall,
+      int argOrdinal) {
+    final Aggregate.AggCallBinding binding =
+        new Aggregate.AggCallBinding(typeFactory, aggFunction,
+            ImmutableList.of(operandType), oldAggRel.getGroupCount(),
+            oldCall.filterArg >= 0);
+    return AggregateCall.create(aggFunction,
+        oldCall.isDistinct(),
+        oldCall.isApproximate(),
+        ImmutableIntList.of(argOrdinal),
+        oldCall.filterArg,
+        aggFunction.inferReturnType(binding),
+        null);
+  }
+
+  private RexNode reduceAvg(
+      Aggregate oldAggRel,
+      AggregateCall oldCall,
+      List<AggregateCall> newCalls,
+      Map<AggregateCall, RexNode> aggCallMapping,
+      List<RexNode> inputExprs) {
+    final int nGroups = oldAggRel.getGroupCount();
+    final RexBuilder rexBuilder = oldAggRel.getCluster().getRexBuilder();
+    final int iAvgInput = oldCall.getArgList().get(0);
+    final RelDataType avgInputType =
+        getFieldType(
+            oldAggRel.getInput(),
+            iAvgInput);
+    final AggregateCall sumCall =
+        AggregateCall.create(
+            new HiveSqlSumAggFunction(
+                oldCall.isDistinct(),
+                oldCall.getAggregation().getReturnTypeInference(),
+                oldCall.getAggregation().getOperandTypeInference(),
+                oldCall.getAggregation().getOperandTypeChecker()), //SqlStdOperatorTable.SUM,
+            oldCall.isDistinct(),
+            oldCall.isApproximate(),
+            oldCall.getArgList(),
+            oldCall.filterArg,
+            oldAggRel.getGroupCount(),
+            oldAggRel.getInput(),
+            null,
+            null);
+    final AggregateCall countCall =
+        AggregateCall.create(
+            new HiveSqlCountAggFunction(
+                oldCall.isDistinct(),
+                oldCall.getAggregation().getReturnTypeInference(),
+                oldCall.getAggregation().getOperandTypeInference(),
+                oldCall.getAggregation().getOperandTypeChecker()), //SqlStdOperatorTable.COUNT,
+            oldCall.isDistinct(),
+            oldCall.isApproximate(),
+            oldCall.getArgList(),
+            oldCall.filterArg,
+            oldAggRel.getGroupCount(),
+            oldAggRel.getInput(),
+            null,
+            null);
+
+    // NOTE: these references are with respect to the output
+    // of newAggRel
+    RexNode numeratorRef =
+        rexBuilder.addAggCall(sumCall,
+            nGroups,
+            oldAggRel.indicator,
+            newCalls,
+            aggCallMapping,
+            ImmutableList.of(avgInputType));
+    final RexNode denominatorRef =
+        rexBuilder.addAggCall(countCall,
+            nGroups,
+            oldAggRel.indicator,
+            newCalls,
+            aggCallMapping,
+            ImmutableList.of(avgInputType));
+
+    final RelDataTypeFactory typeFactory = oldAggRel.getCluster().getTypeFactory();
+    final RelDataType avgType = typeFactory.createTypeWithNullability(
+        oldCall.getType(), numeratorRef.getType().isNullable());
+    numeratorRef = rexBuilder.ensureType(avgType, numeratorRef, true);
+    final RexNode divideRef =
+        rexBuilder.makeCall(SqlStdOperatorTable.DIVIDE, numeratorRef, denominatorRef);
+    return rexBuilder.makeCast(oldCall.getType(), divideRef);
+  }
+
+  private RexNode reduceSum(
+      Aggregate oldAggRel,
+      AggregateCall oldCall,
+      List<AggregateCall> newCalls,
+      Map<AggregateCall, RexNode> aggCallMapping) {
+    final int nGroups = oldAggRel.getGroupCount();
+    RexBuilder rexBuilder = oldAggRel.getCluster().getRexBuilder();
+    int arg = oldCall.getArgList().get(0);
+    RelDataType argType =
+        getFieldType(
+            oldAggRel.getInput(),
+            arg);
+    final AggregateCall sumZeroCall =
+        AggregateCall.create(
+            new HiveSqlSumEmptyIsZeroAggFunction(
+                oldCall.isDistinct(),
+                oldCall.getAggregation().getReturnTypeInference(),
+                oldCall.getAggregation().getOperandTypeInference(),
+                oldCall.getAggregation().getOperandTypeChecker()), //SqlStdOperatorTable.SUM0,
+            oldCall.isDistinct(),
+            oldCall.isApproximate(),
+            oldCall.getArgList(),
+            oldCall.filterArg,
+            oldAggRel.getGroupCount(),
+            oldAggRel.getInput(),
+            null,
+            oldCall.name);
+    final AggregateCall countCall =
+        AggregateCall.create(
+            new HiveSqlCountAggFunction(
+                oldCall.isDistinct(),
+                oldCall.getAggregation().getReturnTypeInference(),
+                oldCall.getAggregation().getOperandTypeInference(),
+                oldCall.getAggregation().getOperandTypeChecker()), //SqlStdOperatorTable.COUNT,
+            oldCall.isDistinct(),
+            oldCall.isApproximate(),
+            oldCall.getArgList(),
+            oldCall.filterArg,
+            oldAggRel.getGroupCount(),
+            oldAggRel,
+            null,
+            null);
+
+    // NOTE: these references are with respect to the output
+    // of newAggRel
+    RexNode sumZeroRef =
+        rexBuilder.addAggCall(sumZeroCall,
+            nGroups,
+            oldAggRel.indicator,
+            newCalls,
+            aggCallMapping,
+            ImmutableList.of(argType));
+    if (!oldCall.getType().isNullable()) {
+      // If SUM(x) is not nullable, the validator must have determined that
+      // nulls are impossible (because the group is never empty and x is never
+      // null). Therefore we translate to SUM0(x).
+      return sumZeroRef;
+    }
+    RexNode countRef =
+        rexBuilder.addAggCall(countCall,
+            nGroups,
+            oldAggRel.indicator,
+            newCalls,
+            aggCallMapping,
+            ImmutableList.of(argType));
+    return rexBuilder.makeCall(SqlStdOperatorTable.CASE,
+        rexBuilder.makeCall(SqlStdOperatorTable.EQUALS,
+            countRef, rexBuilder.makeExactLiteral(BigDecimal.ZERO)),
+        rexBuilder.makeCast(sumZeroRef.getType(), rexBuilder.constantNull()),
+        sumZeroRef);
+  }
+
+  private RexNode reduceStddev(
+      Aggregate oldAggRel,
+      AggregateCall oldCall,
+      boolean biased,
+      boolean sqrt,
+      List<AggregateCall> newCalls,
+      Map<AggregateCall, RexNode> aggCallMapping,
+      List<RexNode> inputExprs) {
+    // stddev_pop(x) ==>
+    //   power(
+    //     (sum(x * x) - sum(x) * sum(x) / count(x))
+    //     / count(x),
+    //     .5)
+    //
+    // stddev_samp(x) ==>
+    //   power(
+    //     (sum(x * x) - sum(x) * sum(x) / count(x))
+    //     / nullif(count(x) - 1, 0),
+    //     .5)
+    final int nGroups = oldAggRel.getGroupCount();
+    final RelOptCluster cluster = oldAggRel.getCluster();
+    final RexBuilder rexBuilder = cluster.getRexBuilder();
+    final RelDataTypeFactory typeFactory = cluster.getTypeFactory();
+
+    assert oldCall.getArgList().size() == 1 : oldCall.getArgList();
+    final int argOrdinal = oldCall.getArgList().get(0);
+    final RelDataType argOrdinalType = getFieldType(oldAggRel.getInput(), argOrdinal);
+    final RelDataType oldCallType =
+        typeFactory.createTypeWithNullability(oldCall.getType(),
+            argOrdinalType.isNullable());
+
+    final RexNode argRef =
+        rexBuilder.ensureType(oldCallType, inputExprs.get(argOrdinal), true);
+    final int argRefOrdinal = lookupOrAdd(inputExprs, argRef);
+
+    final RexNode argSquared = rexBuilder.makeCall(SqlStdOperatorTable.MULTIPLY,
+        argRef, argRef);
+    final int argSquaredOrdinal = lookupOrAdd(inputExprs, argSquared);
+
+    final AggregateCall sumArgSquaredAggCall =
+        createAggregateCallWithBinding(typeFactory,
+            new HiveSqlSumAggFunction(
+                oldCall.isDistinct(),
+                oldCall.getAggregation().getReturnTypeInference(),
+                oldCall.getAggregation().getOperandTypeInference(),
+                oldCall.getAggregation().getOperandTypeChecker()), //SqlStdOperatorTable.SUM,
+            argSquared.getType(), oldAggRel, oldCall, argSquaredOrdinal);
+
+    final RexNode sumArgSquared =
+        rexBuilder.addAggCall(sumArgSquaredAggCall,
+            nGroups,
+            oldAggRel.indicator,
+            newCalls,
+            aggCallMapping,
+            ImmutableList.of(sumArgSquaredAggCall.getType()));
+
+    final AggregateCall sumArgAggCall =
+        AggregateCall.create(
+            new HiveSqlSumAggFunction(
+                oldCall.isDistinct(),
+                oldCall.getAggregation().getReturnTypeInference(),
+                oldCall.getAggregation().getOperandTypeInference(),
+                oldCall.getAggregation().getOperandTypeChecker()), //SqlStdOperatorTable.SUM,
+            oldCall.isDistinct(),
+            oldCall.isApproximate(),
+            ImmutableIntList.of(argOrdinal),
+            oldCall.filterArg,
+            oldAggRel.getGroupCount(),
+            oldAggRel.getInput(),
+            null,
+            null);
+
+    final RexNode sumArg =
+        rexBuilder.addAggCall(sumArgAggCall,
+            nGroups,
+            oldAggRel.indicator,
+            newCalls,
+            aggCallMapping,
+            ImmutableList.of(sumArgAggCall.getType()));
+    final RexNode sumArgCast = rexBuilder.ensureType(oldCallType, sumArg, true);
+    final RexNode sumSquaredArg =
+        rexBuilder.makeCall(
+            SqlStdOperatorTable.MULTIPLY, sumArgCast, sumArgCast);
+
+    final AggregateCall countArgAggCall =
+        AggregateCall.create(
+            new HiveSqlCountAggFunction(
+                oldCall.isDistinct(),
+                oldCall.getAggregation().getReturnTypeInference(),
+                oldCall.getAggregation().getOperandTypeInference(),
+                oldCall.getAggregation().getOperandTypeChecker()), //SqlStdOperatorTable.COUNT,
+            oldCall.isDistinct(),
+            oldCall.isApproximate(),
+            oldCall.getArgList(),
+            oldCall.filterArg,
+            oldAggRel.getGroupCount(),
+            oldAggRel,
+            null,
+            null);
+
+    final RexNode countArg =
+        rexBuilder.addAggCall(countArgAggCall,
+            nGroups,
+            oldAggRel.indicator,
+            newCalls,
+            aggCallMapping,
+            ImmutableList.of(argOrdinalType));
+
+    final RexNode avgSumSquaredArg =
+        rexBuilder.makeCall(
+            SqlStdOperatorTable.DIVIDE, sumSquaredArg, countArg);
+
+    final RexNode diff =
+        rexBuilder.makeCall(
+            SqlStdOperatorTable.MINUS,
+            sumArgSquared, avgSumSquaredArg);
+
+    final RexNode denominator;
+    if (biased) {
+      denominator = countArg;
+    } else {
+      final RexLiteral one =
+          rexBuilder.makeExactLiteral(BigDecimal.ONE);
+      final RexNode nul =
+          rexBuilder.makeCast(countArg.getType(), rexBuilder.constantNull());
+      final RexNode countMinusOne =
+          rexBuilder.makeCall(
+              SqlStdOperatorTable.MINUS, countArg, one);
+      final RexNode countEqOne =
+          rexBuilder.makeCall(
+              SqlStdOperatorTable.EQUALS, countArg, one);
+      denominator =
+          rexBuilder.makeCall(
+              SqlStdOperatorTable.CASE,
+              countEqOne, nul, countMinusOne);
+    }
+
+    final RexNode div =
+        rexBuilder.makeCall(
+            SqlStdOperatorTable.DIVIDE, diff, denominator);
+
+    RexNode result = div;
+    if (sqrt) {
+      final RexNode half =
+          rexBuilder.makeExactLiteral(new BigDecimal("0.5"));
+      result =
+          rexBuilder.makeCall(
+              SqlStdOperatorTable.POWER, div, half);
+    }
+
+    return rexBuilder.makeCast(
+        oldCall.getType(), result);
+  }
+
+  /**
+   * Finds the ordinal of an element in a list, or adds it.
+   *
+   * @param list List
+   * @param element Element to lookup or add
+   * @param <T> Element type
+   * @return Ordinal of element in list
+   */
+  private static <T> int lookupOrAdd(List<T> list, T element) {
+    int ordinal = list.indexOf(element);
+    if (ordinal == -1) {
+      ordinal = list.size();
+      list.add(element);
+    }
+    return ordinal;
+  }
+
+  /**
+   * Does a shallow clone of oldAggRel and updates aggCalls. Could be
+   * refactored into Aggregate and subclasses - but it's only needed for some
+   * subclasses.
+   *
+   * @param relBuilder Builder of relational expressions; at the top of its
+   *                   stack is its input
+   * @param oldAggregate LogicalAggregate to clone.
+   * @param newCalls New list of AggregateCalls
+   */
+  protected void newAggregateRel(RelBuilder relBuilder,
+      Aggregate oldAggregate, List<AggregateCall> newCalls) {
+    relBuilder.aggregate(
+        relBuilder.groupKey(oldAggregate.getGroupSet(),
+            oldAggregate.getGroupSets()),
+        newCalls);
+  }
+
+  private RelDataType getFieldType(RelNode relNode, int i) {
+    final RelDataTypeField inputField =
+        relNode.getRowType().getFieldList().get(i);
+    return inputField.getType();
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java
index cb0c2b1b35..8a9e66a53d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java
@@ -49,6 +49,7 @@
 import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlCountAggFunction;
 import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlMinMaxAggFunction;
 import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlSumAggFunction;
+import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlVarianceAggFunction;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveBetween;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveConcat;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExtractDate;
@@ -608,7 +609,42 @@ public static SqlAggFunction getCalciteAggFn(String hiveUdfName, boolean isDisti
           udfInfo.returnTypeInference,
           udfInfo.operandTypeInference,
           udfInfo.operandTypeChecker);
-      break;
+      break;
+    case "std":
+    case "stddev":
+    case "stddev_pop":
+      calciteAggFn = new HiveSqlVarianceAggFunction(
+          "stddev_pop",
+          SqlKind.STDDEV_POP,
+          udfInfo.returnTypeInference,
+          udfInfo.operandTypeInference,
+          udfInfo.operandTypeChecker);
+      break;
+    case "stddev_samp":
+      calciteAggFn = new HiveSqlVarianceAggFunction(
+          "stddev_samp",
+          SqlKind.STDDEV_SAMP,
+          udfInfo.returnTypeInference,
+          udfInfo.operandTypeInference,
+          udfInfo.operandTypeChecker);
+      break;
+    case "variance":
+    case "var_pop":
+      calciteAggFn = new HiveSqlVarianceAggFunction(
+          "var_pop",
+          SqlKind.VAR_POP,
+          udfInfo.returnTypeInference,
+          udfInfo.operandTypeInference,
+          udfInfo.operandTypeChecker);
+      break;
+    case "var_samp":
+      calciteAggFn = new HiveSqlVarianceAggFunction(
+          "var_samp",
+          SqlKind.VAR_SAMP,
+          udfInfo.returnTypeInference,
+          udfInfo.operandTypeInference,
+          udfInfo.operandTypeChecker);
+      break;
     default:
       calciteAggFn = new CalciteUDAF(
           isDistinct,
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 3520d90fa8..d90dde992b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -177,6 +177,7 @@
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateJoinTransposeRule;
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateProjectMergeRule;
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregatePullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateReduceFunctionsRule;
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateReduceRule;
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveDruidRules;
 import
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveExceptRewriteRule; @@ -1821,6 +1822,7 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv rules.add(HiveReduceExpressionsRule.PROJECT_INSTANCE); rules.add(HiveReduceExpressionsRule.FILTER_INSTANCE); rules.add(HiveReduceExpressionsRule.JOIN_INSTANCE); + rules.add(HiveAggregateReduceFunctionsRule.INSTANCE); rules.add(HiveAggregateReduceRule.INSTANCE); if (conf.getBoolVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZER)) { rules.add(new HivePointLookupOptimizerRule.FilterCondition(minNumORClauses)); @@ -1839,7 +1841,7 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv rules.toArray(new RelOptRule[rules.size()])); perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Prejoin ordering transformation, PPD, not null predicates, transitive inference, constant folding"); -// it is happening at 1762 + // 4. Push down limit through outer join // NOTE: We run this after PPD to support old style join syntax. // Ex: select * from R1 left outer join R2 where ((R1.x=R2.x) and R1.y<10) or diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSumEmptyIsZero.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSumEmptyIsZero.java index 01c933c4a1..8db24331d6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSumEmptyIsZero.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSumEmptyIsZero.java @@ -41,16 +41,30 @@ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) + parameters[0].getTypeName() + " is passed."); } switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) { - case LONG: - return new SumZeroIfEmpty(); - default: - throw new UDFArgumentTypeException(0, - "Only bigint type arguments are accepted but " - + parameters[0].getTypeName() + " is passed."); + case BYTE: + case SHORT: + case INT: + case LONG: + return new SumLongZeroIfEmpty(); + case TIMESTAMP: + case FLOAT: + case DOUBLE: + case STRING: + case VARCHAR: + case CHAR: + return new SumDoubleZeroIfEmpty(); + case DECIMAL: + return new SumHiveDecimalZeroIfEmpty(); + case BOOLEAN: + case DATE: + default: + throw new UDFArgumentTypeException(0, + "Only numeric or string type arguments are accepted but " + + parameters[0].getTypeName() + " is passed."); } } - public static class SumZeroIfEmpty extends GenericUDAFSumLong { + public static class SumLongZeroIfEmpty extends GenericUDAFSumLong { @Override public Object terminate(AggregationBuffer agg) throws HiveException { @@ -59,5 +73,25 @@ public Object terminate(AggregationBuffer agg) throws HiveException { return result; } } + + public static class SumDoubleZeroIfEmpty extends GenericUDAFSumDouble { + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + SumDoubleAgg myagg = (SumDoubleAgg) agg; + result.set(myagg.sum); + return result; + } + } + + public static class SumHiveDecimalZeroIfEmpty extends GenericUDAFSumHiveDecimal { + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + SumHiveDecimalWritableAgg myagg = (SumHiveDecimalWritableAgg) agg; + result.set(myagg.sum); + return result; + } + } } diff --git a/ql/src/test/queries/clientpositive/udaf_binarysetfunctions_no_cbo.q b/ql/src/test/queries/clientpositive/udaf_binarysetfunctions_no_cbo.q new file mode 100644 index 0000000000..ae4733f705 --- /dev/null +++ b/ql/src/test/queries/clientpositive/udaf_binarysetfunctions_no_cbo.q @@ -0,0 +1,60 
@@ +set hive.cbo.enable=false; + +drop table t; +create table t (id int,px int,y decimal,x decimal); + +insert into t values (101,1,1,1); +insert into t values (201,2,1,1); +insert into t values (301,3,1,1); +insert into t values (401,4,1,11); +insert into t values (501,5,1,null); +insert into t values (601,6,null,1); +insert into t values (701,6,null,null); +insert into t values (102,1,2,2); +insert into t values (202,2,1,2); +insert into t values (302,3,2,1); +insert into t values (402,4,2,12); +insert into t values (502,5,2,null); +insert into t values (602,6,null,2); +insert into t values (702,6,null,null); +insert into t values (103,1,3,3); +insert into t values (203,2,1,3); +insert into t values (303,3,3,1); +insert into t values (403,4,3,13); +insert into t values (503,5,3,null); +insert into t values (603,6,null,3); +insert into t values (703,6,null,null); +insert into t values (104,1,4,4); +insert into t values (204,2,1,4); +insert into t values (304,3,4,1); +insert into t values (404,4,4,14); +insert into t values (504,5,4,null); +insert into t values (604,6,null,4); +insert into t values (704,6,null,null); +insert into t values (800,7,1,1); + + +explain select px,var_pop(x),var_pop(y),corr(y,x),covar_samp(y,x),covar_pop(y,x),regr_count(y,x),regr_slope(y,x), +regr_intercept(y,x), regr_r2(y,x), regr_sxx(y,x), regr_syy(y,x), regr_sxy(y,x), regr_avgx(y,x), regr_avgy(y,x), regr_count(y,x) + from t group by px order by px; + +select px, + round( var_pop(x),5), + round( var_pop(y),5), + round( corr(y,x),5), + round( covar_samp(y,x),5), + round( covar_pop(y,x),5), + regr_count(y,x), + round( regr_slope(y,x),5), + round( regr_intercept(y,x),5), + round( regr_r2(y,x),5), + round( regr_sxx(y,x),5), + round( regr_syy(y,x),5), + round( regr_sxy(y,x),5), + round( regr_avgx(y,x),5), + round( regr_avgy(y,x),5), + round( regr_count(y,x),5) + from t group by px order by px; + + +select id,regr_count(y,x) over (partition by px) from t order by id; diff --git a/ql/src/test/results/clientpositive/udaf_binarysetfunctions.q.out b/ql/src/test/results/clientpositive/udaf_binarysetfunctions.q.out index af058a44e4..72d1bdb5b1 100644 --- a/ql/src/test/results/clientpositive/udaf_binarysetfunctions.q.out +++ b/ql/src/test/results/clientpositive/udaf_binarysetfunctions.q.out @@ -379,34 +379,38 @@ STAGE PLANS: alias: t Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: px (type: int), y (type: decimal(10,0)), x (type: decimal(10,0)) - outputColumnNames: px, y, x + expressions: px (type: int), x (type: decimal(10,0)), y (type: decimal(10,0)), (UDFToDouble(x) * UDFToDouble(x)) (type: double), (UDFToDouble(y) * UDFToDouble(y)) (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: var_pop(x), var_pop(y), corr(y, x), covar_samp(y, x), covar_pop(y, x), regr_count(y, x), regr_slope(y, x), regr_intercept(y, x), regr_r2(y, x), regr_sxx(y, x), regr_syy(y, x), regr_sxy(y, x), regr_avgx(y, x), regr_avgy(y, x) - keys: px (type: int) + aggregations: count(_col3), $sum0(_col3), count(_col1), $sum0(_col1), count(_col4), $sum0(_col4), count(_col2), $sum0(_col2), corr(_col2, _col1), covar_samp(_col2, _col1), covar_pop(_col2, _col1), regr_count(_col2, _col1), regr_slope(_col2, _col1), regr_intercept(_col2, _col1), regr_r2(_col2, _col1), regr_sxx(_col2, _col1), regr_syy(_col2, _col1), regr_sxy(_col2, _col1), regr_avgx(_col2, _col1), 
regr_avgy(_col2, _col1) + keys: _col0 (type: int) mode: hash - outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: bigint), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct), _col10 (type: struct), _col11 (type: struct), _col12 (type: struct), _col13 (type: struct), _col14 (type: struct) + value expressions: _col1 (type: bigint), _col2 (type: double), _col3 (type: bigint), _col4 (type: decimal(20,0)), _col5 (type: bigint), _col6 (type: double), _col7 (type: bigint), _col8 (type: decimal(20,0)), _col9 (type: struct), _col10 (type: struct), _col11 (type: struct), _col12 (type: bigint), _col13 (type: struct), _col14 (type: struct), _col15 (type: struct), _col16 (type: struct), _col17 (type: struct), _col18 (type: struct), _col19 (type: struct), _col20 (type: struct) Reduce Operator Tree: Group By Operator - aggregations: var_pop(VALUE._col0), var_pop(VALUE._col1), corr(VALUE._col2), covar_samp(VALUE._col3), covar_pop(VALUE._col4), regr_count(VALUE._col5), regr_slope(VALUE._col6), regr_intercept(VALUE._col7), regr_r2(VALUE._col8), regr_sxx(VALUE._col9), regr_syy(VALUE._col10), regr_sxy(VALUE._col11), regr_avgx(VALUE._col12), regr_avgy(VALUE._col13) + aggregations: count(VALUE._col0), $sum0(VALUE._col1), count(VALUE._col2), $sum0(VALUE._col3), count(VALUE._col4), $sum0(VALUE._col5), count(VALUE._col6), $sum0(VALUE._col7), corr(VALUE._col8), covar_samp(VALUE._col9), covar_pop(VALUE._col10), regr_count(VALUE._col11), regr_slope(VALUE._col12), regr_intercept(VALUE._col13), regr_r2(VALUE._col14), regr_sxx(VALUE._col15), regr_syy(VALUE._col16), regr_sxy(VALUE._col17), regr_avgx(VALUE._col18), regr_avgy(VALUE._col19) keys: KEY._col0 (type: int) mode: mergepartial - outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 Statistics: Num rows: 14 Data size: 135 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Select Operator + expressions: _col0 (type: int), ((CASE WHEN ((_col1 = 0L)) THEN (null) ELSE (_col2) END - ((CASE WHEN ((_col3 = 0L)) THEN (null) ELSE (_col4) END * CASE WHEN ((_col3 = 0L)) THEN (null) ELSE (_col4) END) / _col3)) / _col3) (type: double), ((CASE WHEN ((_col5 = 0L)) THEN (null) ELSE (_col6) END - ((CASE WHEN ((_col7 = 0L)) THEN (null) ELSE (_col8) END * CASE WHEN ((_col7 = 0L)) THEN (null) ELSE (_col8) END) / _col7)) / _col7) (type: double), _col9 (type: double), _col10 (type: double), _col11 (type: double), 
_col12 (type: bigint), _col13 (type: double), _col14 (type: double), _col15 (type: double), _col16 (type: double), _col17 (type: double), _col18 (type: double), _col19 (type: decimal(14,4)), _col20 (type: decimal(14,4)) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 + Statistics: Num rows: 14 Data size: 135 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Stage: Stage-2 Map Reduce diff --git a/ql/src/test/results/clientpositive/udaf_binarysetfunctions_no_cbo.q.out b/ql/src/test/results/clientpositive/udaf_binarysetfunctions_no_cbo.q.out new file mode 100644 index 0000000000..6185693ade --- /dev/null +++ b/ql/src/test/results/clientpositive/udaf_binarysetfunctions_no_cbo.q.out @@ -0,0 +1,522 @@ +PREHOOK: query: drop table t +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table t +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table t (id int,px int,y decimal,x decimal) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t +POSTHOOK: query: create table t (id int,px int,y decimal,x decimal) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t +PREHOOK: query: insert into t values (101,1,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (101,1,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (201,2,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (201,2,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (301,3,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (301,3,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (401,4,1,11) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (401,4,1,11) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (501,5,1,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (501,5,1,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] 
+POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (601,6,null,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (601,6,null,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (701,6,null,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (701,6,null,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (102,1,2,2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (102,1,2,2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (202,2,1,2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (202,2,1,2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (302,3,2,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (302,3,2,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (402,4,2,12) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (402,4,2,12) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (502,5,2,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (502,5,2,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (602,6,null,2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (602,6,null,2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: 
Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (702,6,null,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (702,6,null,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (103,1,3,3) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (103,1,3,3) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (203,2,1,3) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (203,2,1,3) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (303,3,3,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (303,3,3,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (403,4,3,13) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (403,4,3,13) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (503,5,3,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (503,5,3,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (603,6,null,3) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (603,6,null,3) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (703,6,null,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (703,6,null,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: 
t.y SCRIPT [] +PREHOOK: query: insert into t values (104,1,4,4) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (104,1,4,4) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (204,2,1,4) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (204,2,1,4) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (304,3,4,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (304,3,4,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (404,4,4,14) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (404,4,4,14) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (504,5,4,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (504,5,4,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (604,6,null,4) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (604,6,null,4) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (704,6,null,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (704,6,null,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: insert into t values (800,7,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (800,7,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.id SCRIPT [] +POSTHOOK: Lineage: t.px SCRIPT [] +POSTHOOK: Lineage: t.x SCRIPT [] +POSTHOOK: Lineage: t.y SCRIPT [] +PREHOOK: query: explain select 
px,var_pop(x),var_pop(y),corr(y,x),covar_samp(y,x),covar_pop(y,x),regr_count(y,x),regr_slope(y,x), +regr_intercept(y,x), regr_r2(y,x), regr_sxx(y,x), regr_syy(y,x), regr_sxy(y,x), regr_avgx(y,x), regr_avgy(y,x), regr_count(y,x) + from t group by px order by px +PREHOOK: type: QUERY +POSTHOOK: query: explain select px,var_pop(x),var_pop(y),corr(y,x),covar_samp(y,x),covar_pop(y,x),regr_count(y,x),regr_slope(y,x), +regr_intercept(y,x), regr_r2(y,x), regr_sxx(y,x), regr_syy(y,x), regr_sxy(y,x), regr_avgx(y,x), regr_avgy(y,x), regr_count(y,x) + from t group by px order by px +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: px (type: int), y (type: decimal(10,0)), x (type: decimal(10,0)) + outputColumnNames: px, y, x + Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: var_pop(x), var_pop(y), corr(y, x), covar_samp(y, x), covar_pop(y, x), regr_count(y, x), regr_slope(y, x), regr_intercept(y, x), regr_r2(y, x), regr_sxx(y, x), regr_syy(y, x), regr_sxy(y, x), regr_avgx(y, x), regr_avgy(y, x) + keys: px (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 + Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 29 Data size: 281 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: bigint), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct), _col10 (type: struct), _col11 (type: struct), _col12 (type: struct), _col13 (type: struct), _col14 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: var_pop(VALUE._col0), var_pop(VALUE._col1), corr(VALUE._col2), covar_samp(VALUE._col3), covar_pop(VALUE._col4), regr_count(VALUE._col5), regr_slope(VALUE._col6), regr_intercept(VALUE._col7), regr_r2(VALUE._col8), regr_sxx(VALUE._col9), regr_syy(VALUE._col10), regr_sxy(VALUE._col11), regr_avgx(VALUE._col12), regr_avgy(VALUE._col13) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 + Statistics: Num rows: 14 Data size: 135 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Statistics: Num rows: 14 Data size: 135 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: double), _col2 (type: double), _col3 (type: double), _col4 (type: double), _col5 (type: double), _col6 (type: bigint), _col7 (type: double), _col8 (type: double), _col9 (type: double), _col10 (type: double), _col11 
(type: double), _col12 (type: double), _col13 (type: decimal(14,4)), _col14 (type: decimal(14,4)) + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double), VALUE._col1 (type: double), VALUE._col2 (type: double), VALUE._col3 (type: double), VALUE._col4 (type: double), VALUE._col5 (type: bigint), VALUE._col6 (type: double), VALUE._col7 (type: double), VALUE._col8 (type: double), VALUE._col9 (type: double), VALUE._col10 (type: double), VALUE._col11 (type: double), VALUE._col12 (type: decimal(14,4)), VALUE._col13 (type: decimal(14,4)), VALUE._col5 (type: bigint) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15 + Statistics: Num rows: 14 Data size: 135 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 14 Data size: 135 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select px, + round( var_pop(x),5), + round( var_pop(y),5), + round( corr(y,x),5), + round( covar_samp(y,x),5), + round( covar_pop(y,x),5), + regr_count(y,x), + round( regr_slope(y,x),5), + round( regr_intercept(y,x),5), + round( regr_r2(y,x),5), + round( regr_sxx(y,x),5), + round( regr_syy(y,x),5), + round( regr_sxy(y,x),5), + round( regr_avgx(y,x),5), + round( regr_avgy(y,x),5), + round( regr_count(y,x),5) + from t group by px order by px +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: select px, + round( var_pop(x),5), + round( var_pop(y),5), + round( corr(y,x),5), + round( covar_samp(y,x),5), + round( covar_pop(y,x),5), + regr_count(y,x), + round( regr_slope(y,x),5), + round( regr_intercept(y,x),5), + round( regr_r2(y,x),5), + round( regr_sxx(y,x),5), + round( regr_syy(y,x),5), + round( regr_sxy(y,x),5), + round( regr_avgx(y,x),5), + round( regr_avgy(y,x),5), + round( regr_count(y,x),5) + from t group by px order by px +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +1 1.25 1.25 1.0 1.66667 1.25 4 1.0 0.0 1.0 5.0 5.0 5.0 2.50000 2.50000 4 +2 1.25 0.0 NULL 0.0 0.0 4 0.0 1.0 1.0 5.0 0.0 0.0 2.50000 1.00000 4 +3 0.0 1.25 NULL 0.0 0.0 4 NULL NULL NULL 0.0 5.0 0.0 1.00000 2.50000 4 +4 1.25 1.25 1.0 1.66667 1.25 4 1.0 -10.0 1.0 5.0 5.0 5.0 12.50000 2.50000 4 +5 NULL 1.25 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL 0 +6 1.25 NULL NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL 0 +7 0.0 0.0 NULL NULL 0.0 1 NULL NULL NULL 0.0 0.0 0.0 1.00000 1.00000 1 +PREHOOK: query: select id,regr_count(y,x) over (partition by px) from t order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: select id,regr_count(y,x) over (partition by px) from t order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +101 4 +102 4 +103 4 +104 4 +201 4 +202 4 +203 4 +204 4 +301 4 +302 4 +303 4 +304 4 +401 4 +402 4 +403 4 +404 4 +501 0 +502 0 +503 0 +504 0 +601 0 +602 0 +603 0 +604 0 +701 0 +702 0 +703 0 +704 0 +800 1
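The rewrite can also be checked by hand against the test table above. The query below is a minimal sketch, not part of the patch: it uses the same table t and columns px and x as udaf_binarysetfunctions_no_cbo.q, hand-expands var_pop(x) into the reduced form described in HiveAggregateReduceFunctionsRule, and ignores the NULL-guarding CASE expressions and rounding that the generated plan adds:

-- var_pop(x) expanded to the form the rule generates:
--   (sum(x * x) - sum(x) * sum(x) / count(x)) / count(x)
select px,
       (sum(x * x) - (sum(x) * sum(x)) / count(x)) / count(x) as var_pop_x
from t
group by px
order by px;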