diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 8094d28..52bf480 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2492,19 +2492,22 @@ HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED("hive.optimize.bi.rewrite.countdistinct.enabled", true, "Enables to rewrite COUNT(DISTINCT(X)) queries to be rewritten to use sketch functions."), - HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH( - "hive.optimize.bi.rewrite.countdistinct.sketch", "hll", + HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH("hive.optimize.bi.rewrite.countdistinct.sketch", "hll", new StringSet("hll"), "Defines which sketch type to use when rewriting COUNT(DISTINCT(X)) expressions. " + "Distinct counting can be done with: hll"), HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_ENABLED("hive.optimize.bi.rewrite.percentile_disc.enabled", true, "Enables to rewrite PERCENTILE_DISC(X) queries to be rewritten to use sketch functions."), - HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH( - "hive.optimize.bi.rewrite.percentile_disc.sketch", "kll", + HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH("hive.optimize.bi.rewrite.percentile_disc.sketch", "kll", new StringSet("kll"), "Defines which sketch type to use when rewriting PERCENTILE_DISC expressions. Options: kll"), - + HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_ENABLED("hive.optimize.bi.rewrite.cume_dist.enabled", + true, + "Enables to rewrite CUME_DIST(X) queries to be rewritten to use sketch functions."), + HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_SKETCH("hive.optimize.bi.rewrite.cume_dist.sketch", "kll", + new StringSet("kll"), + "Defines which sketch type to use when rewriting CUME_DIST expressions. Options: kll"), // Statistics HIVE_STATS_ESTIMATE_STATS("hive.stats.estimate", true, diff --git data/conf/hive-log4j2.properties data/conf/hive-log4j2.properties index e5bb166..d2153b7 100644 --- data/conf/hive-log4j2.properties +++ data/conf/hive-log4j2.properties @@ -119,7 +119,7 @@ logger.ObjectStore.level = INFO logger.CalcitePlanner.name = org.apache.calcite.plan.RelOptPlanner -logger.CalcitePlanner.level = INFO +logger.CalcitePlanner.level = TRACE logger.AmazonAws.name=com.amazonaws logger.AmazonAws.level = INFO diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java index cc48d5b..a16572d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java @@ -19,6 +19,8 @@ package org.apache.hadoop.hive.ql.exec; import java.lang.reflect.Method; +import java.lang.reflect.ParameterizedType; +import java.lang.reflect.Type; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -61,8 +63,8 @@ private static final String SKETCH_TO_STRING = "stringify"; private static final String UNION_SKETCH = "union"; private static final String UNION_SKETCH1 = "union_f"; - private static final String GET_N = "n"; - private static final String GET_CDF = "cdf"; + public static final String GET_N = "n"; + public static final String GET_CDF = "cdf"; private static final String GET_PMF = "pmf"; private static final String GET_QUANTILES = "quantiles"; public static final String GET_QUANTILE = "quantile"; @@ -123,13 +125,17 @@ private void buildCalciteFns() { for (SketchDescriptor sd : sketchClasses.values()) { + + registerAsHiveFunction(sd.fnMap.get(SKETCH_TO_ESTIMATE)); + registerAsHiveFunction(sd.fnMap.get(GET_QUANTILE)); + registerAsHiveFunction(sd.fnMap.get(GET_CDF)); + registerAsHiveFunction(sd.fnMap.get(GET_N)); + // Mergability is exposed to Calcite; which enables to use it during rollup. RelProtoDataType sketchType = RelDataTypeImpl.proto(SqlTypeName.BINARY, true); SketchFunctionDescriptor sketchSFD = sd.fnMap.get(DATA_TO_SKETCH); SketchFunctionDescriptor unionSFD = sd.fnMap.get(UNION_SKETCH); - SketchFunctionDescriptor estimateSFD = sd.fnMap.get(SKETCH_TO_ESTIMATE); - SketchFunctionDescriptor quantileSFD = sd.fnMap.get(GET_QUANTILE); if (sketchSFD == null || unionSFD == null) { continue; @@ -152,33 +158,24 @@ unionSFD.setCalciteFunction(unionFn); sketchSFD.setCalciteFunction(sketchFn); - if (estimateSFD != null && estimateSFD.getReturnRelDataType().isPresent()) { - SqlFunction estimateFn = new HiveSqlFunction(estimateSFD.name, - SqlKind.OTHER_FUNCTION, - ReturnTypes.explicit(estimateSFD.getReturnRelDataType().get().getSqlTypeName()), - InferTypes.ANY_NULLABLE, - OperandTypes.family(), - SqlFunctionCategory.USER_DEFINED_FUNCTION, - true, - false); - estimateSFD.setCalciteFunction(estimateFn); - } + } + } - if (quantileSFD != null && quantileSFD.getReturnRelDataType().isPresent()) { - SqlFunction quantileFn = new HiveSqlFunction(quantileSFD.name, - SqlKind.OTHER_FUNCTION, - ReturnTypes.explicit(quantileSFD.getReturnRelDataType().get().getSqlTypeName()), - InferTypes.ANY_NULLABLE, - OperandTypes.family(), - SqlFunctionCategory.USER_DEFINED_FUNCTION, - true, - false); + private void registerAsHiveFunction(SketchFunctionDescriptor sfd) { + if (sfd != null && sfd.getReturnRelDataType().isPresent()) { + SqlFunction cdfFn = + new HiveSqlFunction(sfd.name, + SqlKind.OTHER_FUNCTION, + ReturnTypes.explicit(sfd.getReturnRelDataType().get()), + InferTypes.ANY_NULLABLE, + OperandTypes.family(), + SqlFunctionCategory.USER_DEFINED_FUNCTION, + true, + false); - quantileSFD.setCalciteFunction(quantileFn); - - } + sfd.setCalciteFunction(cdfFn); } } @@ -209,11 +206,11 @@ } - private static class SketchFunctionDescriptor implements HiveUDFPlugin.UDFDescriptor { + static class SketchFunctionDescriptor implements HiveUDFPlugin.UDFDescriptor { String name; Class udfClass; private SqlFunction calciteFunction; - private Class returnType; + private Type returnType; public SketchFunctionDescriptor(String name, Class udfClass) { this.name = name; @@ -235,12 +232,21 @@ return Optional.empty(); } else { JavaTypeFactoryImpl typeFactory = new JavaTypeFactoryImpl(new HiveTypeSystemImpl()); + Type type = returnType; + if (type instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType) type; + if (parameterizedType.getRawType() == List.class) { + final RelDataType componentRelType = typeFactory.createType(parameterizedType.getActualTypeArguments()[0]); + return Optional + .of(typeFactory.createArrayType(typeFactory.createTypeWithNullability(componentRelType, true), -1)); + } + } return Optional.of(typeFactory.createType(returnType)); } } - public void setReturnType(Class returnType) { - this.returnType = returnType; + public void setReturnType(Type type) { + this.returnType = type; } @Override @@ -272,7 +278,7 @@ if (UDF.class.isAssignableFrom(clazz)) { Optional evaluateMethod = getEvaluateMethod(clazz); if (evaluateMethod.isPresent()) { - value.setReturnType(evaluateMethod.get().getReturnType()); + value.setReturnType(evaluateMethod.get().getGenericReturnType()); } } fnMap.put(name, value); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java index 184a026..9e98013 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java @@ -42,6 +42,8 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlSumEmptyIsZeroAggFunction; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFloorDate; +import com.google.common.collect.ImmutableList; + /** * Builder for relational expressions in Hive. @@ -158,6 +160,12 @@ } @Override + public AggCall aggregateCall(SqlAggFunction aggFunction, boolean distinct, boolean approximate, boolean ignoreNulls, + RexNode filter, ImmutableList orderKeys, String alias, ImmutableList operands) { + return super.aggregateCall(aggFunction, distinct, approximate, ignoreNulls, filter, orderKeys, alias, operands); + } + + @Override protected boolean shouldMergeProject() { /* CALCITE-2470 added ability to merge Project-s together. * The problem with it is that it may merge 2 windowing expressions. diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java index 0123137..73c406a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java @@ -24,25 +24,33 @@ import org.apache.calcite.plan.RelOptRuleCall; import org.apache.calcite.plan.RelOptRuleOperand; import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelCollations; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Aggregate; import org.apache.calcite.rel.core.AggregateCall; +import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.RelFactories.ProjectFactory; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFactory; import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexInputRef; import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexOver; +import org.apache.calcite.rex.RexWindow; import org.apache.calcite.sql.SqlAggFunction; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.SqlOperator; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.tools.RelBuilder; +import org.apache.calcite.util.ImmutableBitSet; import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter; import org.apache.hive.plugin.api.HiveUDFPlugin.UDFDescriptor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -204,12 +212,11 @@ return fn.getCalciteFunction().get(); } - abstract void rewrite(AggregateCall aggCall); - abstract boolean isApplicable(AggregateCall aggCall); - } + abstract void rewrite(AggregateCall aggCall); + } }; public static class CountDistinctRewrite extends AggregateToProjectAggregateProject { @@ -368,4 +375,195 @@ } } } + + /** + * Generic support for rewriting Windowing expression into a different form usually using joins. + */ + private static abstract class WindowingToProjectAggregateJoinProject extends RelOptRule { + + protected final String sketchType; + + public WindowingToProjectAggregateJoinProject(String sketchType) { + super(operand(HiveProject.class, any())); + this.sketchType = sketchType; + } + + @Override + public void onMatch(RelOptRuleCall call) { + + final Project project = call.rel(0); + + VbuilderPAP vb = buildProcessor(call); + RelNode newProject = vb.processProject(project); + + if (newProject instanceof Project && ((Project) newProject).getChildExps().equals(project.getChildExps())) { + return; + } else { + call.transformTo(newProject); + } + } + + protected abstract VbuilderPAP buildProcessor(RelOptRuleCall call); + + + protected static abstract class VbuilderPAP { + private final String sketchClass; + protected final RelBuilder relBuilder; + protected final RexBuilder rexBuilder; + + protected VbuilderPAP(String sketchClass, RelBuilder relBuilder) { + this.sketchClass = sketchClass; + this.relBuilder = relBuilder; + rexBuilder = relBuilder.getRexBuilder(); + } + + protected RelNode processProject(Project project) { + relBuilder.push(project.getInput()); + // FIXME later use shuttle + List newProjects = new ArrayList(); + for (RexNode expr : project.getChildExps()) { + newProjects.add(processCall(expr)); + } + relBuilder.project(newProjects); + return relBuilder.build(); + } + + private final RexNode processCall(RexNode expr) { + if (expr instanceof RexOver) { + RexOver over = (RexOver) expr; + if (isApplicable(over)) { + return rewrite(over); + } + } + return expr; + } + + protected final SqlOperator getSqlOperator(String fnName) { + UDFDescriptor fn = DataSketchesFunctions.INSTANCE.getSketchFunction(sketchClass, fnName); + if (!fn.getCalciteFunction().isPresent()) { + throw new RuntimeException(fn.toString() + " doesn't have a Calcite function associated with it"); + } + return fn.getCalciteFunction().get(); + } + + abstract RexNode rewrite(RexOver expr); + + abstract boolean isApplicable(RexOver expr); + + } + + } + + public static class CumeDistRewrite extends WindowingToProjectAggregateJoinProject { + + public CumeDistRewrite(String sketchType) { + super(sketchType); + } + + @Override + protected VbuilderPAP buildProcessor(RelOptRuleCall call) { + return new VB(sketchType, call.builder()); + } + + private static class VB extends VbuilderPAP { + + protected VB(String sketchClass, RelBuilder relBuilder) { + super(sketchClass, relBuilder); + } + + @Override + boolean isApplicable(RexOver over) { + // FIXME PARTITION BY + SqlAggFunction aggOp = over.getAggOperator(); + RexWindow window = over.getWindow(); + if (aggOp.getName().equalsIgnoreCase("cume_dist") && window.orderKeys.size() == 1 + && window.getLowerBound().isUnbounded() && window.getUpperBound().isUnbounded() + && window.partitionKeys.size() == 1 && window.partitionKeys.get(0).isA(SqlKind.LITERAL)) { + return true; + } + return false; + } + + @Override + RexNode rewrite(RexOver over) { + + over.getOperands(); + RexWindow w = over.getWindow(); + // FIXME: NULLs first/last collation stuff + // we don't really support nulls in aggregate/etc...they are actually ignored + // so some hack will be needed for NULLs anyway.. + RexNode orderKey = w.orderKeys.get(0).getKey(); + + relBuilder.push(relBuilder.peek()); + RexNode castedKey = rexBuilder.makeCast(getFloatType(), orderKey); + relBuilder.project(castedKey,rexBuilder.makeLiteral(true)); + + SqlAggFunction aggFunction = (SqlAggFunction) getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH); + boolean distinct = false; + boolean approximate = true; + boolean ignoreNulls = true; + List argList = Lists.newArrayList(0); + int filterArg = -1; + RelCollation collation = RelCollations.EMPTY; + RelDataType type = rexBuilder.deriveReturnType(aggFunction, Collections.emptyList()); + String name = aggFunction.getName(); + AggregateCall newAgg = AggregateCall.create(aggFunction, distinct, approximate, ignoreNulls, argList, filterArg, + collation, type, name); + + // relBuilder.aggregate(groupKey, aggCalls) + RelNode agg = HiveRelFactories.HIVE_AGGREGATE_FACTORY.createAggregate( + relBuilder.build(), + ImmutableBitSet.of(1), ImmutableList.of(ImmutableBitSet.of(1)), + Lists.newArrayList(newAgg)); + relBuilder.push(agg); + + relBuilder.join(JoinRelType.INNER); + + // long story short: CAST(CDF(X-(0.5/N))[0] AS FLOAT) + RexInputRef sketchInputRef = relBuilder.field(relBuilder.peek().getRowType().getFieldCount() - 1); + SqlOperator projectOperator = getSqlOperator(DataSketchesFunctions.GET_CDF); + SqlOperator getN = getSqlOperator(DataSketchesFunctions.GET_N); + RexNode projRex = rexBuilder.makeCall(SqlStdOperatorTable.DIVIDE, + relBuilder.literal(.5), + rexBuilder.makeCall(getN, sketchInputRef) + ); + projRex = rexBuilder.makeCall(SqlStdOperatorTable.MINUS, castedKey, projRex); + projRex = rexBuilder.makeCast(getFloatType(), projRex); + projRex = rexBuilder.makeCall(projectOperator, ImmutableList.of(sketchInputRef, projRex)); + projRex = getItemOperator(projRex, relBuilder.literal(0)); + projRex = rexBuilder.makeCast(over.getType(), projRex); + + return projRex; + } + + private RexNode getItemOperator(RexNode arr, RexNode offset) { + + if(getClass().desiredAssertionStatus()) { + try { + SqlKind.class.getField("ITEM"); + throw new RuntimeException("bind SqlKind.ITEM instead of this crap - C1.23 a02155a70a"); + } catch(NoSuchFieldException e) { + // ignore + } + } + + try { + SqlOperator indexFn = SqlFunctionConverter.getCalciteFn("index", + ImmutableList.of(arr.getType(),offset.getType()), + arr.getType().getComponentType(), true, false); + RexNode call = rexBuilder.makeCall(indexFn, arr, offset); + return call; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private RelDataType getFloatType() { + RelDataTypeFactory typeFactory = rexBuilder.getTypeFactory(); + RelDataType notNullFloatType = typeFactory.createSqlType(SqlTypeName.FLOAT); + RelDataType floatType = typeFactory.createTypeWithNullability(notNullFloatType, true); + return floatType; + } + } + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java index 9819f4a..eef7ddb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java @@ -672,6 +672,15 @@ udfInfo.operandTypeInference, udfInfo.operandTypeChecker); break; + case "cume_dist": + int asd = 1; + // calciteAggFn = new HiveSqlVarianceAggFunction( + // "cume_dist", + // SqlKind.CUME_DIST, + // udfInfo.returnTypeInference, + // udfInfo.operandTypeInference, + // udfInfo.operandTypeChecker); + // break; default: calciteAggFn = new CalciteUDAF( isDistinct, diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 377e828..2396641 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -1974,13 +1974,18 @@ if (!isMaterializedViewMaintenance() && conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_ENABLED)) { // Rewrite to datasketches if enabled if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED)) { - String countDistinctSketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH); - RelOptRule rule = new HiveRewriteToDataSketchesRules.CountDistinctRewrite(countDistinctSketchType); + String sketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH); + RelOptRule rule = new HiveRewriteToDataSketchesRules.CountDistinctRewrite(sketchType); generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule); } if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_ENABLED)) { - String percentileDiscSketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH); - RelOptRule rule = new HiveRewriteToDataSketchesRules.PercentileDiscRewrite(percentileDiscSketchType); + String sketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH); + RelOptRule rule = new HiveRewriteToDataSketchesRules.PercentileDiscRewrite(sketchType); + generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule); + } + if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_ENABLED)) { + String sketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_SKETCH); + RelOptRule rule = new HiveRewriteToDataSketchesRules.CumeDistRewrite(sketchType); generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java index 07813b9..c91eb31 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java @@ -41,7 +41,6 @@ import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRexExecutorImpl; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExtractDate; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFloorDate; @@ -389,7 +388,7 @@ if (FunctionRegistry.isRankingFunction(aggregateName)) { // Rank functions type is 'int'/'double' - if (aggregateName.equalsIgnoreCase("percent_rank")) { + if (aggregateName.equalsIgnoreCase("percent_rank") || aggregateName.equalsIgnoreCase("cume_dist")) { returnType = TypeInfoFactory.doubleTypeInfo; } else { returnType = TypeInfoFactory.intTypeInfo; diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/TestDataSketchesFunctions.java ql/src/test/org/apache/hadoop/hive/ql/exec/TestDataSketchesFunctions.java new file mode 100644 index 0000000..d446105 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/TestDataSketchesFunctions.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import static org.junit.Assert.assertTrue; + +import java.util.Optional; + +import org.apache.calcite.rel.type.RelDataType; +import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions.SketchFunctionDescriptor; +import org.junit.Test; + +public final class TestDataSketchesFunctions { + + @Test + public void testKllGetCdfReturnType() { + SketchFunctionDescriptor cf = + DataSketchesFunctions.INSTANCE.getSketchFunction("kll", DataSketchesFunctions.GET_CDF); + Optional retType = cf.getReturnRelDataType(); + assertTrue(retType.get().getComponentType() != null); + } +} diff --git ql/src/test/queries/clientpositive/sketches_materialized_view_cume_dist.q ql/src/test/queries/clientpositive/sketches_materialized_view_cume_dist.q new file mode 100644 index 0000000..af32aaa --- /dev/null +++ ql/src/test/queries/clientpositive/sketches_materialized_view_cume_dist.q @@ -0,0 +1,54 @@ +--! qt:transactional +set hive.fetch.task.conversion=none; + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +-- create an mv for the intermediate results +create materialized view mv_1 as + select ds_kll_sketch(cast(id as float)) from sketch_input; + +-- bi mode on +set hive.optimize.bi.enabled=true; + +explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +set hive.optimize.bi.enabled=false; + +explain +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +set hive.optimize.bi.enabled=true; + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +explain +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +explain +alter materialized view mv_1 rebuild; +alter materialized view mv_1 rebuild; + +explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +-- rewrite+mv matching with rollup +explain +select 'FIXME rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'FIXME rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +drop materialized view mv_1; diff --git ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist.q ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist.q new file mode 100644 index 0000000..4125805 --- /dev/null +++ ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist.q @@ -0,0 +1,27 @@ +--! qt:transactional + + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +select id,cume_dist() over (order by id) from sketch_input; + +set hive.optimize.bi.enabled=true; + +select id,cume_dist() over (order by id),ds_kll_cdf(ds, CAST(id AS FLOAT) - 0.5/ds_kll_n(ds))[0] +from sketch_input +join ( select ds_kll_sketch(cast(id as float)) as ds from sketch_input ) q +order by id; + +-- see if rewrite happens +explain +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id; + +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id; + diff --git ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out new file mode 100644 index 0000000..5fb302b --- /dev/null +++ ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out @@ -0,0 +1,1052 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: create materialized view mv_1 as + select ds_kll_sketch(cast(id as float)) from sketch_input +PREHOOK: type: CREATE_MATERIALIZED_VIEW +PREHOOK: Input: default@sketch_input +PREHOOK: Output: database:default +PREHOOK: Output: default@mv_1 +POSTHOOK: query: create materialized view mv_1 as + select ds_kll_sketch(cast(id as float)) from sketch_input +POSTHOOK: type: CREATE_MATERIALIZED_VIEW +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: database:default +POSTHOOK: Output: default@mv_1 +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 5 <- Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(id) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Map 4 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 1 Data size: 160 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _c0 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 160 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col0) + keys: true (type: boolean) + minReductionHashAggr: 0.0 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 22 Data size: 3520 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), ds_kll_cdf(_col2, UDFToFloat((_col1 - _col3)))[0] (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'rewrite; mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2552 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 2552 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_union(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary), (0.5D / ds_kll_n(_col1)) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary), _col1 (type: double) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; mv matching 1 0.0 +rewrite; mv matching 1 0.0 +rewrite; mv matching 2 0.09090909090909091 +rewrite; mv matching 3 0.13636363636363635 +rewrite; mv matching 4 0.18181818181818182 +rewrite; mv matching 5 0.22727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 7 0.4090909090909091 +rewrite; mv matching 7 0.4090909090909091 +rewrite; mv matching 8 0.5 +rewrite; mv matching 8 0.5 +rewrite; mv matching 9 0.5909090909090909 +rewrite; mv matching 9 0.5909090909090909 +rewrite; mv matching 10 0.6818181818181818 +rewrite; mv matching 10 0.6818181818181818 +rewrite; mv matching 11 0.7727272727272727 +rewrite; mv matching 12 0.8181818181818182 +rewrite; mv matching 13 0.8636363636363636 +rewrite; mv matching 14 0.9090909090909091 +rewrite; mv matching 15 0.9545454545454546 +PREHOOK: query: explain +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: 0 (type: int), id (type: int) + null sort order: az + sort order: ++ + Map-reduce partition columns: 0 (type: int) + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 5984 Basic stats: COMPLETE Column stats: COMPLETE + PTF Operator + Function definitions: + Input definition + input alias: ptf_0 + output shape: _col0: int + type: WINDOWING + Windowing table definition + input alias: ptf_1 + name: windowingtablefunction + order by: _col0 ASC NULLS LAST + partition by: 0 + raw input shape: + window functions: + window function definition + alias: cume_dist_window_0 + arguments: _col0 + name: cume_dist + window function: GenericUDAFCumeDistEvaluator + window frame: ROWS PRECEDING(MAX)~FOLLOWING(MAX) + isPivotResult: true + Statistics: Num rows: 22 Data size: 5984 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), cume_dist_window_0 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'no rewrite; no mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2684 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 2684 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +no rewrite; no mv matching 1 0.09090909090909091 +no rewrite; no mv matching 1 0.09090909090909091 +no rewrite; no mv matching 2 0.13636363636363635 +no rewrite; no mv matching 3 0.18181818181818182 +no rewrite; no mv matching 4 0.22727272727272727 +no rewrite; no mv matching 5 0.2727272727272727 +no rewrite; no mv matching 6 0.4090909090909091 +no rewrite; no mv matching 6 0.4090909090909091 +no rewrite; no mv matching 6 0.4090909090909091 +no rewrite; no mv matching 7 0.5 +no rewrite; no mv matching 7 0.5 +no rewrite; no mv matching 8 0.5909090909090909 +no rewrite; no mv matching 8 0.5909090909090909 +no rewrite; no mv matching 9 0.6818181818181818 +no rewrite; no mv matching 9 0.6818181818181818 +no rewrite; no mv matching 10 0.7727272727272727 +no rewrite; no mv matching 10 0.7727272727272727 +no rewrite; no mv matching 11 0.8181818181818182 +no rewrite; no mv matching 12 0.8636363636363636 +no rewrite; no mv matching 13 0.9090909090909091 +no rewrite; no mv matching 14 0.9545454545454546 +no rewrite; no mv matching 15 1.0 +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 4 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(id) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Select Operator + expressions: UDFToFloat(id) (type: float) + outputColumnNames: _col0 + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col0) + keys: true (type: boolean) + minReductionHashAggr: 0.97727275 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 44 Data size: 7040 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), ds_kll_cdf(_col2, UDFToFloat((_col1 - _col3)))[0] (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'rewrite; no mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 5236 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 44 Data size: 5236 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary), (0.5D / ds_kll_n(_col1)) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary), _col1 (type: double) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; no mv matching 1 0.0 +rewrite; no mv matching 1 0.0 +rewrite; no mv matching 1 0.0 +rewrite; no mv matching 1 0.0 +rewrite; no mv matching 2 0.09090909090909091 +rewrite; no mv matching 2 0.09090909090909091 +rewrite; no mv matching 3 0.13636363636363635 +rewrite; no mv matching 3 0.13636363636363635 +rewrite; no mv matching 4 0.18181818181818182 +rewrite; no mv matching 4 0.18181818181818182 +rewrite; no mv matching 5 0.22727272727272727 +rewrite; no mv matching 5 0.22727272727272727 +rewrite; no mv matching 6 0.2727272727272727 +rewrite; no mv matching 6 0.2727272727272727 +rewrite; no mv matching 6 0.2727272727272727 +rewrite; no mv matching 6 0.2727272727272727 +rewrite; no mv matching 6 0.2727272727272727 +rewrite; no mv matching 6 0.2727272727272727 +rewrite; no mv matching 7 0.4090909090909091 +rewrite; no mv matching 7 0.4090909090909091 +rewrite; no mv matching 7 0.4090909090909091 +rewrite; no mv matching 7 0.4090909090909091 +rewrite; no mv matching 8 0.5 +rewrite; no mv matching 8 0.5 +rewrite; no mv matching 8 0.5 +rewrite; no mv matching 8 0.5 +rewrite; no mv matching 9 0.5909090909090909 +rewrite; no mv matching 9 0.5909090909090909 +rewrite; no mv matching 9 0.5909090909090909 +rewrite; no mv matching 9 0.5909090909090909 +rewrite; no mv matching 10 0.6818181818181818 +rewrite; no mv matching 10 0.6818181818181818 +rewrite; no mv matching 10 0.6818181818181818 +rewrite; no mv matching 10 0.6818181818181818 +rewrite; no mv matching 11 0.7727272727272727 +rewrite; no mv matching 11 0.7727272727272727 +rewrite; no mv matching 12 0.8181818181818182 +rewrite; no mv matching 12 0.8181818181818182 +rewrite; no mv matching 13 0.8636363636363636 +rewrite; no mv matching 13 0.8636363636363636 +rewrite; no mv matching 14 0.9090909090909091 +rewrite; no mv matching 14 0.9090909090909091 +rewrite; no mv matching 15 0.9545454545454546 +rewrite; no mv matching 15 0.9545454545454546 +PREHOOK: query: explain +alter materialized view mv_1 rebuild +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@mv_1 +POSTHOOK: query: explain +alter materialized view mv_1 rebuild +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@mv_1 +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 5 <- Union 3 (CONTAINS) + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE), Union 3 (CONTAINS) + Reducer 4 <- Union 3 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + filterExpr: (ROW__ID.writeid > 1L) (type: boolean) + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (ROW__ID.writeid > 1L) (type: boolean) + Statistics: Num rows: 14 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: UDFToFloat(id) (type: float) + outputColumnNames: _col0 + Statistics: Num rows: 14 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col0) + minReductionHashAggr: 0.9285714 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Map 5 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 1 Data size: 160 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _c0 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 160 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col0) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col0) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_union(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.mv_1 + Select Operator + expressions: _col0 (type: binary) + outputColumnNames: _c0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: compute_stats(_c0, 'hll') + mode: complete + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Union 3 + Vertex: Union 3 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.mv_1 + + Stage: Stage-3 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: _c0 + Column Types: binary + Table: default.mv_1 + + Stage: Stage-4 + Materialized View Update + name: default.mv_1 + update creation metadata: true + +PREHOOK: query: alter materialized view mv_1 rebuild +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@mv_1 +POSTHOOK: query: alter materialized view mv_1 rebuild +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@mv_1 +POSTHOOK: Lineage: mv_1._c0 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), (mv_1)default.mv_1.FieldSchema(name:_c0, type:binary, comment:null), ] +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 5 <- Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(id) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Map 4 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _c0 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col0) + keys: true (type: boolean) + minReductionHashAggr: 0.0 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 44 Data size: 7040 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), ds_kll_cdf(_col2, UDFToFloat((_col1 - _col3)))[0] (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'rewrite; mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 5104 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 44 Data size: 5104 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_union(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary), (0.5D / ds_kll_n(_col1)) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary), _col1 (type: double) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; mv matching 1 0.0 +rewrite; mv matching 1 0.0 +rewrite; mv matching 1 0.0 +rewrite; mv matching 1 0.0 +rewrite; mv matching 2 0.09090909090909091 +rewrite; mv matching 2 0.09090909090909091 +rewrite; mv matching 3 0.13636363636363635 +rewrite; mv matching 3 0.13636363636363635 +rewrite; mv matching 4 0.18181818181818182 +rewrite; mv matching 4 0.18181818181818182 +rewrite; mv matching 5 0.22727272727272727 +rewrite; mv matching 5 0.22727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 6 0.2727272727272727 +rewrite; mv matching 7 0.4090909090909091 +rewrite; mv matching 7 0.4090909090909091 +rewrite; mv matching 7 0.4090909090909091 +rewrite; mv matching 7 0.4090909090909091 +rewrite; mv matching 8 0.5 +rewrite; mv matching 8 0.5 +rewrite; mv matching 8 0.5 +rewrite; mv matching 8 0.5 +rewrite; mv matching 9 0.5909090909090909 +rewrite; mv matching 9 0.5909090909090909 +rewrite; mv matching 9 0.5909090909090909 +rewrite; mv matching 9 0.5909090909090909 +rewrite; mv matching 10 0.6818181818181818 +rewrite; mv matching 10 0.6818181818181818 +rewrite; mv matching 10 0.6818181818181818 +rewrite; mv matching 10 0.6818181818181818 +rewrite; mv matching 11 0.7727272727272727 +rewrite; mv matching 11 0.7727272727272727 +rewrite; mv matching 12 0.8181818181818182 +rewrite; mv matching 12 0.8181818181818182 +rewrite; mv matching 13 0.8636363636363636 +rewrite; mv matching 13 0.8636363636363636 +rewrite; mv matching 14 0.9090909090909091 +rewrite; mv matching 14 0.9090909090909091 +rewrite; mv matching 15 0.9545454545454546 +rewrite; mv matching 15 0.9545454545454546 +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select 'FIXME rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'FIXME rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 5 <- Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(id) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Map 4 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _c0 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col0) + keys: true (type: boolean) + minReductionHashAggr: 0.0 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 44 Data size: 7040 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), ds_kll_cdf(_col2, UDFToFloat((_col1 - _col3)))[0] (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'FIXME rewrite; mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 5368 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 44 Data size: 5368 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_union(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary), (0.5D / ds_kll_n(_col1)) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary), _col1 (type: double) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select 'FIXME rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'FIXME rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +FIXME rewrite; mv matching 1 0.0 +FIXME rewrite; mv matching 1 0.0 +FIXME rewrite; mv matching 1 0.0 +FIXME rewrite; mv matching 1 0.0 +FIXME rewrite; mv matching 2 0.09090909090909091 +FIXME rewrite; mv matching 2 0.09090909090909091 +FIXME rewrite; mv matching 3 0.13636363636363635 +FIXME rewrite; mv matching 3 0.13636363636363635 +FIXME rewrite; mv matching 4 0.18181818181818182 +FIXME rewrite; mv matching 4 0.18181818181818182 +FIXME rewrite; mv matching 5 0.22727272727272727 +FIXME rewrite; mv matching 5 0.22727272727272727 +FIXME rewrite; mv matching 6 0.2727272727272727 +FIXME rewrite; mv matching 6 0.2727272727272727 +FIXME rewrite; mv matching 6 0.2727272727272727 +FIXME rewrite; mv matching 6 0.2727272727272727 +FIXME rewrite; mv matching 6 0.2727272727272727 +FIXME rewrite; mv matching 6 0.2727272727272727 +FIXME rewrite; mv matching 7 0.4090909090909091 +FIXME rewrite; mv matching 7 0.4090909090909091 +FIXME rewrite; mv matching 7 0.4090909090909091 +FIXME rewrite; mv matching 7 0.4090909090909091 +FIXME rewrite; mv matching 8 0.5 +FIXME rewrite; mv matching 8 0.5 +FIXME rewrite; mv matching 8 0.5 +FIXME rewrite; mv matching 8 0.5 +FIXME rewrite; mv matching 9 0.5909090909090909 +FIXME rewrite; mv matching 9 0.5909090909090909 +FIXME rewrite; mv matching 9 0.5909090909090909 +FIXME rewrite; mv matching 9 0.5909090909090909 +FIXME rewrite; mv matching 10 0.6818181818181818 +FIXME rewrite; mv matching 10 0.6818181818181818 +FIXME rewrite; mv matching 10 0.6818181818181818 +FIXME rewrite; mv matching 10 0.6818181818181818 +FIXME rewrite; mv matching 11 0.7727272727272727 +FIXME rewrite; mv matching 11 0.7727272727272727 +FIXME rewrite; mv matching 12 0.8181818181818182 +FIXME rewrite; mv matching 12 0.8181818181818182 +FIXME rewrite; mv matching 13 0.8636363636363636 +FIXME rewrite; mv matching 13 0.8636363636363636 +FIXME rewrite; mv matching 14 0.9090909090909091 +FIXME rewrite; mv matching 14 0.9090909090909091 +FIXME rewrite; mv matching 15 0.9545454545454546 +FIXME rewrite; mv matching 15 0.9545454545454546 +PREHOOK: query: drop materialized view mv_1 +PREHOOK: type: DROP_MATERIALIZED_VIEW +PREHOOK: Input: default@mv_1 +PREHOOK: Output: default@mv_1 +POSTHOOK: query: drop materialized view mv_1 +POSTHOOK: type: DROP_MATERIALIZED_VIEW +POSTHOOK: Input: default@mv_1 +POSTHOOK: Output: default@mv_1 diff --git ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out new file mode 100644 index 0000000..83a1c38 --- /dev/null +++ ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out @@ -0,0 +1,245 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: select id,cume_dist() over (order by id) from sketch_input +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,cume_dist() over (order by id) from sketch_input +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 0.09090909090909091 +1 0.09090909090909091 +2 0.13636363636363635 +3 0.18181818181818182 +4 0.22727272727272727 +5 0.2727272727272727 +6 0.4090909090909091 +6 0.4090909090909091 +6 0.4090909090909091 +7 0.5 +7 0.5 +8 0.5909090909090909 +8 0.5909090909090909 +9 0.6818181818181818 +9 0.6818181818181818 +10 0.7727272727272727 +10 0.7727272727272727 +11 0.8181818181818182 +12 0.8636363636363636 +13 0.9090909090909091 +14 0.9545454545454546 +15 1.0 +Warning: Shuffle Join MERGEJOIN[38][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +Warning: Shuffle Join MERGEJOIN[39][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 6' is a cross product +Warning: Shuffle Join MERGEJOIN[40][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 3' is a cross product +PREHOOK: query: select id,cume_dist() over (order by id),ds_kll_cdf(ds, CAST(id AS FLOAT) - 0.5/ds_kll_n(ds))[0] +from sketch_input +join ( select ds_kll_sketch(cast(id as float)) as ds from sketch_input ) q +order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,cume_dist() over (order by id),ds_kll_cdf(ds, CAST(id AS FLOAT) - 0.5/ds_kll_n(ds))[0] +from sketch_input +join ( select ds_kll_sketch(cast(id as float)) as ds from sketch_input ) q +order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 0.0 0.0 +1 0.0 0.0 +2 0.09090909090909091 0.09090909090909091 +3 0.13636363636363635 0.13636363636363635 +4 0.18181818181818182 0.18181818181818182 +5 0.22727272727272727 0.22727272727272727 +6 0.2727272727272727 0.2727272727272727 +6 0.2727272727272727 0.2727272727272727 +6 0.2727272727272727 0.2727272727272727 +7 0.4090909090909091 0.4090909090909091 +7 0.4090909090909091 0.4090909090909091 +8 0.5 0.5 +8 0.5 0.5 +9 0.5909090909090909 0.5909090909090909 +9 0.5909090909090909 0.5909090909090909 +10 0.6818181818181818 0.6818181818181818 +10 0.6818181818181818 0.6818181818181818 +11 0.7727272727272727 0.7727272727272727 +12 0.8181818181818182 0.8181818181818182 +13 0.8636363636363636 0.8636363636363636 +14 0.9090909090909091 0.9090909090909091 +15 0.9545454545454546 0.9545454545454546 +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 4 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(id) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Select Operator + expressions: UDFToFloat(id) (type: float) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col0) + keys: true (type: boolean) + minReductionHashAggr: 0.95454544 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 22 Data size: 3520 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), ds_kll_cdf(_col2, UDFToFloat((_col1 - _col3)))[0] (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), 'rewrite' (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2266 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 2266 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary), (0.5D / ds_kll_n(_col1)) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary), _col1 (type: double) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 rewrite 0.0 +1 rewrite 0.0 +2 rewrite 0.09090909090909091 +3 rewrite 0.13636363636363635 +4 rewrite 0.18181818181818182 +5 rewrite 0.22727272727272727 +6 rewrite 0.2727272727272727 +6 rewrite 0.2727272727272727 +6 rewrite 0.2727272727272727 +7 rewrite 0.4090909090909091 +7 rewrite 0.4090909090909091 +8 rewrite 0.5 +8 rewrite 0.5 +9 rewrite 0.5909090909090909 +9 rewrite 0.5909090909090909 +10 rewrite 0.6818181818181818 +10 rewrite 0.6818181818181818 +11 rewrite 0.7727272727272727 +12 rewrite 0.8181818181818182 +13 rewrite 0.8636363636363636 +14 rewrite 0.9090909090909091 +15 rewrite 0.9545454545454546