diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index e3ddbf1..8c838fa 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2465,6 +2465,12 @@ "If the number of references to a CTE clause exceeds this threshold, Hive will materialize it\n" + "before executing the main query block. -1 will disable this feature."), + HIVE_OPTIMIZE_REWRITE_COUNTDISTINCT_ENABLED("hive.optimize.sketches.rewrite.countdistintct.enabled", false, + "Enables to rewrite COUNT(DISTINCT(X)) queries to be rewritten to use sketch functions."), + + HIVE_OPTIMIZE_REWRITE_COUNT_DISTINCT_SKETCHCLASS("hive.optimize.sketches.rewrite.countdistintct.sketchclass", "hll", + "When rewriting COUNT(DISTINCT(X)) expressions use the given sketch class."), + // Statistics HIVE_STATS_ESTIMATE_STATS("hive.stats.estimate", true, "Estimate statistics in absence of statistics."), diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index c55f8db..7ffd04d 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -820,6 +820,7 @@ schq_ingest.q,\ sketches_hll.q,\ sketches_theta.q,\ + sketches_rewrite.q,\ sketches_materialized_view_rollup.q,\ table_access_keys_stats.q,\ temp_table_llap_partitioned.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java index eec90c6..ebe9f48 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java @@ -21,18 +21,19 @@ import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Optional; import 
org.apache.calcite.rel.type.RelDataTypeImpl; import org.apache.calcite.rel.type.RelProtoDataType; import org.apache.calcite.sql.SqlFunction; +import org.apache.calcite.sql.SqlFunctionCategory; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.type.InferTypes; import org.apache.calcite.sql.type.OperandTypes; import org.apache.calcite.sql.type.ReturnTypes; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveMergeableAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSqlFunction; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver2; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hive.plugin.api.HiveUDFPlugin; @@ -48,9 +49,9 @@ private static final String DATASKETCHES_PREFIX = "ds"; - private static final String DATA_TO_SKETCH = "sketch"; + public static final String DATA_TO_SKETCH = "sketch"; + public static final String SKETCH_TO_ESTIMATE = "estimate"; private static final String SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS = "estimate_bounds"; - private static final String SKETCH_TO_ESTIMATE = "estimate"; private static final String SKETCH_TO_STRING = "stringify"; private static final String UNION_SKETCH = "union"; private static final String UNION_SKETCH1 = "union_f"; @@ -73,12 +74,12 @@ private static final String SKETCH_TO_VARIANCES = "variances"; private static final String SKETCH_TO_PERCENTILE = "percentile"; - private final List sketchClasses; + private final Map sketchClasses; private final ArrayList descriptors; private DataSketchesFunctions() { - this.sketchClasses = new ArrayList(); - this.descriptors = new ArrayList(); + this.sketchClasses = new HashMap<>(); + this.descriptors = new ArrayList<>(); registerHll(); registerCpc(); registerKll(); @@ -96,19 +97,31 @@ return descriptors; } + public SketchFunctionDescriptor getSketchFunction(String className, String function) { + if (!sketchClasses.containsKey(className)) 
{ + throw new IllegalArgumentException(String.format("Sketch-class '%s' doesn't exist", className)); + } + SketchDescriptor sc = sketchClasses.get(className); + if (!sc.fnMap.containsKey(function)) { + throw new IllegalArgumentException(String.format("The Sketch-class '%s' doesn't have a '%s' method", className, function)); + } + return sketchClasses.get(className).fnMap.get(function); + } + private void buildDescritors() { - for (SketchDescriptor sketchDescriptor : sketchClasses) { + for (SketchDescriptor sketchDescriptor : sketchClasses.values()) { descriptors.addAll(sketchDescriptor.fnMap.values()); } } private void buildCalciteFns() { - for (SketchDescriptor sd : sketchClasses) { + for (SketchDescriptor sd : sketchClasses.values()) { // Mergability is exposed to Calcite; which enables to use it during rollup. RelProtoDataType sketchType = RelDataTypeImpl.proto(SqlTypeName.BINARY, true); SketchFunctionDescriptor sketchSFD = sd.fnMap.get(DATA_TO_SKETCH); SketchFunctionDescriptor unionSFD = sd.fnMap.get(UNION_SKETCH); + SketchFunctionDescriptor estimateSFD = sd.fnMap.get(SKETCH_TO_ESTIMATE); if (sketchSFD == null || unionSFD == null) { continue; @@ -128,14 +141,26 @@ OperandTypes.family(), unionFn); + unionSFD.setCalciteFunction(unionFn); sketchSFD.setCalciteFunction(sketchFn); + if (estimateSFD != null) { + SqlFunction estimateFn = new HiveSqlFunction(estimateSFD.name, + SqlKind.OTHER_FUNCTION, + ReturnTypes.explicit(SqlTypeName.DOUBLE), + InferTypes.ANY_NULLABLE, + OperandTypes.family(), + SqlFunctionCategory.USER_DEFINED_FUNCTION, + true, + false); + + estimateSFD.setCalciteFunction(estimateFn); + } } } - private void registerHiveFunctionsInternal(Registry system) { - for (SketchDescriptor sketchDescriptor : sketchClasses) { + for (SketchDescriptor sketchDescriptor : sketchClasses.values()) { Collection functions = sketchDescriptor.fnMap.values(); for (SketchFunctionDescriptor fn : functions) { if (UDF.class.isAssignableFrom(fn.udfClass)) { @@ -189,6 +214,11 @@ public 
void setCalciteFunction(SqlFunction calciteFunction) { this.calciteFunction = calciteFunction; } + + @Override + public String toString() { + return getClass().getCanonicalName() + "[" + name + "]"; + } } private static class SketchDescriptor { @@ -214,7 +244,7 @@ sd.register(SKETCH_TO_STRING, org.apache.datasketches.hive.hll.SketchToStringUDF.class); sd.register(UNION_SKETCH1, org.apache.datasketches.hive.hll.UnionSketchUDF.class); sd.register(UNION_SKETCH, org.apache.datasketches.hive.hll.UnionSketchUDAF.class); - sketchClasses.add(sd); + sketchClasses.put("hll", sd); } private void registerCpc() { @@ -228,7 +258,7 @@ sd.register(SKETCH_TO_STRING, org.apache.datasketches.hive.cpc.SketchToStringUDF.class); sd.register(UNION_SKETCH1, org.apache.datasketches.hive.cpc.UnionSketchUDF.class); sd.register(UNION_SKETCH, org.apache.datasketches.hive.cpc.UnionSketchUDAF.class); - sketchClasses.add(sd); + sketchClasses.put("cpc", sd); } private void registerKll() { @@ -244,7 +274,7 @@ sd.register(GET_QUANTILES, org.apache.datasketches.hive.kll.GetQuantilesUDF.class); sd.register(GET_QUANTILE, org.apache.datasketches.hive.kll.GetQuantileUDF.class); sd.register(GET_RANK, org.apache.datasketches.hive.kll.GetRankUDF.class); - sketchClasses.add(sd); + sketchClasses.put("kll", sd); } private void registerTheta() { @@ -258,7 +288,7 @@ sd.register(INTERSECT_SKETCH, org.apache.datasketches.hive.theta.IntersectSketchUDAF.class); sd.register(SKETCH_TO_ESTIMATE, org.apache.datasketches.hive.theta.EstimateSketchUDF.class); sd.register(EXCLUDE_SKETCH, org.apache.datasketches.hive.theta.ExcludeSketchUDF.class); - sketchClasses.add(sd); + sketchClasses.put("theta", sd); } @@ -284,7 +314,7 @@ org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToQuantilesSketchUDF.class); sd.register(SKETCH_TO_VALUES, org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToValuesUDTF.class); sd.register(SKETCH_TO_VARIANCES, org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToVariancesUDF.class); - 
sketchClasses.add(sd); + sketchClasses.put("tuple_arrayofdouble", sd); } private void registerTupleDoubleSummary() { @@ -294,7 +324,7 @@ sd.register(UNION_SKETCH, org.apache.datasketches.hive.tuple.UnionDoubleSummarySketchUDAF.class); sd.register(SKETCH_TO_ESTIMATE, org.apache.datasketches.hive.tuple.DoubleSummarySketchToEstimatesUDF.class); sd.register(SKETCH_TO_PERCENTILE, org.apache.datasketches.hive.tuple.DoubleSummarySketchToPercentileUDF.class); - sketchClasses.add(sd); + sketchClasses.put("tuple_doublesummary", sd); } private void registerQuantiles() { @@ -312,7 +342,7 @@ sd.register(UNION_SKETCH, org.apache.datasketches.hive.frequencies.UnionStringsSketchUDAF.class); sd.register(GET_FREQUENT_ITEMS, org.apache.datasketches.hive.frequencies.GetFrequentItemsFromStringsSketchUDTF.class); - sketchClasses.add(sd); + sketchClasses.put("freq", sd); } private void registerQuantilesString() { @@ -327,7 +357,7 @@ sd.register(GET_PMF, org.apache.datasketches.hive.quantiles.GetPmfFromStringsSketchUDF.class); sd.register(GET_QUANTILE, org.apache.datasketches.hive.quantiles.GetQuantileFromStringsSketchUDF.class); sd.register(GET_QUANTILES, org.apache.datasketches.hive.quantiles.GetQuantilesFromStringsSketchUDF.class); - sketchClasses.add(sd); + sketchClasses.put("quantile_strings", sd); } private void registerQuantilesDoubles() { @@ -342,7 +372,7 @@ sd.register(GET_PMF, org.apache.datasketches.hive.quantiles.GetPmfFromDoublesSketchUDF.class); sd.register(GET_QUANTILE, org.apache.datasketches.hive.quantiles.GetQuantileFromDoublesSketchUDF.class); sd.register(GET_QUANTILES, org.apache.datasketches.hive.quantiles.GetQuantilesFromDoublesSketchUDF.class); - sketchClasses.add(sd); + sketchClasses.put("quantile_doubles", sd); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java new file mode 100644 index 
0000000..6c90f77 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Aggregate; +import org.apache.calcite.rel.core.AggregateCall; +import org.apache.calcite.rel.core.RelFactories.AggregateFactory; +import org.apache.calcite.rel.core.RelFactories.ProjectFactory; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlAggFunction; +import org.apache.calcite.sql.SqlOperator; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories; +import 
org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hive.plugin.api.HiveUDFPlugin.UDFDescriptor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.ImmutableList; + +/** + * This rule could rewrite {@code count(distinct(x))} calls to be calculated using sketch based functions. + */ +public final class HiveRewriteCountDistinctToDataSketches extends RelOptRule { + + protected static final Logger LOG = LoggerFactory.getLogger(HiveRewriteCountDistinctToDataSketches.class); + private String sketchClass; + + public HiveRewriteCountDistinctToDataSketches(HiveConf conf) { + super(operand(HiveAggregate.class, any())); + sketchClass = conf.getVar(ConfVars.HIVE_OPTIMIZE_REWRITE_COUNT_DISTINCT_SKETCHCLASS); + } + + @Override + public void onMatch(RelOptRuleCall call) { + final Aggregate aggregate = call.rel(0); + + if (aggregate.getGroupSets().size() != 1) { + // not yet supported + return; + } + + List newAggCalls = new ArrayList(); + + AggregateFactory f = HiveRelFactories.HIVE_AGGREGATE_FACTORY; + + VBuilder vb = new VBuilder(aggregate); + + ProjectFactory projectFactory = HiveRelFactories.HIVE_PROJECT_FACTORY; + + if (aggregate.getAggCallList().equals(vb.newAggCalls)) { + // rule didn't made any changes + return; + } + + newAggCalls = vb.newAggCalls; + // FIXME HiveAggregate? + RelNode newAgg = aggregate.copy(aggregate.getTraitSet(), aggregate.getInput(), aggregate.getGroupSet(), + aggregate.getGroupSets(), newAggCalls); + + RelNode newProject = projectFactory.createProject(newAgg, vb.newProjects, aggregate.getRowType().getFieldNames()); + + call.transformTo(newProject); + return; + } + + /** + * Helper class to help in building a new Aggregate and Project. 
+ */ + // NOTE: methods in this class are not re-entrant; drop-to-frame to constructor during debugging + class VBuilder { + + private Aggregate aggregate; + private List newAggCalls; + private List newProjects; + private final RexBuilder rexBuilder; + + public VBuilder(Aggregate aggregate) { + this.aggregate = aggregate; + newAggCalls = new ArrayList(); + newProjects = new ArrayList(); + rexBuilder = aggregate.getCluster().getRexBuilder(); + + // add non-aggregated fields - as identity projections + addGroupFields(); + + for (AggregateCall aggCall : aggregate.getAggCallList()) { + processAggCall(aggCall); + } + } + + private void addGroupFields() { + for (int i = 0; i < aggregate.getGroupCount(); i++) { + newProjects.add(rexBuilder.makeInputRef(aggregate, i)); + } + } + + private void processAggCall(AggregateCall aggCall) { + if (isSimpleCountDistinct(aggCall)) { + rewriteCountDistinct(aggCall); + return; + } + appendAggCall(aggCall, null); + } + + private void appendAggCall(AggregateCall aggCall, SqlOperator projectOperator) { + RelDataType origType = aggregate.getRowType().getFieldList().get(newProjects.size()).getType(); + RexNode projRex = rexBuilder.makeInputRef(aggCall.getType(), newProjects.size()); + if (projectOperator != null) { + projRex = rexBuilder.makeCall(projectOperator, ImmutableList.of(projRex)); + projRex = rexBuilder.makeCast(origType, projRex); + } + newAggCalls.add(aggCall); + newProjects.add(projRex); + } + + private boolean isSimpleCountDistinct(AggregateCall aggCall) { + return aggCall.isDistinct() && aggCall.getArgList().size() == 1 + && aggCall.getAggregation().getName().equalsIgnoreCase("count") && !aggCall.hasFilter(); + } + + private void rewriteCountDistinct(AggregateCall aggCall) { + + SqlAggFunction aggFunction = (SqlAggFunction) getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH); + boolean distinct = false; + boolean approximate = true; + boolean ignoreNulls = aggCall.ignoreNulls(); + List argList = aggCall.getArgList(); + int 
filterArg = aggCall.filterArg; + RelCollation collation = aggCall.getCollation(); + int groupCount = aggregate.getGroupCount(); + RelNode input = aggregate.getInput(); + RelDataType type = rexBuilder.deriveReturnType(aggFunction, Collections.emptyList()); + String name = aggFunction.getName(); + + AggregateCall ret = AggregateCall.create(aggFunction, distinct, approximate, ignoreNulls, argList, filterArg, + collation, groupCount, input, type, name); + + appendAggCall(ret, getSqlOperator(DataSketchesFunctions.SKETCH_TO_ESTIMATE)); + } + + private SqlOperator getSqlOperator(String fnName) { + UDFDescriptor fn = DataSketchesFunctions.INSTANCE.getSketchFunction(sketchClass, fnName); + if (!fn.getCalciteFunction().isPresent()) { + throw new RuntimeException(fn.toString() + " doesn't have a Calcite function associated with it"); + } + return fn.getCalciteFunction().get(); + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 7b2e201..fce68f9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -236,6 +236,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelFieldTrimmer; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRemoveGBYSemiJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRemoveSqCountCheck; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRewriteCountDistinctToDataSketches; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSemiJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule; @@ -1967,6 +1968,13 @@ generatePartialProgram(program, false, HepMatchOrder.DEPTH_FIRST, HiveExceptRewriteRule.INSTANCE); + // Rewrite count(distinct) into sketch based estimations; must run before the distinct aggregate rewrite below expands the plan 
+ if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_REWRITE_COUNTDISTINCT_ENABLED)) { + generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, + new HiveRewriteCountDistinctToDataSketches(conf)); + } + + //1. Distinct aggregate rewrite // Run this optimization early, since it is expanding the operator pipeline. if (!conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr") && @@ -2278,7 +2286,7 @@ RelMetadataQuery.THREAD_PROVIDERS.set(JaninoRelMetadataProvider.DEFAULT); RelMetadataQuery mq = RelMetadataQuery.instance(); RelOptCost costOriginalPlan = mq.getCumulativeCost(calcitePreMVRewritingPlan); - final double factorSelectivity = (double) HiveConf.getFloatVar( + final double factorSelectivity = HiveConf.getFloatVar( conf, HiveConf.ConfVars.HIVE_MATERIALIZED_VIEW_REBUILD_INCREMENTAL_FACTOR); RelOptCost costRebuildPlan = mq.getCumulativeCost(basePlan).multiplyBy(factorSelectivity); if (costOriginalPlan.isLe(costRebuildPlan)) { diff --git ql/src/test/queries/clientpositive/sketches_rewrite.q ql/src/test/queries/clientpositive/sketches_rewrite.q new file mode 100644 index 0000000..6e72658 --- /dev/null +++ ql/src/test/queries/clientpositive/sketches_rewrite.q @@ -0,0 +1,24 @@ + +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.strict.checks.cartesian.product=false; +set hive.stats.fetch.column.stats=true; +set hive.materializedview.rewriting=true; +set hive.fetch.task.conversion=none; +set hive.optimize.sketches.rewrite.countdistintct.enabled=true; + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +-- see if we use the mv +explain +select category, count(distinct id) 
from sketch_input group by category; + +select category, count(distinct id) from sketch_input group by category; + diff --git ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out new file mode 100644 index 0000000..dedcff9 --- /dev/null +++ ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out @@ -0,0 +1,110 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: explain +select category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select category, count(distinct id) from 
sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 1958 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), category (type: char(1)) + outputColumnNames: id, category + Statistics: Num rows: 22 Data size: 1958 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_hll_sketch(id) + keys: category (type: char(1)) + minReductionHashAggr: 0.9090909 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: struct) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_hll_sketch(VALUE._col0) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: char(1)), UDFToLong(ds_hll_estimate(_col1)) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE + table: + input 
format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +a 10 +b 10