diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 1c7905db31..dcd705668c 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -1768,7 +1768,8 @@ spark.only.query.negative.files=spark_job_max_tasks.q,\ tez.perf.disabled.query.files=mv_query44.q,\ mv_query67.q -spark.perf.disabled.query.files=query14.q,\ +spark.perf.disabled.query.files=query1b.q,\ + query14.q,\ query64.q,\ cbo_query1.q,\ cbo_ext_query1.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java index 46a34b9aa7..4c27c5b8ab 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java @@ -58,6 +58,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo; import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc; @@ -73,6 +74,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFInBloomFilter; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -357,33 +359,63 @@ private static boolean sharedWorkOptimization(ParseContext pctx, SharedWorkOptim LOG.debug("Merging subtree starting at {} into subtree starting at {}", discardableTsOp, retainableTsOp); } else { + ExprNodeDesc newRetainableTsFilterExpr = null; + List semijoinExprNodes = new ArrayList<>(); if (retainableTsOp.getConf().getFilterExpr() != null) { + // Gather SJ expressions and normal expressions + List allExprNodesExceptSemijoin = new ArrayList<>(); + splitExpressions(retainableTsOp.getConf().getFilterExpr(), + allExprNodesExceptSemijoin, semijoinExprNodes); + // Create new expressions + if (allExprNodesExceptSemijoin.size() > 1) { + newRetainableTsFilterExpr = ExprNodeGenericFuncDesc.newInstance( + new GenericUDFOPAnd(), allExprNodesExceptSemijoin); + } else if (allExprNodesExceptSemijoin.size() > 0 && + allExprNodesExceptSemijoin.get(0) instanceof ExprNodeGenericFuncDesc) { + newRetainableTsFilterExpr = allExprNodesExceptSemijoin.get(0); + } // Push filter on top of children for retainable pushFilterToTopOfTableScan(optimizerCache, retainableTsOp); } + ExprNodeDesc newDiscardableTsFilterExpr = null; if (discardableTsOp.getConf().getFilterExpr() != null) { + // If there is a single discardable operator, it is a TableScanOperator + // and it means that we will merge filter expressions for it. Thus, we + // might need to remove DPP predicates before doing that + List allExprNodesExceptSemijoin = new ArrayList<>(); + splitExpressions(discardableTsOp.getConf().getFilterExpr(), + allExprNodesExceptSemijoin, new ArrayList<>()); + // Create new expressions + if (allExprNodesExceptSemijoin.size() > 1) { + newDiscardableTsFilterExpr = ExprNodeGenericFuncDesc.newInstance( + new GenericUDFOPAnd(), allExprNodesExceptSemijoin); + } else if (allExprNodesExceptSemijoin.size() > 0 && + allExprNodesExceptSemijoin.get(0) instanceof ExprNodeGenericFuncDesc) { + newDiscardableTsFilterExpr = allExprNodesExceptSemijoin.get(0); + } + // Remove and add semijoin filter from expressions + replaceSemijoinExpressions(discardableTsOp, semijoinExprNodes); // Push filter on top of children for discardable pushFilterToTopOfTableScan(optimizerCache, discardableTsOp); } // Obtain filter for shared TS operator ExprNodeGenericFuncDesc exprNode = null; - if (retainableTsOp.getConf().getFilterExpr() != null && discardableTsOp.getConf().getFilterExpr() != null) { + if (newRetainableTsFilterExpr != null && newDiscardableTsFilterExpr != null) { // Combine - exprNode = retainableTsOp.getConf().getFilterExpr(); - ExprNodeGenericFuncDesc tsExprNode = discardableTsOp.getConf().getFilterExpr(); - if (!exprNode.isSame(tsExprNode)) { + exprNode = (ExprNodeGenericFuncDesc) newRetainableTsFilterExpr; + if (!exprNode.isSame(newDiscardableTsFilterExpr)) { // We merge filters from previous scan by ORing with filters from current scan if (exprNode.getGenericUDF() instanceof GenericUDFOPOr) { List newChildren = new ArrayList<>(exprNode.getChildren().size() + 1); for (ExprNodeDesc childExprNode : exprNode.getChildren()) { - if (childExprNode.isSame(tsExprNode)) { + if (childExprNode.isSame(newDiscardableTsFilterExpr)) { // We do not need to do anything, it is in the OR expression break; } newChildren.add(childExprNode); } if (exprNode.getChildren().size() == newChildren.size()) { - newChildren.add(tsExprNode); + newChildren.add(newDiscardableTsFilterExpr); exprNode = ExprNodeGenericFuncDesc.newInstance( new GenericUDFOPOr(), newChildren); @@ -391,10 +423,22 @@ private static boolean sharedWorkOptimization(ParseContext pctx, SharedWorkOptim } else { exprNode = ExprNodeGenericFuncDesc.newInstance( new GenericUDFOPOr(), - Arrays.asList(exprNode, tsExprNode)); + Arrays.asList(exprNode, newDiscardableTsFilterExpr)); } } } + // Create expression node that will be used for the retainable table scan + if (!semijoinExprNodes.isEmpty()) { + if (exprNode != null) { + semijoinExprNodes.add(0, exprNode); + } + if (semijoinExprNodes.size() > 1) { + exprNode = ExprNodeGenericFuncDesc.newInstance( + new GenericUDFOPAnd(), semijoinExprNodes); + } else { + exprNode = (ExprNodeGenericFuncDesc) semijoinExprNodes.get(0); + } + } // Replace filter retainableTsOp.getConf().setFilterExpr(exprNode); // Replace table scan operator @@ -443,26 +487,6 @@ private static boolean sharedWorkOptimization(ParseContext pctx, SharedWorkOptim OperatorUtils.removeOperator(op); optimizerCache.removeOp(op); removedOps.add(op); - if (sr.discardableOps.size() == 1) { - // If there is a single discardable operator, it is a TableScanOperator - // and it means that we have merged filter expressions for it. Thus, we - // might need to remove DPP predicates from the retainable TableScanOperator - Collection> c = - optimizerCache.tableScanToDPPSource.get((TableScanOperator) op); - for (Operator dppSource : c) { - if (dppSource instanceof ReduceSinkOperator) { - GenTezUtils.removeSemiJoinOperator(pctx, - (ReduceSinkOperator) dppSource, - (TableScanOperator) sr.retainableOps.get(0)); - optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), op); - } else if (dppSource instanceof AppMasterEventOperator) { - GenTezUtils.removeSemiJoinOperator(pctx, - (AppMasterEventOperator) dppSource, - (TableScanOperator) sr.retainableOps.get(0)); - optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), op); - } - } - } LOG.debug("Operator removed: {}", op); } @@ -486,6 +510,75 @@ private static boolean sharedWorkOptimization(ParseContext pctx, SharedWorkOptim return mergedExecuted; } + private static void replaceSemijoinExpressions(TableScanOperator tsOp, List semijoinExprNodes) { + ExprNodeDesc constNode = new ExprNodeConstantDesc( + TypeInfoFactory.booleanTypeInfo, Boolean.TRUE); + // TS operator + if (tsOp.getConf().getFilterExpr() != null) { + ExprNodeDesc tsFilterExpr = tsOp.getConf().getFilterExpr(); + if (FunctionRegistry.isOpAnd(tsFilterExpr)) { + tsFilterExpr.getChildren().removeIf(SharedWorkOptimizer::isSemijoinExpr); + tsFilterExpr.getChildren().addAll(semijoinExprNodes); + if (tsFilterExpr.getChildren().isEmpty() || + (tsFilterExpr.getChildren().size() == 1 && !(tsFilterExpr.getChildren().get(0) instanceof ExprNodeGenericFuncDesc))) { + tsOp.getConf().setFilterExpr(null); + } + } + } + // Filter operators on top + if (tsOp.getChildOperators() != null) { + for (Operator op : tsOp.getChildOperators()) { + if (op instanceof FilterOperator) { + FilterOperator filterOp = (FilterOperator) op; + ExprNodeDesc filterExpr = filterOp.getConf().getPredicate(); + if (FunctionRegistry.isOpAnd(filterExpr)) { + filterExpr.getChildren().removeIf(SharedWorkOptimizer::isSemijoinExpr); + if (filterExpr.getChildren().isEmpty()) { + filterOp.getConf().setPredicate(constNode); + } else if (filterExpr.getChildren().size() == 1) { + filterOp.getConf().setPredicate(filterExpr.getChildren().get(0)); + } + } + } + } + } + } + + private static boolean isSemijoinExpr(ExprNodeDesc expr) { + if (expr instanceof ExprNodeDynamicListDesc) { + // DYNAMIC PARTITION PRUNING + return true; + } + if (FunctionRegistry.isOpBetween(expr) && + expr.getChildren().get(2) instanceof ExprNodeDynamicValueDesc) { + // BETWEEN in SJ + return true; + } + if (FunctionRegistry.isOpInBloomFilter(expr) && + expr.getChildren().get(1) instanceof ExprNodeDynamicValueDesc) { + // IN_BLOOM_FILTER in SJ + return true; + } + return false; + } + + private static void splitExpressions(ExprNodeDesc exprNode, + List allExprNodesExceptSemijoin, List semijoinExprNodes) { + if (FunctionRegistry.isOpAnd(exprNode)) { + for (ExprNodeDesc expr : exprNode.getChildren()) { + if (isSemijoinExpr(expr)) { + semijoinExprNodes.add(expr); + } else { + allExprNodesExceptSemijoin.add(expr); + } + } + } else if (isSemijoinExpr(exprNode)) { + semijoinExprNodes.add(exprNode); + } else { + allExprNodesExceptSemijoin.add(exprNode); + } + } + private static void sharedWorkExtendedOptimization(ParseContext pctx, SharedWorkOptimizerCache optimizerCache) throws SemanticException { // Gather RS operators that 1) belong to root works, i.e., works containing TS operators, @@ -1679,7 +1772,7 @@ private static void pushFilterToTopOfTableScan( } else { ExprNodeGenericFuncDesc newPred = ExprNodeGenericFuncDesc.newInstance( new GenericUDFOPAnd(), - Arrays.asList(tableScanExprNode.clone(), filterExprNode)); + Arrays.asList(tableScanExprNode.clone(), filterExprNode)); filterOp.getConf().setPredicate(newPred); } } else { diff --git a/ql/src/test/queries/clientpositive/perf/query1b.q b/ql/src/test/queries/clientpositive/perf/query1b.q new file mode 100644 index 0000000000..ef6745f777 --- /dev/null +++ b/ql/src/test/queries/clientpositive/perf/query1b.q @@ -0,0 +1,37 @@ +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +set hive.stats.fetch.column.stats=true; +set hive.auto.convert.join=true; +set hive.auto.convert.join.noconditionaltask=true; +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.partition.pruning.extended=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.tez.bigtable.minsize.semijoin.reduction=1; +set hive.tez.bloom.filter.factor=1.0f; +set hive.tez.min.bloom.filter.entries=1; +set hive.tez.max.bloom.filter.entries=10000000000; + + +explain +with customer_total_return as +(select sr_customer_sk as ctr_customer_sk +,sr_store_sk as ctr_store_sk +,sum(SR_FEE) as ctr_total_return +from store_returns +,date_dim +where sr_returned_date_sk = d_date_sk +and d_year =2000 +group by sr_customer_sk +,sr_store_sk) + select c_customer_id +from customer_total_return ctr1 +,store +,customer +where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 +from customer_total_return ctr2 +where ctr1.ctr_store_sk = ctr2.ctr_store_sk) +and s_store_sk = ctr1.ctr_store_sk +and s_state = 'NM' +and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100; diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out new file mode 100644 index 0000000000..126edcad0a --- /dev/null +++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out @@ -0,0 +1,395 @@ +PREHOOK: query: explain +with customer_total_return as +(select sr_customer_sk as ctr_customer_sk +,sr_store_sk as ctr_store_sk +,sum(SR_FEE) as ctr_total_return +from store_returns +,date_dim +where sr_returned_date_sk = d_date_sk +and d_year =2000 +group by sr_customer_sk +,sr_store_sk) + select c_customer_id +from customer_total_return ctr1 +,store +,customer +where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 +from customer_total_return ctr2 +where ctr1.ctr_store_sk = ctr2.ctr_store_sk) +and s_store_sk = ctr1.ctr_store_sk +and s_state = 'NM' +and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@customer +PREHOOK: Input: default@date_dim +PREHOOK: Input: default@store +PREHOOK: Input: default@store_returns +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain +with customer_total_return as +(select sr_customer_sk as ctr_customer_sk +,sr_store_sk as ctr_store_sk +,sum(SR_FEE) as ctr_total_return +from store_returns +,date_dim +where sr_returned_date_sk = d_date_sk +and d_year =2000 +group by sr_customer_sk +,sr_store_sk) + select c_customer_id +from customer_total_return ctr1 +,store +,customer +where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 +from customer_total_return ctr2 +where ctr1.ctr_store_sk = ctr2.ctr_store_sk) +and s_store_sk = ctr1.ctr_store_sk +and s_state = 'NM' +and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer +POSTHOOK: Input: default@date_dim +POSTHOOK: Input: default@store +POSTHOOK: Input: default@store_returns +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 11 <- Reducer 7 (BROADCAST_EDGE) + Map 3 <- Map 10 (BROADCAST_EDGE), Map 9 (BROADCAST_EDGE), Reducer 2 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (BROADCAST_EDGE), Map 3 (SIMPLE_EDGE), Reducer 8 (BROADCAST_EDGE) + Reducer 5 <- Map 11 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) + Reducer 6 <- Reducer 5 (SIMPLE_EDGE) + Reducer 7 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) + Reducer 8 <- Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: store + filterExpr: (s_state = 'NM') (type: boolean) + Statistics: Num rows: 1704 Data size: 153360 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (s_state = 'NM') (type: boolean) + Statistics: Num rows: 35 Data size: 3150 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: s_store_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=35) + minReductionHashAggr: 0.9714286 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized + Map 10 + Map Operator Tree: + TableScan + alias: date_dim + filterExpr: (d_year = 2000) (type: boolean) + Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (d_year = 2000) (type: boolean) + Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_date_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized + Map 11 + Map Operator Tree: + TableScan + alias: customer + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (c_customer_sk BETWEEN DynamicValue(RS_47_store_returns_sr_customer_sk_min) AND DynamicValue(RS_47_store_returns_sr_customer_sk_max) and in_bloom_filter(c_customer_sk, DynamicValue(RS_47_store_returns_sr_customer_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: c_customer_sk (type: int), c_customer_id (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: vectorized + Map 3 + Map Operator Tree: + TableScan + alias: store_returns + filterExpr: (((sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null) or (sr_store_sk is not null and sr_returned_date_sk is not null)) and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 57591150 Data size: 6891360020 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2)) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1, _col2, _col3 + input vertices: + 1 Map 9 + Statistics: Num rows: 16855704 Data size: 1805298496 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Group By Operator + aggregations: sum(_col3) + keys: _col2 (type: int), _col1 (type: int) + minReductionHashAggr: 0.8699312 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int), _col1 (type: int) + Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: decimal(17,2)) + Filter Operator + predicate: (sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2)) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1, _col2, _col3 + input vertices: + 1 Map 10 + Statistics: Num rows: 17467258 Data size: 1870797840 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Group By Operator + aggregations: sum(_col3) + keys: _col2 (type: int), _col1 (type: int) + minReductionHashAggr: 0.85786486 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: decimal(17,2)) + Execution mode: vectorized + Map 9 + Map Operator Tree: + TableScan + alias: date_dim + filterExpr: (d_year = 2000) (type: boolean) + Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (d_year = 2000) (type: boolean) + Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_date_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=35) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 4 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: int), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: _col2 is not null (type: boolean) + Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: int), _col0 (type: int), _col2 (type: decimal(17,2)) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col1 (type: int) + outputColumnNames: _col1, _col2, _col3 + input vertices: + 0 Map 1 + Statistics: Num rows: 1923224 Data size: 220816368 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col2 (type: int) + 1 _col1 (type: int) + outputColumnNames: _col1, _col3, _col4 + input vertices: + 1 Reducer 8 + Statistics: Num rows: 1991910 Data size: 449166736 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Filter Operator + predicate: (_col3 > _col4) (type: boolean) + Statistics: Num rows: 663970 Data size: 149722248 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Map-reduce partition columns: _col1 (type: int) + Statistics: Num rows: 663970 Data size: 149722248 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 663970 Data size: 1985936 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=68687) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 5 + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col7 + Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col7 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Reducer 6 + Execution mode: vectorized + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 100 + Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 7 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=68687) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 8 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: int), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), _col2 (type: decimal(17,2)) + outputColumnNames: _col1, _col2 + Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: sum(_col2), count(_col2) + keys: _col1 (type: int) + mode: complete + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (_col1 is not null and _col2 is not null) (type: boolean) + Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ((_col1 / _col2) * 1.2) (type: decimal(38,11)), _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Map-reduce partition columns: _col1 (type: int) + Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: decimal(38,11)) + + Stage: Stage-0 + Fetch Operator + limit: 100 + Processor Tree: + ListSink + diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out index 092f79028d..e8a23e714b 100644 --- a/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out @@ -115,7 +115,7 @@ Stage-0 Select Operator [SEL_114] (rows=285116600 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_112] (rows=285116600 width=119) - predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null) + predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) TableScan [TS_0] (rows=287989836 width=119) default@catalog_sales,catalog_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["cs_sold_date_sk","cs_item_sk","cs_ext_discount_amt"] <-Reducer 10 [BROADCAST_EDGE] vectorized @@ -160,7 +160,7 @@ Stage-0 Select Operator [SEL_115] (rows=286549727 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=286549727 width=119) - predicate:cs_sold_date_sk is not null + predicate:(cs_sold_date_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Please refer to the previous TableScan [TS_0] <-Map 8 [SIMPLE_EDGE] vectorized SHUFFLE [RS_121] diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out index 7452f08608..0d44384b47 100644 --- a/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out @@ -119,7 +119,7 @@ Stage-0 Select Operator [SEL_114] (rows=143930905 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_112] (rows=143930905 width=119) - predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null) + predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) TableScan [TS_0] (rows=144002668 width=119) default@web_sales,web_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["ws_sold_date_sk","ws_item_sk","ws_ext_discount_amt"] <-Reducer 10 [BROADCAST_EDGE] vectorized @@ -164,7 +164,7 @@ Stage-0 Select Operator [SEL_115] (rows=143966864 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=143966864 width=119) - predicate:ws_sold_date_sk is not null + predicate:(ws_sold_date_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Please refer to the previous TableScan [TS_0] <-Map 8 [SIMPLE_EDGE] vectorized SHUFFLE [RS_121] diff --git a/ql/src/test/results/clientpositive/perf/tez/query1b.q.out b/ql/src/test/results/clientpositive/perf/tez/query1b.q.out new file mode 100644 index 0000000000..c030334818 --- /dev/null +++ b/ql/src/test/results/clientpositive/perf/tez/query1b.q.out @@ -0,0 +1,397 @@ +PREHOOK: query: explain +with customer_total_return as +(select sr_customer_sk as ctr_customer_sk +,sr_store_sk as ctr_store_sk +,sum(SR_FEE) as ctr_total_return +from store_returns +,date_dim +where sr_returned_date_sk = d_date_sk +and d_year =2000 +group by sr_customer_sk +,sr_store_sk) + select c_customer_id +from customer_total_return ctr1 +,store +,customer +where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 +from customer_total_return ctr2 +where ctr1.ctr_store_sk = ctr2.ctr_store_sk) +and s_store_sk = ctr1.ctr_store_sk +and s_state = 'NM' +and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@customer +PREHOOK: Input: default@date_dim +PREHOOK: Input: default@store +PREHOOK: Input: default@store_returns +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain +with customer_total_return as +(select sr_customer_sk as ctr_customer_sk +,sr_store_sk as ctr_store_sk +,sum(SR_FEE) as ctr_total_return +from store_returns +,date_dim +where sr_returned_date_sk = d_date_sk +and d_year =2000 +group by sr_customer_sk +,sr_store_sk) + select c_customer_id +from customer_total_return ctr1 +,store +,customer +where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 +from customer_total_return ctr2 +where ctr1.ctr_store_sk = ctr2.ctr_store_sk) +and s_store_sk = ctr1.ctr_store_sk +and s_state = 'NM' +and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id +limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer +POSTHOOK: Input: default@date_dim +POSTHOOK: Input: default@store +POSTHOOK: Input: default@store_returns +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 10 <- Reducer 7 (BROADCAST_EDGE) + Map 3 <- Map 11 (BROADCAST_EDGE), Map 9 (BROADCAST_EDGE), Reducer 2 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (BROADCAST_EDGE), Map 3 (SIMPLE_EDGE) + Reducer 5 <- Map 10 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE), Reducer 8 (BROADCAST_EDGE) + Reducer 6 <- Reducer 5 (SIMPLE_EDGE) + Reducer 7 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) + Reducer 8 <- Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: store + filterExpr: ((s_state = 'NM') and s_store_sk is not null) (type: boolean) + Statistics: Num rows: 1704 Data size: 153360 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((s_state = 'NM') and s_store_sk is not null) (type: boolean) + Statistics: Num rows: 35 Data size: 3150 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: s_store_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=31) + minReductionHashAggr: 0.9714286 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized + Map 10 + Map Operator Tree: + TableScan + alias: customer + filterExpr: (c_customer_sk is not null and c_customer_sk BETWEEN DynamicValue(RS_44_store_returns_sr_customer_sk_min) AND DynamicValue(RS_44_store_returns_sr_customer_sk_max) and in_bloom_filter(c_customer_sk, DynamicValue(RS_44_store_returns_sr_customer_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (c_customer_sk is not null and c_customer_sk BETWEEN DynamicValue(RS_44_store_returns_sr_customer_sk_min) AND DynamicValue(RS_44_store_returns_sr_customer_sk_max) and in_bloom_filter(c_customer_sk, DynamicValue(RS_44_store_returns_sr_customer_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: c_customer_sk (type: int), c_customer_id (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: vectorized + Map 11 + Map Operator Tree: + TableScan + alias: date_dim + filterExpr: ((d_year = 2000) and d_date_sk is not null) (type: boolean) + Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((d_year = 2000) and d_date_sk is not null) (type: boolean) + Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_date_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized + Map 3 + Map Operator Tree: + TableScan + alias: store_returns + filterExpr: (((sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null) or (sr_store_sk is not null and sr_returned_date_sk is not null)) and sr_store_sk BETWEEN DynamicValue(RS_41_store_s_store_sk_min) AND DynamicValue(RS_41_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_41_store_s_store_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 57591150 Data size: 6891360020 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_41_store_s_store_sk_min) AND DynamicValue(RS_41_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_41_store_s_store_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2)) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1, _col2, _col3 + input vertices: + 1 Map 9 + Statistics: Num rows: 16855704 Data size: 1805298496 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Group By Operator + aggregations: sum(_col3) + keys: _col2 (type: int), _col1 (type: int) + minReductionHashAggr: 0.8699312 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int), _col1 (type: int) + Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: decimal(17,2)) + Filter Operator + predicate: (sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_41_store_s_store_sk_min) AND DynamicValue(RS_41_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_41_store_s_store_sk_bloom_filter))) (type: boolean) + Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2)) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1, _col2, _col3 + input vertices: + 1 Map 11 + Statistics: Num rows: 17467258 Data size: 1870797840 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Group By Operator + aggregations: sum(_col3) + keys: _col2 (type: int), _col1 (type: int) + minReductionHashAggr: 0.85786486 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: decimal(17,2)) + Execution mode: vectorized + Map 9 + Map Operator Tree: + TableScan + alias: date_dim + filterExpr: ((d_year = 2000) and d_date_sk is not null) (type: boolean) + Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((d_year = 2000) and d_date_sk is not null) (type: boolean) + Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_date_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=31) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 4 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: int), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: _col2 is not null (type: boolean) + Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: int), _col0 (type: int), _col2 (type: decimal(17,2)) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col1 (type: int) + outputColumnNames: _col1, _col2, _col3 + input vertices: + 0 Map 1 + Statistics: Num rows: 1923224 Data size: 220816368 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Map-reduce partition columns: _col1 (type: int) + Statistics: Num rows: 1923224 Data size: 220816368 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: int), _col3 (type: decimal(17,2)) + Select Operator + expressions: _col1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1923224 Data size: 5408304 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=68687) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 5 + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col2, _col3, _col5 + Statistics: Num rows: 1923224 Data size: 410434616 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col2 (type: int) + 1 _col1 (type: int) + outputColumnNames: _col3, _col5, _col6 + input vertices: + 1 Reducer 8 + Statistics: Num rows: 1991910 Data size: 645378840 Basic stats: COMPLETE Column stats: COMPLETE + HybridGraceHashJoin: true + Filter Operator + predicate: (_col3 > _col6) (type: boolean) + Statistics: Num rows: 663970 Data size: 215126280 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col5 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Reducer 6 + Execution mode: vectorized + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 100 + Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 7 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=68687) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 8 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: int), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), _col2 (type: decimal(17,2)) + outputColumnNames: _col1, _col2 + Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: sum(_col2), count(_col2) + keys: _col1 (type: int) + mode: complete + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (_col1 is not null and _col2 is not null) (type: boolean) + Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ((_col1 / _col2) * 1.2) (type: decimal(38,11)), _col0 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Map-reduce partition columns: _col1 (type: int) + Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: decimal(38,11)) + + Stage: Stage-0 + Fetch Operator + limit: 100 + Processor Tree: + ListSink + diff --git a/ql/src/test/results/clientpositive/perf/tez/query32.q.out b/ql/src/test/results/clientpositive/perf/tez/query32.q.out index 0b89356db5..97beaf5682 100644 --- a/ql/src/test/results/clientpositive/perf/tez/query32.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/query32.q.out @@ -115,7 +115,7 @@ Stage-0 Select Operator [SEL_114] (rows=285116600 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_112] (rows=285116600 width=119) - predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null and cs_item_sk is not null) + predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null and cs_item_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) TableScan [TS_0] (rows=287989836 width=119) default@catalog_sales,catalog_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["cs_sold_date_sk","cs_item_sk","cs_ext_discount_amt"] <-Reducer 10 [BROADCAST_EDGE] vectorized @@ -160,7 +160,7 @@ Stage-0 Select Operator [SEL_115] (rows=286549727 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=286549727 width=119) - predicate:(cs_sold_date_sk is not null and cs_item_sk is not null) + predicate:(cs_sold_date_sk is not null and cs_item_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Please refer to the previous TableScan [TS_0] <-Map 8 [SIMPLE_EDGE] vectorized SHUFFLE [RS_121] diff --git a/ql/src/test/results/clientpositive/perf/tez/query65.q.out b/ql/src/test/results/clientpositive/perf/tez/query65.q.out index 69b12f6947..3ef64f825e 100644 --- a/ql/src/test/results/clientpositive/perf/tez/query65.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/query65.q.out @@ -197,6 +197,6 @@ Stage-0 Select Operator [SEL_152] (rows=525329897 width=118) Output:["_col0","_col1","_col2","_col3"] Filter Operator [FIL_150] (rows=525329897 width=118) - predicate:(ss_sold_date_sk is not null and ss_store_sk is not null) + predicate:(ss_sold_date_sk is not null and ss_store_sk is not null and ss_sold_date_sk BETWEEN DynamicValue(RS_7_date_dim_d_date_sk_min) AND DynamicValue(RS_7_date_dim_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_date_dim_d_date_sk_bloom_filter))) Please refer to the previous TableScan [TS_0] diff --git a/ql/src/test/results/clientpositive/perf/tez/query92.q.out b/ql/src/test/results/clientpositive/perf/tez/query92.q.out index 0a0d54e79a..edb8961c47 100644 --- a/ql/src/test/results/clientpositive/perf/tez/query92.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/query92.q.out @@ -119,7 +119,7 @@ Stage-0 Select Operator [SEL_114] (rows=143930905 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_112] (rows=143930905 width=119) - predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null and ws_item_sk is not null) + predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null and ws_item_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) TableScan [TS_0] (rows=144002668 width=119) default@web_sales,web_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["ws_sold_date_sk","ws_item_sk","ws_ext_discount_amt"] <-Reducer 10 [BROADCAST_EDGE] vectorized @@ -164,7 +164,7 @@ Stage-0 Select Operator [SEL_115] (rows=143966864 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=143966864 width=119) - predicate:(ws_sold_date_sk is not null and ws_item_sk is not null) + predicate:(ws_sold_date_sk is not null and ws_item_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Please refer to the previous TableScan [TS_0] <-Map 8 [SIMPLE_EDGE] vectorized SHUFFLE [RS_121]