diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 362a796..202f136 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -1389,6 +1389,7 @@ spark.query.files=add_part_multiple.q, \ spark.only.query.files=spark_combine_equivalent_work.q,\ spark_dynamic_partition_pruning.q,\ spark_dynamic_partition_pruning_2.q,\ + spark_dynamic_partition_pruning_3.q,\ dynamic_rdd_cache.q, \ spark_multi_insert_parallel_orderby.q,\ spark_explainuser_1.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkRemoveDynamicPruningBySize.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkRemoveDynamicPruningBySize.java index 26a1088..7dd14ce 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkRemoveDynamicPruningBySize.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkRemoveDynamicPruningBySize.java @@ -53,7 +53,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procContext, SparkPartitionPruningSinkDesc desc = op.getConf(); if (desc.getStatistics().getDataSize() > context.getConf() - .getLongVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE)) { + .getLongVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE) && + (context.pruningOpsRemovedByPriorOpt.isEmpty() || !context.pruningOpsRemovedByPriorOpt.contains(op))) { + context.pruningOpsRemovedByPriorOpt.add(op); OperatorUtils.removeBranch(op); // at this point we've found the fork in the op pipeline that has the pruning as a child plan. LOG.info("Disabling dynamic pruning for: " diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java index 81c2348..6348f73 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java @@ -19,15 +19,21 @@ package org.apache.hadoop.hive.ql.optimizer.spark; import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Stack; import com.google.common.collect.Sets; +import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator; import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator; +import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -456,6 +462,34 @@ public MapJoinOperator convertJoinMapJoin(JoinOperator joinOp, OptimizeSparkProc Operator parentBigTableOp = mapJoinOp.getParentOperators().get(bigTablePosition); if (parentBigTableOp instanceof ReduceSinkOperator) { + + for (Operator p : parentBigTableOp.getParentOperators()) { + // we might have generated a dynamic partition operator chain. Since + // we're removing the reduce sink we need do remove that too. 
+ Set> dynamicPartitionOperators = new HashSet>(); + Map, SparkPartitionPruningSinkOperator> opEventPairs = new HashMap<>(); + for (Operator c : p.getChildOperators()) { + SparkPartitionPruningSinkOperator event = findDynamicPartitionBroadcast(c); + if (event != null) { + dynamicPartitionOperators.add(c); + opEventPairs.put(c, event); + } + } + for (Operator c : dynamicPartitionOperators) { + if (context.pruningOpsRemovedByPriorOpt.isEmpty() || + !context.pruningOpsRemovedByPriorOpt.contains(opEventPairs.get(c))) { + p.removeChild(c); + // at this point we've found the fork in the op pipeline that has the pruning as a child plan. + LOG.info("Disabling dynamic pruning for: " + + ((SparkPartitionPruningSinkDesc) opEventPairs.get(c).getConf()).getTableScan().getName() + + ". Need to be removed together with reduce sink"); + } + } + for (Operator op : dynamicPartitionOperators) { + context.pruningOpsRemovedByPriorOpt.add(opEventPairs.get(op)); + } + } + mapJoinOp.getParentOperators().remove(bigTablePosition); if (!(mapJoinOp.getParentOperators().contains(parentBigTableOp.getParentOperators().get(0)))) { mapJoinOp.getParentOperators().add(bigTablePosition, @@ -476,6 +510,32 @@ public MapJoinOperator convertJoinMapJoin(JoinOperator joinOp, OptimizeSparkProc return mapJoinOp; } + private SparkPartitionPruningSinkOperator findDynamicPartitionBroadcast(Operator parent) { + + for (Operator op : parent.getChildOperators()) { + while (op != null) { + if (op instanceof SparkPartitionPruningSinkOperator && op.getConf() instanceof SparkPartitionPruningSinkDesc) { + // found dynamic partition pruning operator + return (SparkPartitionPruningSinkOperator)op; + } + if (op instanceof ReduceSinkOperator || op instanceof FileSinkOperator) { + // crossing reduce sink or file sink means the pruning isn't for this parent. + break; + } + + if (op.getChildOperators().size() != 1) { + // dynamic partition pruning pipeline doesn't have multiple children + break; + } + + op = op.getChildOperators().get(0); + } + } + + return null; + } + + private boolean containUnionWithoutRS(Operator op) { boolean result = false; if (op instanceof UnionOperator) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/OptimizeSparkProcContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/OptimizeSparkProcContext.java index 3b71af1..e05ed81 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/OptimizeSparkProcContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/OptimizeSparkProcContext.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.parse.spark; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.hooks.ReadEntity; @@ -43,6 +44,20 @@ private final HiveConf conf; private final Set inputs; private final Set outputs; + + /* Two of the optimization rules, ConvertJoinMapJoin and RemoveDynamicPruningBySize, are put into + stats dependent optimizations and run together in TezCompiler. There's no guarantee which one + runs first, but in either case, the prior one may have removed a chain which the latter one is + not aware of. So we need to remember the leaf node(s) of that chain so it can be skipped. + + For example, as ConvertJoinMapJoin is removing the reduce sink, it may also have removed a + dynamic partition pruning operator chain. 
However, RemoveDynamicPruningBySize doesn't know this + and still tries to traverse that removed chain which will cause NPE. + + This may also happen when RemoveDynamicPruningBySize happens first. + */ + public HashSet pruningOpsRemovedByPriorOpt; + private final Set visitedReduceSinks = new HashSet(); private final Map mjOpSizes = new HashMap(); @@ -52,6 +67,7 @@ public OptimizeSparkProcContext(HiveConf conf, ParseContext parseContext, this.parseContext = parseContext; this.inputs = inputs; this.outputs = outputs; + this.pruningOpsRemovedByPriorOpt = new HashSet<>(); } public ParseContext getParseContext() { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java index 682b987..ebae9ff 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java @@ -170,7 +170,9 @@ private void removeDPPOperator(Set> component, OptimizeSparkProcCont } } - if (toRemove == null) { + if (toRemove == null|| + (!context.pruningOpsRemovedByPriorOpt.isEmpty() && + context.pruningOpsRemovedByPriorOpt.contains(toRemove))) { return; } diff --git a/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_2.q b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_2.q index 734f187..2202709 100644 --- a/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_2.q +++ b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_2.q @@ -116,3 +116,19 @@ SELECT amount FROM agg_01, dim_shops WHERE dim_shops_id = id AND label = 'bar'; SELECT amount FROM agg_01, dim_shops WHERE dim_shops_id = id AND label = 'foo' UNION ALL SELECT amount FROM agg_01, dim_shops WHERE dim_shops_id = id AND label = 'bar'; + +set hive.spark.dynamic.partition.pruning.max.data.size=10000; +-- Dynamic partition pruning will be removed as data size exceeds the limit; +-- and for self join on partitioning column, it should not fail (HIVE-10559). 
+explain +select count(*) +from srcpart s1, + srcpart s2 +where s1.ds = s2.ds +; + +select count(*) +from srcpart s1, + srcpart s2 +where s1.ds = s2.ds +; \ No newline at end of file diff --git a/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q new file mode 100644 index 0000000..0364d87 --- /dev/null +++ b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q @@ -0,0 +1,15 @@ +set hive.spark.dynamic.partition.pruning=true; +set hive.auto.convert.join=true; + +create table partitioned_table1 (col int) partitioned by (part_col int); +create table partitioned_table2 (col int) partitioned by (part_col int); +create table regular_table (col int); +insert into table regular_table values (1); + +alter table partitioned_table1 add partition (part_col = 1); +insert into table partitioned_table1 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10); + +alter table partitioned_table2 add partition (part_col = 1); +insert into table partitioned_table2 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10); + +explain select * from partitioned_table1 where partitioned_table1.part_col in (select regular_table.col from regular_table join partitioned_table2 on regular_table.col = partitioned_table2.part_col); diff --git a/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out index d476172..f1092c5 100644 --- a/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out +++ b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out @@ -4978,8 +4978,7 @@ POSTHOOK: query: EXPLAIN select count(*) from srcpart_date left join srcpart on POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-3 + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: @@ -4987,38 +4986,6 @@ STAGE PLANS: Spark #### A masked pattern was here #### Vertices: - Map 4 - Map Operator Tree: - TableScan - alias: srcpart_date - filterExpr: (date = '2008-04-08') (type: boolean) - Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE - Filter Operator - predicate: (date = '2008-04-08') (type: boolean) - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: ds (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Group By Operator - keys: _col0 (type: string) - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Spark Partition Pruning Sink Operator - partition key expr: ds - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - target column name: ds - target work: Map 3 - - Stage: Stage-3 - Spark -#### A masked pattern was here #### - Vertices: Map 3 Map Operator Tree: TableScan @@ -5101,8 +5068,7 @@ POSTHOOK: query: EXPLAIN select count(*) from srcpart full outer join srcpart_da POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-2 is a root stage - Stage-3 depends on stages: Stage-2 - Stage-1 depends on stages: Stage-3 + 
Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: @@ -5110,38 +5076,6 @@ STAGE PLANS: Spark #### A masked pattern was here #### Vertices: - Map 4 - Map Operator Tree: - TableScan - alias: srcpart_date - filterExpr: (date = '2008-04-08') (type: boolean) - Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE - Filter Operator - predicate: (date = '2008-04-08') (type: boolean) - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: ds (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Group By Operator - keys: _col0 (type: string) - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Spark Partition Pruning Sink Operator - partition key expr: ds - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - target column name: ds - target work: Map 1 - - Stage: Stage-3 - Spark -#### A masked pattern was here #### - Vertices: Map 1 Map Operator Tree: TableScan diff --git a/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning_2.q.out b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning_2.q.out index 3e69f3f..d902694 100644 --- a/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning_2.q.out +++ b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning_2.q.out @@ -1027,3 +1027,122 @@ POSTHOOK: Input: default@dim_shops 4 5 6 +PREHOOK: query: explain +select count(*) +from srcpart s1, + srcpart s2 +where s1.ds = s2.ds +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(*) +from srcpart s1, + srcpart s2 +where s1.ds = s2.ds +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 3 + Map Operator Tree: + TableScan + alias: s2 + filterExpr: ds is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ds (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: s1 + filterExpr: ds is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ds (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + input vertices: + 1 Map 3 + Statistics: Num rows: 2200 Data size: 23372 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce 
Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Local Work: + Map Reduce Local Work + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) +from srcpart s1, + srcpart s2 +where s1.ds = s2.ds +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) +from srcpart s1, + srcpart s2 +where s1.ds = s2.ds +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +#### A masked pattern was here #### +2000000 diff --git a/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning_3.q.out b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning_3.q.out new file mode 100644 index 0000000..09698f1 --- /dev/null +++ b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning_3.q.out @@ -0,0 +1,195 @@ +PREHOOK: query: create table partitioned_table1 (col int) partitioned by (part_col int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@partitioned_table1 +POSTHOOK: query: create table partitioned_table1 (col int) partitioned by (part_col int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@partitioned_table1 +PREHOOK: query: create table partitioned_table2 (col int) partitioned by (part_col int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@partitioned_table2 +POSTHOOK: query: create table partitioned_table2 (col int) partitioned by (part_col int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@partitioned_table2 +PREHOOK: query: create table regular_table (col int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@regular_table +POSTHOOK: query: create table regular_table (col int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@regular_table +PREHOOK: query: insert into table regular_table values (1) +PREHOOK: type: QUERY +PREHOOK: Output: default@regular_table +POSTHOOK: query: insert into table regular_table values (1) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@regular_table +POSTHOOK: Lineage: regular_table.col EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: alter table partitioned_table1 add partition (part_col = 1) 
+PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@partitioned_table1 +POSTHOOK: query: alter table partitioned_table1 add partition (part_col = 1) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@partitioned_table1 +POSTHOOK: Output: default@partitioned_table1@part_col=1 +PREHOOK: query: insert into table partitioned_table1 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10) +PREHOOK: type: QUERY +PREHOOK: Output: default@partitioned_table1@part_col=1 +POSTHOOK: query: insert into table partitioned_table1 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@partitioned_table1@part_col=1 +POSTHOOK: Lineage: partitioned_table1 PARTITION(part_col=1).col EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: alter table partitioned_table2 add partition (part_col = 1) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@partitioned_table2 +POSTHOOK: query: alter table partitioned_table2 add partition (part_col = 1) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@partitioned_table2 +POSTHOOK: Output: default@partitioned_table2@part_col=1 +PREHOOK: query: insert into table partitioned_table2 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10) +PREHOOK: type: QUERY +PREHOOK: Output: default@partitioned_table2@part_col=1 +POSTHOOK: query: insert into table partitioned_table2 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@partitioned_table2@part_col=1 +POSTHOOK: Lineage: partitioned_table2 PARTITION(part_col=1).col EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain select * from partitioned_table1 where partitioned_table1.part_col in (select regular_table.col from regular_table join partitioned_table2 on regular_table.col = partitioned_table2.part_col) +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from partitioned_table1 where partitioned_table1.part_col in (select regular_table.col from regular_table join partitioned_table2 on regular_table.col = partitioned_table2.part_col) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-3 is a root stage + Stage-2 depends on stages: Stage-3 + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-3 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: regular_table + Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: col is not null (type: boolean) + Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE + Spark Partition Pruning Sink Operator + partition key expr: part_col + 
Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE + target column name: part_col + target work: Map 3 + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 3 + Map Operator Tree: + TableScan + alias: partitioned_table2 + Statistics: Num rows: 10 Data size: 11 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part_col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 10 Data size: 11 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0 + input vertices: + 0 Map 2 + Statistics: Num rows: 11 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 11 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 _col1 (type: int) + 1 _col0 (type: int) + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 11 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 11 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Spark Partition Pruning Sink Operator + partition key expr: part_col + Statistics: Num rows: 11 Data size: 12 Basic stats: COMPLETE Column stats: NONE + target column name: part_col + target work: Map 1 + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: partitioned_table1 + Statistics: Num rows: 10 Data size: 11 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: col (type: int), part_col (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 10 Data size: 11 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col1 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1 + input vertices: + 1 Map 3 + Statistics: Num rows: 12 Data size: 13 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12 Data size: 13 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +
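
Reviewer note: the coordination pattern this patch introduces can be summarized outside the Hive class hierarchy. The sketch below is illustrative only; it uses hypothetical stand-in types (Op, SharedContext, RemovePruningBranchRule) instead of Hive's Operator, OptimizeSparkProcContext, ConvertJoinMapJoin and SparkRemoveDynamicPruningBySize, and only shows the idea: whichever stats-dependent rule detaches a partition-pruning branch first records its pruning sink in a shared set, so the other rule (which may run in either order) skips the already-removed chain instead of traversing it and hitting an NPE.

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // Hypothetical stand-ins for Hive's operator tree and optimizer context.
    class Op {
        final String name;
        final List<Op> children = new ArrayList<>();
        Op(String name) { this.name = name; }
    }

    class SharedContext {
        // Mirrors pruningOpsRemovedByPriorOpt: pruning sinks whose branch a prior rule removed.
        final Set<Op> pruningOpsRemovedByPriorOpt = new HashSet<>();
    }

    class RemovePruningBranchRule {
        // Either rule calls this before detaching the pruning branch rooted at
        // 'branchRoot' (a child of 'parent') whose leaf is 'pruningSink'.
        static void removeBranchOnce(SharedContext ctx, Op parent, Op branchRoot, Op pruningSink) {
            if (ctx.pruningOpsRemovedByPriorOpt.contains(pruningSink)) {
                // The other rule already removed this chain; nothing left to detach.
                return;
            }
            parent.children.remove(branchRoot);               // detach the pruning chain
            ctx.pruningOpsRemovedByPriorOpt.add(pruningSink); // remember the leaf so later rules skip it
        }
    }

    public class PruningRemovalSketch {
        public static void main(String[] args) {
            SharedContext ctx = new SharedContext();
            Op parent = new Op("parent");
            Op branchRoot = new Op("groupBy");
            Op sink = new Op("sparkPartitionPruningSink");
            branchRoot.children.add(sink);
            parent.children.add(branchRoot);

            // First invocation removes the branch; the second is a no-op instead of a failure.
            RemovePruningBranchRule.removeBranchOnce(ctx, parent, branchRoot, sink);
            RemovePruningBranchRule.removeBranchOnce(ctx, parent, branchRoot, sink);
            System.out.println("children left: " + parent.children.size()); // prints 0
        }
    }

The leaf (the pruning sink) is what gets recorded, rather than the branch root, because that is the one node both rules can identify regardless of where in the operator chain they start: SparkRemoveDynamicPruningBySize visits the sink directly, while SparkMapJoinOptimizer walks down from the reduce sink's parent via findDynamicPartitionBroadcast until it reaches the same sink.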