diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
index 4e6cce99c4..38affd2c8e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
@@ -22,12 +22,17 @@
 import org.apache.calcite.rel.core.Aggregate;
 import org.apache.calcite.rel.core.JoinInfo;
 import org.apache.calcite.rel.core.JoinRelType;
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexNode;
 import org.apache.calcite.util.ImmutableBitSet;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.util.ArrayList;
+import java.util.List;
+
 /**
  * Planner rule that removes a {@code Aggregate} from a HiveSemiJoin
@@ -37,43 +42,58 @@
   protected static final Logger LOG = LoggerFactory.getLogger(HiveRemoveGBYSemiJoinRule.class);
 
   public static final HiveRemoveGBYSemiJoinRule INSTANCE =
-      new HiveRemoveGBYSemiJoinRule() ;
+      new HiveRemoveGBYSemiJoinRule();
 
   public HiveRemoveGBYSemiJoinRule() {
     super(
-        operand(HiveSemiJoin.class,
-            some(
-                operand(RelNode.class, any()),
-                operand(Aggregate.class, any()))),
+      operand(HiveSemiJoin.class,
+          some(
+              operand(RelNode.class, any()),
+              operand(Aggregate.class, any()))),
         HiveRelFactories.HIVE_BUILDER, "HiveRemoveGBYSemiJoinRule");
   }
 
-  @Override public void onMatch(RelOptRuleCall call) {
-    final HiveSemiJoin semijoin= call.rel(0);
+  @Override public void onMatch(RelOptRuleCall call) {
+    final HiveSemiJoin semijoin = call.rel(0);
 
-    if(semijoin.getJoinType() != JoinRelType.INNER) {
-      return;
-    }
-    final RelNode left = call.rel(1);
-    final Aggregate rightAggregate= call.rel(2);
+    if (semijoin.getJoinType() != JoinRelType.INNER) {
+      return;
+    }
+    final RelNode left = call.rel(1);
+    final Aggregate rightAggregate = call.rel(2);
 
-    // if grouping sets are involved do early return
-    if(rightAggregate.indicator) {
-      return;
-    }
+    // if grouping sets are involved, do an early return
+    if (rightAggregate.indicator) {
+      return;
+    }
 
-    // if there is any aggregate function this group by is not un-necessary
-    if(!rightAggregate.getAggCallList().isEmpty()) {
-      return;
-    }
-    final JoinInfo joinInfo = semijoin.analyzeCondition();
+    // if there is any aggregate function, this group by is not unnecessary
+    if (!rightAggregate.getAggCallList().isEmpty()) {
+      return;
+    }
+    final JoinInfo joinInfo = semijoin.analyzeCondition();
 
-    boolean shouldTransform = joinInfo.rightSet().equals(
-        ImmutableBitSet.range(rightAggregate.getGroupCount()));
-    if(shouldTransform) {
-      RelNode newSemiJoin = call.builder().push(left).push(rightAggregate.getInput()).semiJoin(semijoin.getCondition()).build();
-      call.transformTo(newSemiJoin);
+    boolean shouldTransform = joinInfo.rightSet().equals(
+        ImmutableBitSet.range(rightAggregate.getGroupCount()));
+    if (shouldTransform) {
+      List<Integer> gbyKeys = rightAggregate.getGroupSet().asList();
+      List<RexNode> projects = new ArrayList<>();
+
+      RexBuilder rexBuilder = semijoin.getCluster().getRexBuilder();
+      // since the group by could change the row schema, we need to create a
+      // project on top of the aggregate's input to preserve that schema
+      for (int i = 0; i < gbyKeys.size(); i++) {
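The hunk is cut off inside the new for loop: everything from the loop body through the end of the file is missing from this copy of the patch. What follows is a minimal sketch of how the "if (shouldTransform)" block plausibly completes, reusing the declarations the patch does show (gbyKeys, projects, rexBuilder) and the same RelBuilder calls the removed one-liner used; newRightInput is a placeholder name of mine, and the committed lines may differ:

      for (int i = 0; i < gbyKeys.size(); i++) {
        // one input ref per group-by key, in group-by order, so the projected
        // row has exactly the schema the Aggregate used to expose
        projects.add(rexBuilder.makeInputRef(rightAggregate.getInput(), gbyKeys.get(i)));
      }
      // splice the Project between the aggregate's input and the semijoin,
      // then rebuild the semijoin on the original condition
      RelNode newRightInput = call.builder()
          .push(rightAggregate.getInput())
          .project(projects)
          .build();
      RelNode newSemiJoin = call.builder()
          .push(left)
          .push(newRightInput)
          .semiJoin(semijoin.getCondition())
          .build();
      call.transformTo(newSemiJoin);
    }
  }

The overall shape matches the removed single-line rewrite, with the new Project inserted between the aggregate input and the semijoin.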
diff --git a/ql/src/test/queries/clientpositive/semijoin.q b/ql/src/test/queries/clientpositive/semijoin.q
--- a/ql/src/test/queries/clientpositive/semijoin.q
+++ b/ql/src/test/queries/clientpositive/semijoin.q
 explain select a.key from t3 a left semi join t2 b on a.value = b.value where a.key > 100;
 select a.key from t3 a left semi join t2 b on a.value = b.value where a.key > 100;
+
+explain select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value;
+select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value;
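The new test is built so that the inner GROUP BY keys come from different sides of an inner join: the Aggregate's input row is [a.key, a.value, b.value], but its output schema is the two-column [a.key, b.value]. The old rewrite dropped the Aggregate and wired the three-column input directly under the semijoin, silently shifting the right-side key ordinals. A self-contained sketch of that mapping (the class name and column numbering are mine, inferred from the "Select Operator ... _col0, _col2" in the golden plans below; only Calcite is assumed on the classpath):

    import java.util.List;

    import org.apache.calcite.util.ImmutableBitSet;

    public class GroupKeyRemapDemo {
      public static void main(String[] args) {
        // Input row of the inner Aggregate, i.e. the output of
        // "src a join (select distinct value from src) b":
        //   column 0 = a.key, column 1 = a.value, column 2 = b.value
        // GROUP BY a.key, b.value:
        ImmutableBitSet groupSet = ImmutableBitSet.of(0, 2);

        // The semijoin keys cover the whole two-column Aggregate output, so
        // joinInfo.rightSet().equals(ImmutableBitSet.range(2)) holds and the
        // rule fires:
        System.out.println(ImmutableBitSet.range(2)); // {0, 1}

        // But Aggregate output column i maps to input column gbyKeys.get(i):
        List<Integer> gbyKeys = groupSet.asList(); // [0, 2]
        for (int i = 0; i < gbyKeys.size(); i++) {
          System.out.println("right key " + i + " -> input column " + gbyKeys.get(i));
        }
        // Without the new Project, right key ordinal 1 would read a.value
        // (column 1) instead of b.value (column 2) once the Aggregate is gone.
      }
    }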
diff --git a/ql/src/test/results/clientpositive/llap/semijoin.q.out b/ql/src/test/results/clientpositive/llap/semijoin.q.out
index f788ae3f11..82cee338b1 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin.q.out
@@ -2745,3 +2745,152 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@t2
 POSTHOOK: Input: default@t3
 #### A masked pattern was here ####
+Warning: Shuffle Join MERGEJOIN[30][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 3' is a cross product
+PREHOOK: query: explain select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE)
+        Reducer 3 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE)
+        Reducer 5 <- Map 4 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: outr
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (key is not null and value is not null) (type: boolean)
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: string)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                        Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: key is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        sort order:
+                        Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                        value expressions: _col0 (type: string), _col1 (type: string)
+            Execution mode: llap
+            LLAP IO: no inputs
+        Map 4
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: value is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE
+                    Group By Operator
+                      keys: value (type: string)
+                      mode: hash
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Semi Join 0 to 1
+                keys:
+                  0 _col0 (type: string), _col1 (type: string)
+                  1 _col0 (type: string), _col1 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 3
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0
+                  1
+                outputColumnNames: _col0, _col1, _col2
+                residual filter predicates: {(_col1 > _col2)}
+                Statistics: Num rows: 41666 Data size: 11208154 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: _col0 (type: string), _col2 (type: string)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 41666 Data size: 7416548 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    keys: _col0 (type: string), _col1 (type: string)
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 10609 Data size: 1888402 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string), _col1 (type: string)
+                      sort order: ++
+                      Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                      Statistics: Num rows: 10609 Data size: 1888402 Basic stats: COMPLETE Column stats: COMPLETE
+        Reducer 5
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: string)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order:
+                  Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+Warning: Shuffle Join MERGEJOIN[30][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 3' is a cross product
+PREHOOK: query: select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
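The cross-product warnings in these golden files come from the test query itself, not from the rule change: the condition a.value > b.value contributes no equi-join conjunct, so the inner join is planned as a shuffle cross product with the comparison kept as a residual predicate (llap) or a post-join Filter Operator (spark). In Calcite terms, JoinInfo extracts empty key sets from such a condition. A standalone sketch (assuming only Calcite on the classpath; the class and variable names are mine):

    import org.apache.calcite.rel.RelNode;
    import org.apache.calcite.rel.core.JoinInfo;
    import org.apache.calcite.rex.RexNode;
    import org.apache.calcite.schema.SchemaPlus;
    import org.apache.calcite.sql.fun.SqlStdOperatorTable;
    import org.apache.calcite.tools.FrameworkConfig;
    import org.apache.calcite.tools.Frameworks;
    import org.apache.calcite.tools.RelBuilder;

    public class NonEquiJoinDemo {
      public static void main(String[] args) {
        SchemaPlus root = Frameworks.createRootSchema(true);
        FrameworkConfig config = Frameworks.newConfigBuilder().defaultSchema(root).build();
        RelBuilder b = RelBuilder.create(config);

        // stand-ins for "src a" and "(select distinct value from src) b"
        RelNode left = b.values(new String[] {"key", "value"}, "238", "val_238").build();
        RelNode right = b.values(new String[] {"value"}, "val_86").build();

        // a.value > b.value : no equality between the two sides
        RexNode cond = b.push(left).push(right)
            .call(SqlStdOperatorTable.GREATER_THAN,
                b.field(2, 0, "value"), b.field(2, 1, "value"));

        JoinInfo info = JoinInfo.of(left, right, cond);
        System.out.println(info.isEqui());  // false
        System.out.println(info.leftKeys);  // [] -> nothing to shuffle on,
        System.out.println(info.rightKeys); // []    hence the cross product
      }
    }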
diff --git a/ql/src/test/results/clientpositive/spark/semijoin.q.out b/ql/src/test/results/clientpositive/spark/semijoin.q.out
index 8f5354c4cf..6932efa550 100644
--- a/ql/src/test/results/clientpositive/spark/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/spark/semijoin.q.out
@@ -2590,3 +2590,151 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@t2
 POSTHOOK: Input: default@t3
 #### A masked pattern was here ####
+Warning: Shuffle Join JOIN[15][tables = [$hdt$_1, $hdt$_2]] in Work 'Reducer 4' is a cross product
+PREHOOK: query: explain select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2), Reducer 4 (PARTITION-LEVEL SORT, 2)
+        Reducer 4 <- Map 3 (PARTITION-LEVEL SORT, 1), Reducer 6 (PARTITION-LEVEL SORT, 1)
+        Reducer 6 <- Map 5 (GROUP, 2)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: outr
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: (key is not null and value is not null) (type: boolean)
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: string)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                        Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+        Map 3
+            Map Operator Tree:
+                TableScan
+                  alias: a
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: key is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        sort order:
+                        Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col0 (type: string), _col1 (type: string)
+        Map 5
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: value is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: value (type: string)
+                      mode: hash
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2
+            Reduce Operator Tree:
+              Join Operator
+                condition map:
+                     Left Semi Join 0 to 1
+                keys:
+                  0 _col0 (type: string), _col1 (type: string)
+                  1 _col0 (type: string), _col1 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 45832 Data size: 1019683 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 45832 Data size: 1019683 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4
+            Reduce Operator Tree:
+              Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0
+                  1
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 125000 Data size: 2781000 Basic stats: COMPLETE Column stats: NONE
+                Filter Operator
+                  predicate: (_col1 > _col2) (type: boolean)
+                  Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: _col0 (type: string), _col2 (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: _col0 (type: string), _col1 (type: string)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: string)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                        Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+        Reducer 6
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: string)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  sort order:
+                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: string)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+Warning: Shuffle Join JOIN[15][tables = [$hdt$_1, $hdt$_2]] in Work 'Reducer 4' is a cross product
+PREHOOK: query: select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select key, value from src outr left semi join
+  (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+  on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####