diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 25a6f12253..46fde13a04 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -31,6 +31,7 @@ import java.util.Set; import java.util.Stack; +import org.apache.commons.collections.CollectionUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.Context; @@ -2176,21 +2177,29 @@ private float getSelectivityComplexTree(Operator op) { // corresponding branch since only that branch will factor is the reduction if (multiParentOp instanceof JoinOperator) { JoinOperator jop = ((JoinOperator) multiParentOp); - isSelComputed = true; // check for two way join if (jop.getConf().getConds().length == 1) { - switch (jop.getConf().getCondsList().get(0).getType()) { - case JoinDesc.LEFT_OUTER_JOIN: - selMultiParent *= getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0)); - break; - case JoinDesc.RIGHT_OUTER_JOIN: - selMultiParent *= getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1)); - break; - default: - // for rest of the join type we will take min of the reduction. + isSelComputed = true; + int type = jop.getConf().getCondsList().get(0).getType(); + if (jop.getConf().getJoinKeys()[0].length == 0 || type == JoinDesc.FULL_OUTER_JOIN) { + // This is just a cartesian product or a full outer join, we will take the max float selMultiParentLeft = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0)); float selMultiParentRight = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1)); - selMultiParent = Math.min(selMultiParentLeft, selMultiParentRight); + selMultiParent = Math.max(selMultiParentLeft, selMultiParentRight); + } else { + switch (type) { + case JoinDesc.LEFT_OUTER_JOIN: + selMultiParent = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0)); + break; + case JoinDesc.RIGHT_OUTER_JOIN: + selMultiParent = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1)); + break; + default: + // for rest of the join type we will take min of the reduction. + float selMultiParentLeft = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0)); + float selMultiParentRight = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1)); + selMultiParent = Math.min(selMultiParentLeft, selMultiParentRight); + } } } } diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query6.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query6.q.out index 959ddd0850..5e3deb3eb9 100644 --- a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query6.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query6.q.out @@ -1,4 +1,4 @@ -Warning: Map Join MAPJOIN[170][bigTable=?] in task 'Map 8' is a cross product +Warning: Map Join MAPJOIN[170][bigTable=?] in task 'Map 11' is a cross product PREHOOK: query: explain cbo select a.ca_state state, count(*) cnt from customer_address a diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query6.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query6.q.out index 14528e2354..7aea119ac3 100644 --- a/ql/src/test/results/clientpositive/perf/tez/constraints/query6.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query6.q.out @@ -1,4 +1,4 @@ -Warning: Map Join MAPJOIN[170][bigTable=?] in task 'Map 8' is a cross product +Warning: Map Join MAPJOIN[170][bigTable=?] in task 'Map 11' is a cross product PREHOOK: query: explain select a.ca_state state, count(*) cnt from customer_address a @@ -65,157 +65,173 @@ Plan optimized by CBO. Vertex dependency in root stage Map 1 <- Reducer 3 (BROADCAST_EDGE) -Map 11 <- Reducer 15 (BROADCAST_EDGE), Reducer 7 (BROADCAST_EDGE) -Map 6 <- Map 1 (BROADCAST_EDGE) -Map 8 <- Reducer 5 (BROADCAST_EDGE) -Reducer 12 <- Map 11 (SIMPLE_EDGE) -Reducer 13 <- Reducer 12 (SIMPLE_EDGE) -Reducer 15 <- Map 14 (SIMPLE_EDGE) +Map 11 <- Reducer 5 (BROADCAST_EDGE) +Map 14 <- Reducer 17 (BROADCAST_EDGE) +Map 6 <- Map 1 (BROADCAST_EDGE), Reducer 15 (BROADCAST_EDGE) +Reducer 10 <- Reducer 9 (SIMPLE_EDGE) +Reducer 12 <- Map 11 (SIMPLE_EDGE), Map 13 (SIMPLE_EDGE) +Reducer 15 <- Map 14 (CUSTOM_SIMPLE_EDGE) +Reducer 17 <- Map 16 (SIMPLE_EDGE) Reducer 3 <- Map 2 (SIMPLE_EDGE) Reducer 4 <- Map 2 (SIMPLE_EDGE) Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) -Reducer 7 <- Map 6 (SIMPLE_EDGE), Reducer 9 (SIMPLE_EDGE) -Reducer 9 <- Map 10 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) +Reducer 7 <- Map 6 (SIMPLE_EDGE), Reducer 12 (SIMPLE_EDGE) +Reducer 8 <- Map 14 (SIMPLE_EDGE), Reducer 7 (SIMPLE_EDGE) +Reducer 9 <- Reducer 8 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:100 Stage-1 - Reducer 13 vectorized - File Output Operator [FS_231] - Limit [LIM_230] (rows=1 width=94) + Reducer 10 vectorized + File Output Operator [FS_233] + Limit [LIM_232] (rows=1 width=94) Number of rows:100 - Select Operator [SEL_229] (rows=1 width=94) + Select Operator [SEL_231] (rows=1 width=94) Output:["_col0","_col1"] - <-Reducer 12 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_228] - Filter Operator [FIL_227] (rows=1 width=94) + <-Reducer 9 [SIMPLE_EDGE] vectorized + SHUFFLE [RS_230] + Filter Operator [FIL_229] (rows=1 width=94) predicate:(_col1 >= 10L) - Group By Operator [GBY_226] (rows=1 width=94) + Group By Operator [GBY_228] (rows=1 width=94) Output:["_col0","_col1"],aggregations:["count(VALUE._col0)"],keys:KEY._col0 - <-Map 11 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_225] + <-Reducer 8 [SIMPLE_EDGE] + SHUFFLE [RS_68] PartitionCols:_col0 - Group By Operator [GBY_224] (rows=1 width=94) + Group By Operator [GBY_67] (rows=1 width=94) Output:["_col0","_col1"],aggregations:["count()"],keys:_col10 - Map Join Operator [MAPJOIN_223] (rows=1 width=86) - Conds:RS_63._col4=SEL_222._col0(Inner),HybridGraceHashJoin:true,Output:["_col10"] - <-Reducer 7 [BROADCAST_EDGE] - BROADCAST [RS_63] + Merge Join Operator [MERGEJOIN_174] (rows=500 width=86) + Conds:RS_63._col4=RS_204._col0(Inner),Output:["_col10"] + <-Map 14 [SIMPLE_EDGE] vectorized + SHUFFLE [RS_204] + PartitionCols:_col0 + Select Operator [SEL_203] (rows=154000 width=227) + Output:["_col0"] + Filter Operator [FIL_202] (rows=154000 width=227) + predicate:(_col1 > _col4) + Map Join Operator [MAPJOIN_201] (rows=462000 width=227) + Conds:SEL_200._col2=RS_198._col0(Inner),HybridGraceHashJoin:true,Output:["_col0","_col1","_col4"] + <-Reducer 17 [BROADCAST_EDGE] vectorized + BROADCAST [RS_198] + PartitionCols:_col0 + Select Operator [SEL_197] (rows=10 width=202) + Output:["_col0","_col1"] + Group By Operator [GBY_196] (rows=10 width=210) + Output:["_col0","_col1","_col2"],aggregations:["sum(VALUE._col0)","count(VALUE._col1)"],keys:KEY._col0 + <-Map 16 [SIMPLE_EDGE] vectorized + SHUFFLE [RS_195] + PartitionCols:_col0 + Group By Operator [GBY_194] (rows=10 width=210) + Output:["_col0","_col1","_col2"],aggregations:["sum(i_current_price)","count(i_current_price)"],keys:i_category + Filter Operator [FIL_193] (rows=462000 width=201) + predicate:i_category is not null + TableScan [TS_42] (rows=462000 width=201) + default@item,j,Tbl:COMPLETE,Col:COMPLETE,Output:["i_current_price","i_category"] + <-Select Operator [SEL_200] (rows=462000 width=205) + Output:["_col0","_col1","_col2"] + Filter Operator [FIL_199] (rows=462000 width=205) + predicate:i_category is not null + TableScan [TS_39] (rows=462000 width=205) + default@item,i,Tbl:COMPLETE,Col:COMPLETE,Output:["i_item_sk","i_current_price","i_category"] + <-Reducer 7 [SIMPLE_EDGE] + SHUFFLE [RS_63] PartitionCols:_col4 - Merge Join Operator [MERGEJOIN_173] (rows=4923 width=90) - Conds:RS_197._col5=RS_61._col0(Inner),Output:["_col4","_col10"] + Merge Join Operator [MERGEJOIN_173] (rows=7192227 width=90) + Conds:RS_213._col5=RS_61._col0(Inner),Output:["_col4","_col10"] <-Map 6 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_197] + SHUFFLE [RS_213] PartitionCols:_col5 - Map Join Operator [MAPJOIN_196] (rows=7192227 width=4) - Conds:RS_193._col0=SEL_195._col0(Inner),HybridGraceHashJoin:true,Output:["_col4","_col5"] + Map Join Operator [MAPJOIN_212] (rows=7192227 width=4) + Conds:RS_192._col0=SEL_211._col0(Inner),HybridGraceHashJoin:true,Output:["_col4","_col5"] <-Map 1 [BROADCAST_EDGE] vectorized - BROADCAST [RS_193] + BROADCAST [RS_192] PartitionCols:_col0 - Map Join Operator [MAPJOIN_192] (rows=660 width=4) - Conds:SEL_191._col1=RS_189._col0(Inner),HybridGraceHashJoin:true,Output:["_col0"] + Map Join Operator [MAPJOIN_191] (rows=660 width=4) + Conds:SEL_190._col1=RS_188._col0(Inner),HybridGraceHashJoin:true,Output:["_col0"] <-Reducer 3 [BROADCAST_EDGE] vectorized - BROADCAST [RS_189] + BROADCAST [RS_188] PartitionCols:_col0 - Group By Operator [GBY_188] (rows=25 width=4) + Group By Operator [GBY_187] (rows=25 width=4) Output:["_col0"],keys:KEY._col0 <-Map 2 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_186] + SHUFFLE [RS_185] PartitionCols:_col0 - Group By Operator [GBY_184] (rows=25 width=4) + Group By Operator [GBY_183] (rows=25 width=4) Output:["_col0"],keys:d_month_seq - Select Operator [SEL_182] (rows=50 width=12) + Select Operator [SEL_181] (rows=50 width=12) Output:["d_month_seq"] - Filter Operator [FIL_180] (rows=50 width=12) + Filter Operator [FIL_179] (rows=50 width=12) predicate:((d_moy = 2) and (d_year = 2000) and d_month_seq is not null) TableScan [TS_3] (rows=73049 width=12) default@date_dim,date_dim,Tbl:COMPLETE,Col:COMPLETE,Output:["d_month_seq","d_year","d_moy"] - <-Select Operator [SEL_191] (rows=73049 width=8) + <-Select Operator [SEL_190] (rows=73049 width=8) Output:["_col0","_col1"] - Filter Operator [FIL_190] (rows=73049 width=8) + Filter Operator [FIL_189] (rows=73049 width=8) predicate:d_month_seq is not null TableScan [TS_0] (rows=73049 width=8) default@date_dim,d,Tbl:COMPLETE,Col:COMPLETE,Output:["d_date_sk","d_month_seq"] - <-Select Operator [SEL_195] (rows=525327388 width=11) + <-Select Operator [SEL_211] (rows=525327388 width=11) Output:["_col0","_col1","_col2"] - Filter Operator [FIL_194] (rows=525327388 width=11) - predicate:(ss_customer_sk is not null and ss_sold_date_sk is not null) + Filter Operator [FIL_210] (rows=525327388 width=11) + predicate:((ss_item_sk BETWEEN DynamicValue(RS_64_i_i_item_sk_min) AND DynamicValue(RS_64_i_i_item_sk_max) and in_bloom_filter(ss_item_sk, DynamicValue(RS_64_i_i_item_sk_bloom_filter))) and ss_customer_sk is not null and ss_sold_date_sk is not null) TableScan [TS_10] (rows=575995635 width=11) default@store_sales,s,Tbl:COMPLETE,Col:COMPLETE,Output:["ss_sold_date_sk","ss_item_sk","ss_customer_sk"] - <-Reducer 9 [SIMPLE_EDGE] + <-Reducer 15 [BROADCAST_EDGE] vectorized + BROADCAST [RS_209] + Group By Operator [GBY_208] (rows=1 width=12) + Output:["_col0","_col1","_col2"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","bloom_filter(VALUE._col2, expectedEntries=1000000)"] + <-Map 14 [CUSTOM_SIMPLE_EDGE] vectorized + SHUFFLE [RS_207] + Group By Operator [GBY_206] (rows=1 width=12) + Output:["_col0","_col1","_col2"],aggregations:["min(_col0)","max(_col0)","bloom_filter(_col0, expectedEntries=1000000)"] + Select Operator [SEL_205] (rows=154000 width=4) + Output:["_col0"] + Please refer to the previous Select Operator [SEL_203] + <-Reducer 12 [SIMPLE_EDGE] SHUFFLE [RS_61] PartitionCols:_col0 Merge Join Operator [MERGEJOIN_171] (rows=80000000 width=90) - Conds:RS_209._col1=RS_211._col0(Inner),Output:["_col0","_col4"] - <-Map 10 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_211] - PartitionCols:_col0 - Select Operator [SEL_210] (rows=40000000 width=90) - Output:["_col0","_col1"] - TableScan [TS_30] (rows=40000000 width=90) - default@customer_address,a,Tbl:COMPLETE,Col:COMPLETE,Output:["ca_address_sk","ca_state"] - <-Map 8 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_209] + Conds:RS_225._col1=RS_227._col0(Inner),Output:["_col0","_col4"] + <-Map 11 [SIMPLE_EDGE] vectorized + SHUFFLE [RS_225] PartitionCols:_col1 - Map Join Operator [MAPJOIN_208] (rows=80000000 width=8) + Map Join Operator [MAPJOIN_224] (rows=80000000 width=8) Conds:(Inner),Output:["_col0","_col1"] <-Reducer 5 [BROADCAST_EDGE] vectorized - BROADCAST [RS_205] - Select Operator [SEL_204] (rows=1 width=8) - Filter Operator [FIL_203] (rows=1 width=8) + BROADCAST [RS_221] + Select Operator [SEL_220] (rows=1 width=8) + Filter Operator [FIL_219] (rows=1 width=8) predicate:(sq_count_check(_col0) <= 1) - Group By Operator [GBY_202] (rows=1 width=8) + Group By Operator [GBY_218] (rows=1 width=8) Output:["_col0"],aggregations:["count(VALUE._col0)"] <-Reducer 4 [CUSTOM_SIMPLE_EDGE] vectorized - PARTITION_ONLY_SHUFFLE [RS_201] - Group By Operator [GBY_200] (rows=1 width=8) + PARTITION_ONLY_SHUFFLE [RS_217] + Group By Operator [GBY_216] (rows=1 width=8) Output:["_col0"],aggregations:["count()"] - Select Operator [SEL_199] (rows=25 width=4) - Group By Operator [GBY_198] (rows=25 width=4) + Select Operator [SEL_215] (rows=25 width=4) + Group By Operator [GBY_214] (rows=25 width=4) Output:["_col0"],keys:KEY._col0 <-Map 2 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_187] + SHUFFLE [RS_186] PartitionCols:_col0 - Group By Operator [GBY_185] (rows=25 width=4) + Group By Operator [GBY_184] (rows=25 width=4) Output:["_col0"],keys:d_month_seq - Select Operator [SEL_183] (rows=50 width=12) + Select Operator [SEL_182] (rows=50 width=12) Output:["d_month_seq"] - Filter Operator [FIL_181] (rows=50 width=12) + Filter Operator [FIL_180] (rows=50 width=12) predicate:((d_moy = 2) and (d_year = 2000)) Please refer to the previous TableScan [TS_3] - <-Select Operator [SEL_207] (rows=80000000 width=8) + <-Select Operator [SEL_223] (rows=80000000 width=8) Output:["_col0","_col1"] - Filter Operator [FIL_206] (rows=80000000 width=8) + Filter Operator [FIL_222] (rows=80000000 width=8) predicate:c_current_addr_sk is not null TableScan [TS_13] (rows=80000000 width=8) default@customer,c,Tbl:COMPLETE,Col:COMPLETE,Output:["c_customer_sk","c_current_addr_sk"] - <-Select Operator [SEL_222] (rows=154000 width=227) - Output:["_col0"] - Filter Operator [FIL_221] (rows=154000 width=227) - predicate:(_col1 > _col4) - Map Join Operator [MAPJOIN_220] (rows=462000 width=227) - Conds:SEL_219._col2=RS_217._col0(Inner),HybridGraceHashJoin:true,Output:["_col0","_col1","_col4"] - <-Reducer 15 [BROADCAST_EDGE] vectorized - BROADCAST [RS_217] - PartitionCols:_col0 - Select Operator [SEL_216] (rows=10 width=202) - Output:["_col0","_col1"] - Group By Operator [GBY_215] (rows=10 width=210) - Output:["_col0","_col1","_col2"],aggregations:["sum(VALUE._col0)","count(VALUE._col1)"],keys:KEY._col0 - <-Map 14 [SIMPLE_EDGE] vectorized - SHUFFLE [RS_214] - PartitionCols:_col0 - Group By Operator [GBY_213] (rows=10 width=210) - Output:["_col0","_col1","_col2"],aggregations:["sum(i_current_price)","count(i_current_price)"],keys:i_category - Filter Operator [FIL_212] (rows=462000 width=201) - predicate:i_category is not null - TableScan [TS_42] (rows=462000 width=201) - default@item,j,Tbl:COMPLETE,Col:COMPLETE,Output:["i_current_price","i_category"] - <-Select Operator [SEL_219] (rows=462000 width=205) - Output:["_col0","_col1","_col2"] - Filter Operator [FIL_218] (rows=462000 width=205) - predicate:i_category is not null - TableScan [TS_39] (rows=462000 width=205) - default@item,i,Tbl:COMPLETE,Col:COMPLETE,Output:["i_item_sk","i_current_price","i_category"] + <-Map 13 [SIMPLE_EDGE] vectorized + SHUFFLE [RS_227] + PartitionCols:_col0 + Select Operator [SEL_226] (rows=40000000 width=90) + Output:["_col0","_col1"] + TableScan [TS_30] (rows=40000000 width=90) + default@customer_address,a,Tbl:COMPLETE,Col:COMPLETE,Output:["ca_address_sk","ca_state"] diff --git a/ql/src/test/results/clientpositive/perf/tez/query23.q.out b/ql/src/test/results/clientpositive/perf/tez/query23.q.out index 059195a890..5c7ec7407c 100644 --- a/ql/src/test/results/clientpositive/perf/tez/query23.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/query23.q.out @@ -176,9 +176,9 @@ Stage-0 Reduce Output Operator [RS_598] Group By Operator [GBY_597] (rows=1 width=112) Output:["_col0"],aggregations:["sum(_col0)"] - Select Operator [SEL_595] (rows=1 width=112) + Select Operator [SEL_595] (rows=7 width=112) Output:["_col0"] - Merge Join Operator [MERGEJOIN_594] (rows=1 width=116) + Merge Join Operator [MERGEJOIN_594] (rows=7 width=16) Conds:RS_240._col2=RS_241._col0(Inner),Output:["_col3","_col4"] <-Reducer 11 [SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_240] @@ -424,9 +424,9 @@ Stage-0 Reduce Output Operator [RS_593] Group By Operator [GBY_592] (rows=1 width=112) Output:["_col0"],aggregations:["sum(_col0)"] - Select Operator [SEL_590] (rows=1 width=112) + Select Operator [SEL_590] (rows=13 width=112) Output:["_col0"] - Merge Join Operator [MERGEJOIN_589] (rows=1 width=116) + Merge Join Operator [MERGEJOIN_589] (rows=13 width=8) Conds:RS_118._col1=RS_119._col0(Inner),Output:["_col3","_col4"] <-Reducer 3 [SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_118] diff --git a/ql/src/test/results/clientpositive/perf/tez/query6.q.out b/ql/src/test/results/clientpositive/perf/tez/query6.q.out index 0ca703911f..73c9a3c792 100644 --- a/ql/src/test/results/clientpositive/perf/tez/query6.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/query6.q.out @@ -99,7 +99,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_68] (rows=1 width=94) Output:["_col0","_col1"],aggregations:["count()"],keys:_col9 - Merge Join Operator [MERGEJOIN_174] (rows=316 width=86) + Merge Join Operator [MERGEJOIN_174] (rows=500 width=86) Conds:RS_64._col4=RS_213._col0(Inner),Output:["_col9"] <-Map 16 [SIMPLE_EDGE] vectorized SHUFFLE [RS_213]