From 6b9c6a01ab6fa0e6fbda9ecd0dbc2ef1f8774b11 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Tue, 12 Dec 2017 16:13:30 -0800 Subject: [PATCH] HIVE-18201 : Disable XPROD_EDGE for sq_count_check() created for scalar subqueries --- .../java/org/apache/hadoop/hive/conf/HiveConf.java | 2 + .../hive/ql/optimizer/ConvertJoinMapJoin.java | 27 ++++++---- ql/src/test/queries/clientpositive/perf/query6.q | 2 + .../results/clientpositive/perf/tez/query6.q.out | 63 +++++++++++----------- 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index dc31505a44..b13f8411a6 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1487,6 +1487,8 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "However, if it is on, and the predicted number of entries in hashtable for a given join \n" + "input is larger than this number, the join will not be converted to a mapjoin. \n" + "The value \"-1\" means no limit."), + XPRODSMALLTABLEROWSTHRESHOLD("hive.xprod.mapjoin.small.table.rows", 1,"Maximum number of rows on build side" + + " of map join before it switches over to cross product edge"), HIVECONVERTJOINMAXSHUFFLESIZE("hive.auto.convert.join.shuffle.max.size", 10000000L, "If hive.auto.convert.join.noconditionaltask is off, this parameter does not take affect. 
\n" + "However, if it is on, and the predicted size of the larger input for a given join is greater \n" + diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java index 0c6e1e0288..963c1cccf1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java @@ -102,14 +102,6 @@ MemoryMonitorInfo memoryMonitorInfo = getMemoryMonitorInfo(maxSize, context.conf); joinOp.getConf().setMemoryMonitorInfo(memoryMonitorInfo); - // not use map join in case of cross product - boolean cartesianProductEdgeEnabled = - HiveConf.getBoolVar(context.conf, HiveConf.ConfVars.TEZ_CARTESIAN_PRODUCT_EDGE_ENABLED); - if (cartesianProductEdgeEnabled && !hasOuterJoin(joinOp) && isCrossProduct(joinOp)) { - fallbackToMergeJoin(joinOp, context); - return null; - } - TezBucketJoinProcCtx tezBucketJoinProcCtx = new TezBucketJoinProcCtx(context.conf); boolean hiveConvertJoin = context.conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN) & !context.parseContext.getDisableMapJoin(); @@ -805,6 +797,23 @@ public int getMapJoinConversionPos(JoinOperator joinOp, OptimizeTezProcContext c return -1; } + // only allow cross product in map joins if build side is 'small' + boolean cartesianProductEdgeEnabled = + HiveConf.getBoolVar(context.conf, HiveConf.ConfVars.TEZ_CARTESIAN_PRODUCT_EDGE_ENABLED); + if (cartesianProductEdgeEnabled && !hasOuterJoin(joinOp) && isCrossProduct(joinOp)) { + for (int i = 0 ; i < joinOp.getParentOperators().size(); i ++) { + if (i != bigTablePosition) { + Statistics parentStats = joinOp.getParentOperators().get(i).getStatistics(); + if (parentStats.getNumRows() > + HiveConf.getIntVar(context.conf, HiveConf.ConfVars.XPRODSMALLTABLEROWSTHRESHOLD)) { + // if any small-table (build) side is estimated to produce more rows than + // the configured threshold, do not convert this join to a map join + return -1; + } + } + } + } + // 
We store the total memory that this MapJoin is going to use, // which is calculated as totalSize/buckets, with totalSize // equal to sum of small tables size. @@ -1223,7 +1232,7 @@ private static long estimateNDV(long numRows, List columnStats) { n = StatsUtils.safeMult(n, ndv); } } - final double nn = (double) n; + final double nn = n; final double a = (nn - 1d) / nn; if (a == 1d) { // A under-flows if nn is large. diff --git a/ql/src/test/queries/clientpositive/perf/query6.q b/ql/src/test/queries/clientpositive/perf/query6.q index d45045d135..aabce5202e 100644 --- a/ql/src/test/queries/clientpositive/perf/query6.q +++ b/ql/src/test/queries/clientpositive/perf/query6.q @@ -1,3 +1,5 @@ +set hive.auto.convert.join=true; +set hive.tez.cartesian-product.enabled=true; set hive.mapred.mode=nonstrict; -- start query 1 in stream 0 using template query6.tpl and seed 1819994127 explain diff --git a/ql/src/test/results/clientpositive/perf/tez/query6.q.out b/ql/src/test/results/clientpositive/perf/tez/query6.q.out index f1e47581b4..72d0d9976b 100644 --- a/ql/src/test/results/clientpositive/perf/tez/query6.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/query6.q.out @@ -1,4 +1,4 @@ -Warning: Shuffle Join MERGEJOIN[111][tables = [$hdt$_5, $hdt$_6]] in Stage 'Reducer 12' is a cross product +Warning: Map Join MAPJOIN[111][bigTable=?] in task 'Reducer 17' is a cross product PREHOOK: query: explain select a.ca_state state, count(*) cnt from customer_address a @@ -54,14 +54,13 @@ Plan optimized by CBO. 
Vertex dependency in root stage Reducer 10 <- Map 8 (SIMPLE_EDGE) Reducer 11 <- Reducer 10 (CUSTOM_SIMPLE_EDGE) -Reducer 12 <- Reducer 11 (CUSTOM_SIMPLE_EDGE), Reducer 19 (CUSTOM_SIMPLE_EDGE) -Reducer 13 <- Map 20 (SIMPLE_EDGE), Reducer 12 (SIMPLE_EDGE) -Reducer 16 <- Map 15 (SIMPLE_EDGE), Map 17 (SIMPLE_EDGE) -Reducer 19 <- Map 18 (SIMPLE_EDGE) +Reducer 14 <- Map 13 (SIMPLE_EDGE), Map 15 (SIMPLE_EDGE) +Reducer 17 <- Map 16 (SIMPLE_EDGE), Reducer 11 (BROADCAST_EDGE) +Reducer 18 <- Map 19 (SIMPLE_EDGE), Reducer 17 (SIMPLE_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 9 (ONE_TO_ONE_EDGE) -Reducer 3 <- Map 14 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) -Reducer 4 <- Reducer 16 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) -Reducer 5 <- Reducer 13 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) +Reducer 3 <- Map 12 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Reducer 14 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) +Reducer 5 <- Reducer 18 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) Reducer 6 <- Reducer 5 (SIMPLE_EDGE) Reducer 7 <- Reducer 6 (SIMPLE_EDGE) Reducer 9 <- Map 8 (SIMPLE_EDGE) @@ -89,7 +88,7 @@ Stage-0 Output:["_col0","_col1"],aggregations:["count()"],keys:_col9 Merge Join Operator [MERGEJOIN_114] (rows=766650239 width=88) Conds:RS_64._col4=RS_65._col0(Inner),Output:["_col9"] - <-Reducer 13 [SIMPLE_EDGE] + <-Reducer 18 [SIMPLE_EDGE] SHUFFLE [RS_65] PartitionCols:_col0 Select Operator [SEL_54] (rows=169400 width=1436) @@ -98,7 +97,7 @@ Stage-0 predicate:(_col5 > (1.2 * CASE WHEN (_col1 is null) THEN (null) ELSE (_col0) END)) Merge Join Operator [MERGEJOIN_112] (rows=508200 width=1436) Conds:RS_50._col2=RS_51._col2(Inner),Output:["_col0","_col1","_col4","_col5"] - <-Map 20 [SIMPLE_EDGE] + <-Map 19 [SIMPLE_EDGE] SHUFFLE [RS_51] PartitionCols:_col2 Select Operator [SEL_46] (rows=462000 width=1436) @@ -107,13 +106,13 @@ Stage-0 predicate:i_item_sk is not null TableScan [TS_44] (rows=462000 width=1436) 
default@item,i,Tbl:COMPLETE,Col:NONE,Output:["i_item_sk","i_current_price","i_category"] - <-Reducer 12 [SIMPLE_EDGE] + <-Reducer 17 [SIMPLE_EDGE] SHUFFLE [RS_50] PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_111] (rows=231000 width=1445) + Map Join Operator [MAPJOIN_111] (rows=231000 width=1445) Conds:(Inner),Output:["_col0","_col1","_col2"] - <-Reducer 11 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_48] + <-Reducer 11 [BROADCAST_EDGE] + BROADCAST [RS_48] Select Operator [SEL_43] (rows=1 width=8) Filter Operator [FIL_42] (rows=1 width=8) predicate:(sq_count_check(_col0) <= 1) @@ -137,32 +136,30 @@ Stage-0 predicate:((d_moy = 2) and (d_year = 2000)) TableScan [TS_3] (rows=73049 width=1119) default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_month_seq","d_year","d_moy"] - <-Reducer 19 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_47] - Select Operator [SEL_29] (rows=231000 width=1436) - Output:["_col0","_col1","_col2"] - Group By Operator [GBY_28] (rows=231000 width=1436) - Output:["_col0","_col1"],aggregations:["avg(VALUE._col0)"],keys:KEY._col0 - <-Map 18 [SIMPLE_EDGE] - SHUFFLE [RS_27] - PartitionCols:_col0 - Group By Operator [GBY_26] (rows=462000 width=1436) - Output:["_col0","_col1"],aggregations:["avg(i_current_price)"],keys:i_category - Filter Operator [FIL_105] (rows=462000 width=1436) - predicate:i_category is not null - TableScan [TS_23] (rows=462000 width=1436) - default@item,j,Tbl:COMPLETE,Col:NONE,Output:["i_current_price","i_category"] + <-Select Operator [SEL_29] (rows=231000 width=1436) + Output:["_col0","_col1","_col2"] + Group By Operator [GBY_28] (rows=231000 width=1436) + Output:["_col0","_col1"],aggregations:["avg(VALUE._col0)"],keys:KEY._col0 + <-Map 16 [SIMPLE_EDGE] + SHUFFLE [RS_27] + PartitionCols:_col0 + Group By Operator [GBY_26] (rows=462000 width=1436) + Output:["_col0","_col1"],aggregations:["avg(i_current_price)"],keys:i_category + Filter Operator [FIL_105] (rows=462000 width=1436) + predicate:i_category is not 
null + TableScan [TS_23] (rows=462000 width=1436) + default@item,j,Tbl:COMPLETE,Col:NONE,Output:["i_current_price","i_category"] <-Reducer 4 [SIMPLE_EDGE] SHUFFLE [RS_64] PartitionCols:_col4 Merge Join Operator [MERGEJOIN_113] (rows=696954748 width=88) Conds:RS_61._col5=RS_62._col0(Inner),Output:["_col4","_col9"] - <-Reducer 16 [SIMPLE_EDGE] + <-Reducer 14 [SIMPLE_EDGE] SHUFFLE [RS_62] PartitionCols:_col0 Merge Join Operator [MERGEJOIN_110] (rows=88000001 width=860) Conds:RS_19._col1=RS_20._col0(Inner),Output:["_col0","_col3"] - <-Map 15 [SIMPLE_EDGE] + <-Map 13 [SIMPLE_EDGE] SHUFFLE [RS_19] PartitionCols:_col1 Select Operator [SEL_15] (rows=80000000 width=860) @@ -171,7 +168,7 @@ Stage-0 predicate:(c_current_addr_sk is not null and c_customer_sk is not null) TableScan [TS_13] (rows=80000000 width=860) default@customer,c,Tbl:COMPLETE,Col:NONE,Output:["c_customer_sk","c_current_addr_sk"] - <-Map 17 [SIMPLE_EDGE] + <-Map 15 [SIMPLE_EDGE] SHUFFLE [RS_20] PartitionCols:_col0 Select Operator [SEL_18] (rows=40000000 width=1014) @@ -185,7 +182,7 @@ Stage-0 PartitionCols:_col5 Merge Join Operator [MERGEJOIN_109] (rows=633595212 width=88) Conds:RS_58._col0=RS_59._col0(Inner),Output:["_col4","_col5"] - <-Map 14 [SIMPLE_EDGE] + <-Map 12 [SIMPLE_EDGE] SHUFFLE [RS_59] PartitionCols:_col0 Select Operator [SEL_12] (rows=575995635 width=88) -- 2.14.3 (Apple Git-98)