diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index bdb09a8..d9f70a7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -320,8 +320,13 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred, long newNumRows = 0; Statistics andStats = null; - if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0) + if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0) { + if (isDebugEnabled) { + LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() + + " Original data size: " + stats.getDataSize() + " New num rows: 1"); + } return 1; + } if (pred instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred; @@ -378,23 +383,31 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred, if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) { ColStatistics cs = stats.getColumnStatisticsFromColName(colName); if (cs != null) { - return cs.getNumTrues(); + newNumRows = cs.getNumTrues(); + } else { + // default + newNumRows = stats.getNumRows() / 2; } + } else { + // if not boolean column return half the number of rows + newNumRows = stats.getNumRows() / 2; } - - // if not boolean column return half the number of rows - return stats.getNumRows() / 2; } else if (pred instanceof ExprNodeConstantDesc) { // special case for handling false constants ExprNodeConstantDesc encd = (ExprNodeConstantDesc) pred; if (Boolean.FALSE.equals(encd.getValue())) { - return 0; + newNumRows = 0; } else { - return stats.getNumRows(); + newNumRows = stats.getNumRows(); } } + if (isDebugEnabled) { + LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() + + " New num rows: " + newNumRows); + } + return newNumRows; } @@ -476,15 +489,16 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsPr } // 3. Calculate IN selectivity - float factor = 1; + double factor = 1d; for (int i = 0; i < columnStats.size(); i++) { long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint(); - // ( num of distinct vals for col / num of rows ) * num of distinct vals for col in IN clause - float columnFactor = dvs == 0 ? 0.5f : ((float)dvs / numRows) * values.get(i).size(); - factor *= columnFactor; + // (num of distinct vals for col in IN clause / num of distinct vals for col ) + double columnFactor = dvs == 0 ? 0.5d : ((double) values.get(i).size() / dvs); + // max can be 1, even when ndv is larger in IN clause than in column stats + factor *= columnFactor > 1d ? 1d : columnFactor; } float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR); - return Math.round( (double)numRows * factor * inFactor); + return Math.round( (double) numRows * factor * inFactor); } private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx, @@ -1828,11 +1842,11 @@ private void updateColStats(HiveConf conf, Statistics stats, long newNumRows, Map rowCountParents) { if (newNumRows < 0) { - LOG.info("STATS-" + jop.toString() + ": Overflow in number of rows." + LOG.debug("STATS-" + jop.toString() + ": Overflow in number of rows. " + newNumRows + " rows will be set to Long.MAX_VALUE"); } if (newNumRows == 0) { - LOG.info("STATS-" + jop.toString() + ": Equals 0 in number of rows." + LOG.debug("STATS-" + jop.toString() + ": Equals 0 in number of rows. " + newNumRows + " rows will be set to 1"); newNumRows = 1; } @@ -2252,12 +2266,12 @@ static void updateStats(Statistics stats, long newNumRows, boolean updateNDV) { if (newNumRows < 0) { - LOG.info("STATS-" + op.toString() + ": Overflow in number of rows." + LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. " + newNumRows + " rows will be set to Long.MAX_VALUE"); newNumRows = StatsUtils.getMaxIfOverflow(newNumRows); } if (newNumRows == 0) { - LOG.info("STATS-" + op.toString() + ": Equals 0 in number of rows." + LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. " + newNumRows + " rows will be set to 1"); newNumRows = 1; } diff --git ql/src/test/results/clientpositive/llap/explainuser_2.q.out ql/src/test/results/clientpositive/llap/explainuser_2.q.out index 67f960a..e898111 100644 --- ql/src/test/results/clientpositive/llap/explainuser_2.q.out +++ ql/src/test/results/clientpositive/llap/explainuser_2.q.out @@ -298,29 +298,29 @@ Stage-0 Stage-1 Reducer 5 llap File Output Operator [FS_55] - Limit [LIM_54] (rows=24 width=285) + Limit [LIM_54] (rows=14 width=285) Number of rows:100 - Select Operator [SEL_53] (rows=24 width=285) + Select Operator [SEL_53] (rows=14 width=285) Output:["_col0","_col1","_col2","_col3","_col4","_col5"] <-Reducer 4 [SIMPLE_EDGE] llap SHUFFLE [RS_52] - Group By Operator [GBY_50] (rows=24 width=285) + Group By Operator [GBY_50] (rows=14 width=285) Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"],keys:KEY._col0, KEY._col1, KEY._col2 <-Reducer 3 [SIMPLE_EDGE] llap SHUFFLE [RS_49] PartitionCols:_col0, _col1, _col2 - Group By Operator [GBY_48] (rows=24 width=285) + Group By Operator [GBY_48] (rows=14 width=285) Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(_col13)","count(_col21)","count(_col3)"],keys:_col12, _col20, _col2 - Select Operator [SEL_47] (rows=650 width=534) + Select Operator [SEL_47] (rows=363 width=534) Output:["_col12","_col20","_col2","_col13","_col21","_col3"] - Merge Join Operator [MERGEJOIN_97] (rows=650 width=534) + Merge Join Operator [MERGEJOIN_97] (rows=363 width=534) Conds:RS_44._col1, _col3=RS_45._col15, _col17(Inner),Output:["_col2","_col3","_col12","_col13","_col20","_col21"] <-Reducer 11 [SIMPLE_EDGE] llap SHUFFLE [RS_45] PartitionCols:_col15, _col17 - Select Operator [SEL_40] (rows=190 width=447) + Select Operator [SEL_40] (rows=180 width=447) Output:["_col14","_col15","_col17","_col6","_col7"] - Merge Join Operator [MERGEJOIN_96] (rows=190 width=447) + Merge Join Operator [MERGEJOIN_96] (rows=180 width=447) Conds:RS_37._col6, _col4=RS_38._col4, _col2(Inner),Output:["_col2","_col3","_col14","_col15","_col17"] <-Reducer 10 [SIMPLE_EDGE] llap SHUFFLE [RS_37] @@ -376,7 +376,7 @@ Stage-0 <-Reducer 16 [SIMPLE_EDGE] llap SHUFFLE [RS_38] PartitionCols:_col4, _col2 - Merge Join Operator [MERGEJOIN_95] (rows=19 width=356) + Merge Join Operator [MERGEJOIN_95] (rows=18 width=356) Conds:RS_24._col0=RS_25._col0(Inner),Output:["_col2","_col3","_col4","_col5"] <-Map 15 [SIMPLE_EDGE] llap SHUFFLE [RS_24] @@ -390,16 +390,16 @@ Stage-0 <-Map 17 [SIMPLE_EDGE] llap SHUFFLE [RS_25] PartitionCols:_col0 - Select Operator [SEL_23] (rows=500 width=178) + Select Operator [SEL_23] (rows=7 width=178) Output:["_col0"] - Filter Operator [FIL_90] (rows=500 width=178) + Filter Operator [FIL_90] (rows=7 width=178) predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null) TableScan [TS_21] (rows=500 width=178) default@src,d2,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] <-Reducer 2 [SIMPLE_EDGE] llap SHUFFLE [RS_44] PartitionCols:_col1, _col3 - Merge Join Operator [MERGEJOIN_91] (rows=414 width=269) + Merge Join Operator [MERGEJOIN_91] (rows=99 width=269) Conds:RS_41._col0=RS_42._col0(Inner),Output:["_col1","_col2","_col3"] <-Map 1 [SIMPLE_EDGE] llap SHUFFLE [RS_41] @@ -413,9 +413,9 @@ Stage-0 <-Map 6 [SIMPLE_EDGE] llap SHUFFLE [RS_42] PartitionCols:_col0 - Select Operator [SEL_5] (rows=500 width=178) + Select Operator [SEL_5] (rows=7 width=178) Output:["_col0"] - Filter Operator [FIL_84] (rows=500 width=178) + Filter Operator [FIL_84] (rows=7 width=178) predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null) TableScan [TS_3] (rows=500 width=178) default@src,d3,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] diff --git ql/src/test/results/clientpositive/llap/vectorization_0.q.out ql/src/test/results/clientpositive/llap/vectorization_0.q.out index 67fcdaa..af0bad9 100644 --- ql/src/test/results/clientpositive/llap/vectorization_0.q.out +++ ql/src/test/results/clientpositive/llap/vectorization_0.q.out @@ -30777,19 +30777,19 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: (cstring1) IN ('biology', 'history', 'topology') (type: boolean) - Statistics: Num rows: 12288 Data size: 862450 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 470 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() keys: cstring1 (type: string) mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) null sort order: a sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col1 (type: bigint) auto parallelism: true @@ -30855,16 +30855,16 @@ STAGE PLANS: keys: KEY._col0 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col1 (type: bigint), _col0 (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col1 (type: string) null sort order: a sort order: + - Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col0 (type: bigint) auto parallelism: false @@ -30875,13 +30875,13 @@ STAGE PLANS: Select Operator expressions: VALUE._col0 (type: bigint), KEY.reducesinkkey0 (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat diff --git ql/src/test/results/clientpositive/remove_exprs_stats.q.out ql/src/test/results/clientpositive/remove_exprs_stats.q.out index 8fe688d..567e6b2 100644 --- ql/src/test/results/clientpositive/remove_exprs_stats.q.out +++ ql/src/test/results/clientpositive/remove_exprs_stats.q.out @@ -449,14 +449,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid) IN (5) (type: boolean) - Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -485,14 +485,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid) IN (5, 2, 3) (type: boolean) - Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -521,14 +521,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid) IN (1, 6) (type: boolean) - Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat