diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index f4dffcd..838d6b1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -18,11 +18,8 @@ package org.apache.hadoop.hive.ql.optimizer.stats.annotation; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Stack; - +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -69,8 +66,10 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.serde.serdeConstants; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Stack; public class StatsRulesProcFactory { @@ -921,8 +920,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + " #Rows of parents: " + rowCountParents.toString() + ". Denominator: " + denom); } - stats.setNumRows(newRowCount); - stats.setDataSize(StatsUtils.getDataSizeFromColumnStats(newRowCount, outColStats)); + updateStatsForJoinType(stats, newRowCount, true, jop.getConf()); jop.setStatistics(stats); if (LOG.isDebugEnabled()) { @@ -968,6 +966,39 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return null; } + private void updateStatsForJoinType(Statistics stats, long newNumRows, + boolean useColStats, JoinDesc conf) { + long oldRowCount = stats.getNumRows(); + double ratio = (double) newNumRows / (double) oldRowCount; + stats.setNumRows(newNumRows); + + if (useColStats) { + List colStats = stats.getColumnStats(); + for (ColStatistics cs : colStats) { + long oldDV = cs.getCountDistint(); + long newDV = oldDV; + + // if ratio is greater than 1, then number of rows increases. This can happen + // when some operators like GROUPBY duplicates the input rows in which case + // number of distincts should not change. Update the distinct count only when + // the output number of rows is less than input number of rows. + if (ratio <= 1.0) { + newDV = (long) Math.ceil(ratio * oldDV); + } + // Assumes inner join + // TODO: HIVE-5579 will handle different join types + cs.setNumNulls(0); + cs.setCountDistint(newDV); + } + stats.setColumnStats(colStats); + long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats); + stats.setDataSize(newDataSize); + } else { + long newDataSize = (long) (ratio * stats.getDataSize()); + stats.setDataSize(newDataSize); + } + } + private long computeNewRowCount(List rowCountParents, long denom) { double factor = 0.0d; long result = 1; diff --git ql/src/test/results/clientpositive/annotate_stats_join.q.out ql/src/test/results/clientpositive/annotate_stats_join.q.out index fba3664..9a1f32c 100644 --- ql/src/test/results/clientpositive/annotate_stats_join.q.out +++ ql/src/test/results/clientpositive/annotate_stats_join.q.out @@ -551,14 +551,14 @@ STAGE PLANS: 1 {KEY.reducesinkkey0} {VALUE._col0} 2 {VALUE._col0} {KEY.reducesinkkey0} {VALUE._col1} {VALUE._col2} outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col9, _col10, _col11, _col12 - Statistics: Num rows: 47 Data size: 13900 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col5 (type: int), _col6 (type: string), _col9 (type: string), _col10 (type: int), _col11 (type: bigint), _col12 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 47 Data size: 13900 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 47 Data size: 13900 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -631,14 +631,14 @@ STAGE PLANS: 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} 2 {KEY.reducesinkkey1} {KEY.reducesinkkey0} {VALUE._col0} {VALUE._col1} outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col9, _col10, _col11, _col12 - Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col5 (type: int), _col6 (type: string), _col9 (type: string), _col10 (type: int), _col11 (type: bigint), _col12 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 - Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat diff --git ql/src/test/results/clientpositive/union20.q.out ql/src/test/results/clientpositive/union20.q.out index 450f82b..663d128 100644 --- ql/src/test/results/clientpositive/union20.q.out +++ ql/src/test/results/clientpositive/union20.q.out @@ -130,7 +130,7 @@ STAGE PLANS: 0 {KEY.reducesinkkey0} {VALUE._col0} 1 {KEY.reducesinkkey0} {VALUE._col0} outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 36 Data size: 19584 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 36 Data size: 9792 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) outputColumnNames: _col0, _col1, _col2, _col3