diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 7fdba60..987ecc2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -20,6 +20,7 @@ import com.google.common.collect.Lists; import com.google.common.collect.Maps; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -811,6 +812,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // 2 relations, multiple attributes boolean multiAttr = false; int numAttr = 1; + int numParent = parents.size(); Map joinedColStats = Maps.newHashMap(); Map> joinKeys = Maps.newHashMap(); @@ -873,12 +875,19 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, perAttrDVs.add(cs.getCountDistint()); } } + distinctVals.add(getDenominator(perAttrDVs)); perAttrDVs.clear(); } - for (Long l : distinctVals) { - denom *= l; + if (numAttr > numParent) { + // To avoid denominator getting larger and aggressively reducing + // number of rows, we will ease out denominator. + denom = getEasedOutDenominator(distinctVals); + } else { + for (Long l : distinctVals) { + denom *= l; + } } } else { for (List jkeys : joinKeys.values()) { @@ -983,6 +992,20 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return null; } + private Long getEasedOutDenominator(List<Long> distinctVals) { + // Exponential back-off for NDVs. + // 1) Descending order sort of NDVs + // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4)) * ....
+ Collections.sort(distinctVals, Collections.reverseOrder()); + + long denom = distinctVals.get(0); + for (int i = 1; i < distinctVals.size(); i++) { + denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i))); + } + + return denom; + } + private void updateStatsForJoinType(Statistics stats, long newNumRows, JoinDesc conf, Map rowCountParents, Map outInTabAlias) { diff --git ql/src/test/results/clientpositive/annotate_stats_join.q.out ql/src/test/results/clientpositive/annotate_stats_join.q.out index 3529ed3..c9ad41d 100644 --- ql/src/test/results/clientpositive/annotate_stats_join.q.out +++ ql/src/test/results/clientpositive/annotate_stats_join.q.out @@ -391,17 +391,17 @@ STAGE PLANS: 0 {KEY.reducesinkkey1} {KEY.reducesinkkey0} {VALUE._col0} 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} outputColumnNames: _col0, _col1, _col2, _col6, _col7 - Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 11 Data size: 2134 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (((_col1 = _col6) and (_col0 = _col7)) and (_col7 = _col0)) (type: boolean) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col6 (type: int), _col7 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat