diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 7682791f4d..665ccc5298 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -31,6 +31,7 @@ import java.util.Set; import java.util.Stack; +import net.jpountz.util.SafeUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.Context; @@ -1696,12 +1697,17 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } List distinctVals = Lists.newArrayList(); + + // these ndvs are later used to compuate dangling rows and num of nulls for outer joins + List ndvsDangling= Lists.newArrayList(); long denom = 1; + long denomDangling = 1; if (inferredRowCount == -1) { // failed to infer PK-FK relationship for row count estimation fall-back on default logic // compute denominator max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2)) // in case of multi-attribute join List perAttrDVs = Lists.newArrayList(); + // go over each predicate for (int idx = 0; idx < numAttr; idx++) { for (Integer i : joinKeys.keySet()) { String col = joinKeys.get(i).get(idx); @@ -1711,19 +1717,27 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } } distinctVals.add(getDenominator(perAttrDVs)); + ndvsDangling.add(getDenominatorForDanglingRows(perAttrDVs)); perAttrDVs.clear(); } if (numAttr > 1 && conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_CORRELATED_MULTI_KEY_JOINS)) { denom = Collections.max(distinctVals); + denomDangling = denom - ndvsDangling.get(distinctVals.indexOf(denom)); } else if (numAttr > numParent) { // To avoid denominator getting larger and aggressively reducing // number of rows, we will ease out denominator. denom = StatsUtils.addWithExpDecay(distinctVals); + denomDangling = denom - StatsUtils.addWithExpDecay(ndvsDangling); } else { for (Long l : distinctVals) { denom = StatsUtils.safeMult(denom, l); } + long tempDenom = 1; + for (Long l : ndvsDangling) { + tempDenom = StatsUtils.safeMult(tempDenom, l); + } + denomDangling = denom - tempDenom; } } @@ -1754,15 +1768,14 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // update join statistics stats.setColumnStats(outColStats); - // reason we compute interim row count, where join type isn't considered, is because later - // it will be used to estimate num nulls long interimRowCount = inferredRowCount != -1 ? inferredRowCount : computeRowCountAssumingInnerJoin(rowCounts, denom, jop); // final row computation will consider join type long joinRowCount = inferredRowCount != -1 ? inferredRowCount : computeFinalRowCount(rowCounts, interimRowCount, jop); - updateColStats(conf, stats, interimRowCount, joinRowCount, jop, rowCountParents); + long danglingRows = computeRowCountAssumingInnerJoin(rowCounts, denomDangling, jop) - joinRowCount; + updateColStats(conf, stats, danglingRows, joinRowCount, jop, rowCountParents); // evaluate filter expression and update statistics if (joinRowCount != -1 && jop.getConf().getNoOuterJoin() && @@ -2178,11 +2191,11 @@ private void updateNumNulls(ColStatistics colStats, long interimNumRows, long ne && interimNumRows != newNumRows) { // interim row count can not be less due to containment // assumption in join cardinality computation - assert (newNumRows > interimNumRows); + //assert (newNumRows > interimNumRows); if (isJoinKey(colStats.getColumnName(), jop.getConf().getJoinKeys())) { - newNumNulls = Math.min(newNumRows, (newNumRows - interimNumRows)); + newNumNulls = Math.min(newNumRows, (interimNumRows)); } else { - newNumNulls = Math.min(newNumRows, oldNumNulls + (newNumRows - interimNumRows)); + newNumNulls = Math.min(newNumRows, oldNumNulls + (interimNumRows)); } } break; @@ -2194,20 +2207,21 @@ private void updateNumNulls(ColStatistics colStats, long interimNumRows, long ne // assumption in join cardinality computation // interimNumRows represent number of matches for join keys on two sides. // newNumRows-interimNumRows represent number of non-matches. - assert (newNumRows > interimNumRows); + //assert (newNumRows > interimNumRows); if (isJoinKey(colStats.getColumnName(), jop.getConf().getJoinKeys())) { - newNumNulls = Math.min(newNumRows, (newNumRows - interimNumRows)); + newNumNulls = Math.min(newNumRows, ( interimNumRows)); } else { - newNumNulls = Math.min(newNumRows, oldNumNulls + (newNumRows - interimNumRows)); + // TODO: oldNumNulls should be scaled instead of taken as it is + newNumNulls = Math.min(newNumRows, oldNumNulls + (interimNumRows)); } } break; case JoinDesc.FULL_OUTER_JOIN: if (isJoinKey(colStats.getColumnName(), jop.getConf().getJoinKeys())) { - newNumNulls = Math.min(newNumRows, (newNumRows - interimNumRows)); + newNumNulls = Math.min(newNumRows, (interimNumRows)); } else { - newNumNulls = Math.min(newNumRows, oldNumNulls + (newNumRows - interimNumRows)); + newNumNulls = Math.min(newNumRows, oldNumNulls + (interimNumRows)); } break; @@ -2379,6 +2393,39 @@ private void updateJoinColumnsNDV(Map> joinKeys, } } + private long getDenominatorForDanglingRows(List distinctVals) { + + if (distinctVals.isEmpty()) { + return 2; + } + + // simple join from 2 relations: denom = min(v1, v2) + if (distinctVals.size() <= 2) { + return Collections.min(distinctVals); + } else { + + // remember max value and ignore it from the denominator + long maxNDV = distinctVals.get(0); + int maxIdx = 0; + + for (int i = 1; i < distinctVals.size(); i++) { + if (distinctVals.get(i) > maxNDV) { + maxNDV = distinctVals.get(i); + maxIdx = i; + } + } + + // join from multiple relations: + // denom = Product of all NDVs except the greatest of all + long denom = 1; + for (int i = 0; i < distinctVals.size(); i++) { + if (i != maxIdx) { + denom = StatsUtils.safeMult(denom, distinctVals.get(i)); + } + } + return denom; + } + } private long getDenominator(List distinctVals) { if (distinctVals.isEmpty()) {