diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 8bba7b6..860ff1a 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -18,8 +18,11 @@
 
 package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
 
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
+import java.lang.reflect.Field;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -53,6 +56,7 @@
 import org.apache.hadoop.hive.ql.plan.GroupByDesc;
 import org.apache.hadoop.hive.ql.plan.JoinDesc;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
 import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.ql.stats.StatsUtils;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
@@ -72,13 +76,8 @@
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 
-import java.lang.reflect.Field;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.Stack;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 
 public class StatsRulesProcFactory {
 
@@ -1053,7 +1052,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
       // statistics object that is combination of statistics from all
       // relations involved in JOIN
       Statistics stats = new Statistics();
-      Map rowCountParents = new HashMap();
+      Map rowCountParents = Maps.newHashMap();
       List distinctVals = Lists.newArrayList();
       int numParent = parents.size();
       Map joinedColStats = Maps.newHashMap();
@@ -1072,7 +1071,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
       // get the join keys from parent ReduceSink operators
       for (int pos = 0; pos < parents.size(); pos++) {
         ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
-
+        ReduceSinkDesc rsConf = parent.getConf();
         Statistics parentStats = parent.getStatistics();
         keyExprs = parent.getConf().getOutputKeyColumnNames();
 
@@ -1083,10 +1082,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
         // propagated properly. UNION operator does not propagate the table
         // alias of subqueries properly to expression nodes. Hence union20.q
         // will have wrong number of rows.
-        Set tableAliases = StatsUtils.getAllTableAlias(parent.getColumnExprMap());
-        for (String tabAlias : tableAliases) {
-          rowCountParents.put(tabAlias, parentStats.getNumRows());
-        }
+        String tabAlias = String.valueOf(rsConf.getTag());
+        rowCountParents.put(tabAlias, parentStats.getNumRows());
         rowCounts.add(parentStats.getNumRows());
 
         // compute fully qualified join key column names. this name will be
@@ -1094,12 +1091,13 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
         // TODO: expressions in join condition will be ignored. assign
         // internal name for expressions and estimate column statistics for expression.
         List fqCols = StatsUtils.getFullyQualifedReducerKeyNames(keyExprs,
-            parent.getColumnExprMap());
+            parent.getColumnExprMap(), tabAlias);
         joinKeys.put(pos, fqCols);
 
         // get column statistics for all output columns
         for (ColStatistics cs : parentStats.getColumnStats()) {
-          joinedColStats.put(cs.getFullyQualifiedColName(), cs);
+          joinedColStats
+              .put(StatsUtils.getFullyQualifiedColumnName(tabAlias, cs.getColumnName()), cs);
         }
 
         // since new statistics is derived from all relations involved in
@@ -1155,21 +1153,23 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
       Map colExprMap = jop.getColumnExprMap();
       RowSchema rs = jop.getSchema();
       List outColStats = Lists.newArrayList();
-      Map outInTabAlias = new HashMap();
+      Map reversedExpr = jop.getConf().getReversedExprs();
+      Map inOutTabAlias = Maps.newHashMap();
       for (ColumnInfo ci : rs.getSignature()) {
         String key = ci.getInternalName();
         ExprNodeDesc end = colExprMap.get(key);
         if (end instanceof ExprNodeColumnDesc) {
           String colName = ((ExprNodeColumnDesc) end).getColumn();
           String tabAlias = ((ExprNodeColumnDesc) end).getTabAlias();
+          if (tabAlias == null || tabAlias.isEmpty()) {
+            tabAlias = String.valueOf(reversedExpr.get(key));
+          }
+          inOutTabAlias.put(tabAlias, ci.getTabAlias());
           String fqColName = StatsUtils.getFullyQualifiedColumnName(tabAlias, colName);
           ColStatistics cs = joinedColStats.get(fqColName);
-          String outColName = key;
-          String outTabAlias = ci.getTabAlias();
-          outInTabAlias.put(outTabAlias, tabAlias);
           if (cs != null) {
-            cs.setColumnName(outColName);
-            cs.setTableAlias(outTabAlias);
+            cs.setColumnName(key);
+            cs.setTableAlias(tabAlias);
           }
           outColStats.add(cs);
         }
@@ -1178,7 +1178,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
       // update join statistics
       stats.setColumnStats(outColStats);
       long newRowCount = pkfkInferred ? newNumRows : computeNewRowCount(rowCounts, denom);
-      updateStatsForJoinType(stats, newRowCount, jop, rowCountParents,outInTabAlias);
+      updateStatsForJoinType(stats, newRowCount, jop, rowCountParents, inOutTabAlias);
       jop.setStatistics(stats);
 
       if (isDebugEnabled) {
@@ -1366,7 +1366,7 @@ private float getSelectivityComplexTree(Operator op) {
         ReduceSinkOperator rsOp = (ReduceSinkOperator) op;
         List keys = rsOp.getConf().getOutputKeyColumnNames();
         List fqCols = StatsUtils.getFullyQualifedReducerKeyNames(keys,
-            rsOp.getColumnExprMap());
+            rsOp.getColumnExprMap(), null);
         if (fqCols.size() == 1) {
           String joinCol = fqCols.get(0);
           if (rsOp.getStatistics() != null) {
@@ -1397,7 +1397,7 @@ private float getSelectivityComplexTree(Operator op) {
         ReduceSinkOperator rsOp = (ReduceSinkOperator) op;
         List keys = rsOp.getConf().getOutputKeyColumnNames();
         List fqCols = StatsUtils.getFullyQualifedReducerKeyNames(keys,
-            rsOp.getColumnExprMap());
+            rsOp.getColumnExprMap(), null);
         if (fqCols.size() == 1) {
           String joinCol = fqCols.get(0);
           if (rsOp.getStatistics() != null) {
@@ -1429,8 +1429,7 @@ private Long getEasedOutDenominator(List distinctVals) {
 
   private void updateStatsForJoinType(Statistics stats, long newNumRows,
       CommonJoinOperator jop,
-      Map rowCountParents,
-      Map outInTabAlias) {
+      Map rowCountParents, Map inOutTabAlias) {
 
     if (newNumRows < 0) {
       LOG.info("STATS-" + jop.toString() + ": Overflow in number of rows."
@@ -1447,7 +1446,7 @@ private void updateStatsForJoinType(Statistics stats, long newNumRows,
       // and stats for columns from 2nd parent should be scaled down by 200x
      List colStats = stats.getColumnStats();
       for (ColStatistics cs : colStats) {
-        long oldRowCount = rowCountParents.get(outInTabAlias.get(cs.getTableAlias()));
+        long oldRowCount = rowCountParents.get(cs.getTableAlias());
         double ratio = (double) newNumRows / (double) oldRowCount;
         long oldDV = cs.getCountDistint();
         long newDV = oldDV;
@@ -1463,6 +1462,11 @@ private void updateStatsForJoinType(Statistics stats, long newNumRows,
         // TODO: HIVE-5579 will handle different join types
         cs.setNumNulls(0);
         cs.setCountDistint(newDV);
+
+        // update the table alias to output table alias
+        String inTabAlias = cs.getTableAlias();
+        String outTabAlias = inOutTabAlias.get(inTabAlias);
+        cs.setTableAlias(outTabAlias);
       }
       stats.setColumnStats(colStats);
       long newDataSize = StatsUtils
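The core of the change above is keying per-parent join statistics by the ReduceSink tag rather than by table alias, which UNION subqueries can lose (the union20.q comment in the patch). A minimal standalone sketch of why the tag makes a collision-free key — `JoinStatsSketch` and `rowCountsByTag` are hypothetical names for illustration, not Hive code:

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustrative only: mirrors the patched loop in StatsRulesProcFactory,
// where each join input is identified by its ReduceSink tag (0, 1, 2, ...).
public class JoinStatsSketch {

  // Two parents that both lost their table alias (e.g. through a UNION)
  // would collide in an alias-keyed map; tag keys stay distinct.
  static Map<String, Long> rowCountsByTag(List<Long> parentRowCounts) {
    Map<String, Long> rowCountParents = new HashMap<>();
    for (int tag = 0; tag < parentRowCounts.size(); tag++) {
      // corresponds to: String tabAlias = String.valueOf(rsConf.getTag());
      rowCountParents.put(String.valueOf(tag), parentRowCounts.get(tag));
    }
    return rowCountParents;
  }

  public static void main(String[] args) {
    // two distinct entries, 0=1000 and 1=200, even with no usable alias
    System.out.println(rowCountsByTag(List.of(1000L, 200L)));
  }
}
```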
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 30f63a2..190f004 100644
--- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -18,10 +18,14 @@
 
 package org.apache.hadoop.hive.ql.stats;
 
-import com.google.common.base.Joiner;
-import com.google.common.collect.Lists;
-import com.google.common.math.DoubleMath;
-import com.google.common.math.LongMath;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -86,14 +90,9 @@
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.tez.mapreduce.hadoop.MRJobConfig;
 
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+import com.google.common.math.LongMath;
 
 public class StatsUtils {
 
@@ -1348,10 +1347,12 @@ private static String getFullyQualifiedName(String... names) {
    *          - output key names
    * @param map
    *          - column expression map
+   * @param tableAlias
+   *          - use the provided table alias
    * @return list of fully qualified names
    */
   public static List getFullyQualifedReducerKeyNames(List keyExprs,
-      Map map) {
+      Map map, String tableAlias) {
     List result = Lists.newArrayList();
     if (keyExprs != null) {
       for (String key : keyExprs) {
@@ -1367,14 +1368,15 @@ private static String getFullyQualifiedName(String... names) {
         }
         if (end instanceof ExprNodeColumnDesc) {
           ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
-          String tabAlias = encd.getTabAlias();
+          String tabAlias = tableAlias == null ? encd.getTabAlias() : tableAlias;
           result.add(getFullyQualifiedColumnName(tabAlias, colName));
         } else if (end instanceof ExprNodeGenericFuncDesc) {
           ExprNodeGenericFuncDesc enf = (ExprNodeGenericFuncDesc) end;
           String tabAlias = "";
           for (ExprNodeDesc childEnd : enf.getChildren()) {
             if (childEnd instanceof ExprNodeColumnDesc) {
-              tabAlias = ((ExprNodeColumnDesc) childEnd).getTabAlias();
+              tabAlias =
                  tableAlias == null ? ((ExprNodeColumnDesc) childEnd).getTabAlias() : tableAlias;
               break;
             }
           }
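The new `tableAlias` parameter of `getFullyQualifedReducerKeyNames` simply overrides whatever alias the column expression carries; passing `null` keeps the old behavior, which is what both `getSelectivity*` call sites do. A hedged sketch of that rule — `AliasOverrideSketch` and `qualify` are illustrative stand-ins, not the patched method itself:

```java
// Illustrative only: qualify() condenses the patched alias rule plus
// StatsUtils.getFullyQualifiedColumnName (which joins parts with a dot).
public class AliasOverrideSketch {

  // A caller-supplied tableAlias (the ReduceSink tag as a string) wins;
  // null falls back to the alias on the ExprNodeColumnDesc.
  static String qualify(String tableAlias, String exprTabAlias, String colName) {
    String tabAlias = (tableAlias == null) ? exprTabAlias : tableAlias;
    return tabAlias + "." + colName;
  }

  public static void main(String[] args) {
    System.out.println(qualify("0", "s", "key"));  // 0.key  (tag overrides)
    System.out.println(qualify(null, "s", "key")); // s.key  (old fallback)
  }
}
```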
diff --git ql/src/test/results/clientpositive/annotate_stats_join.q.out ql/src/test/results/clientpositive/annotate_stats_join.q.out
index 7655f5b..82631bd 100644
--- ql/src/test/results/clientpositive/annotate_stats_join.q.out
+++ ql/src/test/results/clientpositive/annotate_stats_join.q.out
@@ -202,14 +202,14 @@ STAGE PLANS:
                 0 _col1 (type: int)
                 1 _col0 (type: int)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4
-              Statistics: Num rows: 41 Data size: 4059 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: string)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4
-                Statistics: Num rows: 41 Data size: 4059 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 41 Data size: 4059 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -276,14 +276,14 @@ STAGE PLANS:
                 0 _col1 (type: int), _col0 (type: string)
                 1 _col0 (type: int), _col1 (type: string)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4
-              Statistics: Num rows: 6 Data size: 594 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: string)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4
-                Statistics: Num rows: 6 Data size: 594 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 6 Data size: 594 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -346,14 +346,14 @@ STAGE PLANS:
                 0 _col1 (type: int), _col0 (type: string)
                 1 _col0 (type: int), _col1 (type: string)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4
-              Statistics: Num rows: 6 Data size: 594 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: string)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4
-                Statistics: Num rows: 6 Data size: 594 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 6 Data size: 594 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 6 Data size: 1164 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -420,14 +420,14 @@ STAGE PLANS:
                 0 _col1 (type: int), _col0 (type: string), _col0 (type: string)
                 1 _col0 (type: int), _col1 (type: string), _col1 (type: string)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4
-              Statistics: Num rows: 11 Data size: 1089 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 11 Data size: 2134 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: string)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4
-                Statistics: Num rows: 11 Data size: 1089 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 11 Data size: 2134 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 11 Data size: 1089 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 11 Data size: 2134 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -513,14 +513,14 @@ STAGE PLANS:
                 1 _col0 (type: int)
                 2 _col1 (type: int)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
-              Statistics: Num rows: 1536 Data size: 152064 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 658 Data size: 192794 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: string), _col5 (type: string), _col6 (type: int), _col7 (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
-                Statistics: Num rows: 1536 Data size: 152064 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 658 Data size: 192794 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 1536 Data size: 152064 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 658 Data size: 192794 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -604,14 +604,14 @@ STAGE PLANS:
                 1 _col0 (type: int)
                 2 _col1 (type: int)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
-              Statistics: Num rows: 47 Data size: 4794 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: string), _col5 (type: string), _col6 (type: int), _col7 (type: bigint), _col8 (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
-                Statistics: Num rows: 47 Data size: 4794 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 47 Data size: 4794 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -696,14 +696,14 @@ STAGE PLANS:
                 1 _col0 (type: int), _col1 (type: string)
                 2 _col1 (type: int), _col0 (type: string)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
-              Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col3 (type: int), _col4 (type: string), _col5 (type: string), _col6 (type: int), _col7 (type: bigint), _col8 (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
-                Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
diff --git ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
index b104a19..66e0e9f 100644
--- ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
+++ ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
@@ -1001,7 +1001,7 @@ STAGE PLANS:
                 0 _col0 (type: int)
                 1 _col0 (type: int)
               outputColumnNames: _col1
-              Statistics: Num rows: 1017 Data size: 4068 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
               File Output Operator
                 compressed: false
                 table:
@@ -1017,7 +1017,7 @@ STAGE PLANS:
               key expressions: _col1 (type: int)
               sort order: +
               Map-reduce partition columns: _col1 (type: int)
-              Statistics: Num rows: 1017 Data size: 4068 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
           TableScan
             alias: s
             Statistics: Num rows: 12 Data size: 3143 Basic stats: COMPLETE Column stats: COMPLETE
@@ -1041,14 +1041,14 @@ STAGE PLANS:
                 0 _col1 (type: int)
                 1 _col0 (type: int)
               outputColumnNames: _col3
-              Statistics: Num rows: 1017 Data size: 4068 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col3 (type: int)
                 outputColumnNames: _col0
-                Statistics: Num rows: 1017 Data size: 4068 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 1017 Data size: 4068 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 210 Data size: 840 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
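A sanity check on the updated golden files: in the five-column plans above, Num rows is unchanged while Data size grows, and the implied average row width jumps from 99 to 194 bytes in every one of them (4059/41, 594/6, 1089/11 before; 7954/41, 1164/6, 2134/11 after). Plausibly this is the fixed alias mapping letting the joinedColStats lookups succeed, so wide string columns are now counted in the estimate. A trivial sketch of that arithmetic — `RowWidthCheck` is illustrative, and the widths are read off these hunks, not Hive constants:

```java
// Data size in these plans behaves as numRows * averageRowWidth; the 99 and
// 194 byte widths are observations about these particular golden files.
public class RowWidthCheck {

  static long dataSize(long numRows, long avgRowWidth) {
    return numRows * avgRowWidth;
  }

  public static void main(String[] args) {
    System.out.println(dataSize(41, 99));  // 4059, the old Data size
    System.out.println(dataSize(41, 194)); // 7954, the new Data size
  }
}
```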