diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 8bba7b6..5ba5cea 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -18,8 +18,13 @@
 package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
 
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
+import java.lang.reflect.Field;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -72,13 +77,8 @@
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 
-import java.lang.reflect.Field;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.Stack;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 
 public class StatsRulesProcFactory {
 
@@ -1582,6 +1582,8 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       LimitOperator lop = (LimitOperator) nd;
       Operator<? extends OperatorDesc> parent = lop.getParentOperators().get(0);
       Statistics parentStats = parent.getStatistics();
+      AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
+      HiveConf conf = aspCtx.getConf();
 
       try {
         long limit = -1;
@@ -1589,6 +1591,9 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
 
         if (satisfyPrecondition(parentStats)) {
           Statistics stats = parentStats.clone();
+          List<ColStatistics> colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats,
+              lop.getColumnExprMap(), lop.getSchema());
+          stats.setColumnStats(colStats);
 
           // if limit is greater than available rows then do not update
           // statistics
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 30f63a2..2b16d9d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -18,10 +18,14 @@
 package org.apache.hadoop.hive.ql.stats;
 
-import com.google.common.base.Joiner;
-import com.google.common.collect.Lists;
-import com.google.common.math.DoubleMath;
-import com.google.common.math.LongMath;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -86,14 +90,9 @@
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.tez.mapreduce.hadoop.MRJobConfig;
 
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+import com.google.common.math.LongMath;
 
 public class StatsUtils {
 
@@ -1011,6 +1010,7 @@ public static long getWritableSize(ObjectInspector oi, Object value) {
       Statistics parentStats, Map<String, ExprNodeDesc> colExprMap, RowSchema rowSchema) {
     List<ColStatistics> cs = Lists.newArrayList();
+    if (colExprMap != null && rowSchema != null) {
       for (ColumnInfo ci : rowSchema.getSignature()) {
         String outColName = ci.getInternalName();
@@ -1020,8 +1020,29 @@ public static long getWritableSize(ObjectInspector oi, Object value) {
         if (colStat != null) {
           colStat.setColumnName(outColName);
           colStat.setTableAlias(outTabAlias);
+          cs.add(colStat);
         }
+      }
+
+      return cs;
+    }
+
+    // In cases where column expression map is missing but row schema is present,
+    // we just pass on the parent column stats with the table alias in the schema.
+    if (colExprMap == null && rowSchema != null) {
+      for (ColumnInfo ci : rowSchema.getSignature()) {
+        String outColName = ci.getInternalName();
+        String outTabAlias = ci.getTabAlias();
+        ColStatistics colStat = parentStats.getColumnStatisticsFromColName(outColName);
         if (colStat != null) {
+          try {
+            colStat = colStat.clone();
+          } catch (CloneNotSupportedException e) {
+            colStat = null;
+          }
+        }
+        if (colStat != null) {
+          colStat.setTableAlias(outTabAlias);
           cs.add(colStat);
         }
       }
@@ -1029,13 +1050,11 @@ public static long getWritableSize(ObjectInspector oi, Object value) {
       return cs;
     }
 
-    // In cases where column expression map or row schema is missing, just pass on the parent column
+    // In cases where column expression map is missing, just pass on the parent column
     // stats. This could happen in cases like TS -> FIL where FIL does not map input column names to
     // internal names.
-    if (colExprMap == null || rowSchema == null) {
-      if (parentStats.getColumnStats() != null) {
-        cs.addAll(parentStats.getColumnStats());
-      }
+    if (parentStats.getColumnStats() != null) {
+      cs.addAll(parentStats.getColumnStats());
     }
     return cs;
   }
diff --git ql/src/test/results/clientpositive/annotate_stats_select.q.out ql/src/test/results/clientpositive/annotate_stats_select.q.out
index 8984d02..a4c8c41 100644
--- ql/src/test/results/clientpositive/annotate_stats_select.q.out
+++ ql/src/test/results/clientpositive/annotate_stats_select.q.out
@@ -1062,17 +1062,17 @@ STAGE PLANS:
         Select Operator
           expressions: VALUE._col0 (type: string)
           outputColumnNames: _col0
-          Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+          Statistics: Num rows: 2 Data size: 178 Basic stats: COMPLETE Column stats: COMPLETE
           Limit
             Number of rows: 10
-            Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+            Statistics: Num rows: 2 Data size: 178 Basic stats: COMPLETE Column stats: COMPLETE
            Select Operator
              expressions: _col0 (type: string), 11.0 (type: double)
              outputColumnNames: _col0, _col1
-              Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+              Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
              File Output Operator
                compressed: false
-                Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+                Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
                table:
                    input format: org.apache.hadoop.mapred.TextInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
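
Read on its own, the per-column handling introduced by the new colExprMap == null && rowSchema != null branch of StatsUtils.getColStatisticsFromExprMap is shown in the annotated sketch below. It is condensed from the added lines in the second StatsUtils.java hunk, not an authoritative restatement of the whole method; the enclosing loop, the other branches, and the surrounding declarations are exactly as shown in the hunks above.

    // Abbreviated from the added branch: look up the parent's statistic for one
    // output column and retag it with the table alias from the row schema.
    ColStatistics colStat = parentStats.getColumnStatisticsFromColName(outColName);
    if (colStat != null) {
      try {
        // Clone before mutating so the parent's ColStatistics object is left untouched.
        colStat = colStat.clone();
      } catch (CloneNotSupportedException e) {
        // If the statistic cannot be cloned, drop this column rather than share state.
        colStat = null;
      }
    }
    if (colStat != null) {
      colStat.setTableAlias(outTabAlias);  // alias taken from the ColumnInfo in the row schema
      cs.add(colStat);
    }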