diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 1feb1fd..1556de9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -150,28 +150,28 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Statistics parentStats = parent.getStatistics(); AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx; HiveConf conf = aspCtx.getConf(); + Statistics stats = null; - // SELECT (*) does not change the statistics. Just pass on the parent statistics - if (sop.getConf().isSelectStar()) { + if (parentStats != null) { try { - if (parentStats != null) { - sop.setStatistics(parentStats.clone()); - } + stats = parentStats.clone(); } catch (CloneNotSupportedException e) { throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg()); } - return null; } try { if (satisfyPrecondition(parentStats)) { - Statistics stats = parentStats.clone(); - List colStats = - StatsUtils.getColStatisticsFromExprMap(conf, parentStats, sop.getColumnExprMap(), - sop.getSchema()); - long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats); + // this will take care of mapping between input column names and output column names. The + // returned column stats will have the output column names. + List colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats, + sop.getColumnExprMap(), sop.getSchema()); stats.setColumnStats(colStats); - stats.setDataSize(setMaxIfInvalid(dataSize)); + // in case of select(*) the data size does not change + if (!sop.getConf().isSelectStar()) { + long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats); + stats.setDataSize(setMaxIfInvalid(dataSize)); + } sop.setStatistics(stats); if (isDebugEnabled) { diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index d42ede4..87dbe6f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1020,6 +1020,15 @@ public static long getWritableSize(ObjectInspector oi, Object value) { } cs.add(colStat); } + + return cs; + } + + // In cases where column expression map or row schema is missing, just pass on the parent column + // stats. This could happen in cases like TS -> FIL where FIL does not map input column names to + // internal names. + if (colExprMap == null || rowSchema == null) { + cs.addAll(parentStats.getColumnStats()); } return cs; } diff --git ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out index 040dd4e..c91e2f3 100644 --- ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out +++ ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out @@ -337,7 +337,7 @@ STAGE PLANS: alias: s Statistics: Num rows: 12 Data size: 3143 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (s_store_sk is not null and (s_store_sk > 0)) (type: boolean) + predicate: (s_store_sk > 0) (type: boolean) Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: s_store_sk (type: int) @@ -348,13 +348,13 @@ STAGE PLANS: alias: ss Statistics: Num rows: 1000 Data size: 130523 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (ss_store_sk is not null and (ss_store_sk > 0)) (type: boolean) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + predicate: (ss_store_sk > 0) (type: boolean) + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: ss_store_sk (type: int) sort order: + Map-reduce partition columns: ss_store_sk (type: int) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Join Operator condition map: @@ -363,14 +363,14 @@ STAGE PLANS: 0 {KEY.reducesinkkey0} 1 outputColumnNames: _col0 - Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -655,7 +655,7 @@ STAGE PLANS: alias: s1 Statistics: Num rows: 12 Data size: 3143 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (s_store_sk is not null and (s_store_sk > 1000)) (type: boolean) + predicate: (s_store_sk > 1000) (type: boolean) Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: s_store_sk (type: int) @@ -666,7 +666,7 @@ STAGE PLANS: alias: s Statistics: Num rows: 12 Data size: 3143 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (s_store_sk is not null and (s_store_sk > 1000)) (type: boolean) + predicate: (s_store_sk > 1000) (type: boolean) Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: s_store_sk (type: int) @@ -677,13 +677,13 @@ STAGE PLANS: alias: ss Statistics: Num rows: 1000 Data size: 130523 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (ss_store_sk is not null and (ss_store_sk > 1000)) (type: boolean) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + predicate: (ss_store_sk > 1000) (type: boolean) + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: ss_store_sk (type: int) sort order: + Map-reduce partition columns: ss_store_sk (type: int) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Join Operator condition map: @@ -694,14 +694,14 @@ STAGE PLANS: 1 2 outputColumnNames: _col0 - Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 37 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 37 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 37 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat