diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 4b9f292..dad5d5b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -150,28 +150,28 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Statistics parentStats = parent.getStatistics(); AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx; HiveConf conf = aspCtx.getConf(); + Statistics stats = null; - // SELECT (*) does not change the statistics. Just pass on the parent statistics - if (sop.getConf().isSelectStar()) { + if (parentStats != null) { try { - if (parentStats != null) { - sop.setStatistics(parentStats.clone()); - } + stats = parentStats.clone(); } catch (CloneNotSupportedException e) { throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg()); } - return null; } try { if (satisfyPrecondition(parentStats)) { - Statistics stats = parentStats.clone(); - List colStats = - StatsUtils.getColStatisticsFromExprMap(conf, parentStats, sop.getColumnExprMap(), - sop.getSchema()); - long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats); + // this will take care of mapping between input column names and output column names. The + // returned column stats will have the output column names. + List colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats, + sop.getColumnExprMap(), sop.getSchema()); stats.setColumnStats(colStats); - stats.setDataSize(setMaxIfInvalid(dataSize)); + // in case of select(*) the data size does not change + if (!sop.getConf().isSelectStar()) { + long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats); + stats.setDataSize(setMaxIfInvalid(dataSize)); + } sop.setStatistics(stats); if (isDebugEnabled) { diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index d42ede4..c010e7d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1018,7 +1018,20 @@ public static long getWritableSize(ObjectInspector oi, Object value) { colStat.setColumnName(outColName); colStat.setTableAlias(outTabAlias); } - cs.add(colStat); + if (colStat != null) { + cs.add(colStat); + } + } + + return cs; + } + + // In cases where column expression map or row schema is missing, just pass on the parent column + // stats. This could happen in cases like TS -> FIL where FIL does not map input column names to + // internal names. + if (colExprMap == null || rowSchema == null) { + if (parentStats.getColumnStats() != null) { + cs.addAll(parentStats.getColumnStats()); } } return cs; diff --git ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out index 040dd4e..c91e2f3 100644 --- ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out +++ ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out @@ -337,7 +337,7 @@ STAGE PLANS: alias: s Statistics: Num rows: 12 Data size: 3143 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (s_store_sk is not null and (s_store_sk > 0)) (type: boolean) + predicate: (s_store_sk > 0) (type: boolean) Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: s_store_sk (type: int) @@ -348,13 +348,13 @@ STAGE PLANS: alias: ss Statistics: Num rows: 1000 Data size: 130523 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (ss_store_sk is not null and (ss_store_sk > 0)) (type: boolean) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + predicate: (ss_store_sk > 0) (type: boolean) + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: ss_store_sk (type: int) sort order: + Map-reduce partition columns: ss_store_sk (type: int) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Join Operator condition map: @@ -363,14 +363,14 @@ STAGE PLANS: 0 {KEY.reducesinkkey0} 1 outputColumnNames: _col0 - Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -655,7 +655,7 @@ STAGE PLANS: alias: s1 Statistics: Num rows: 12 Data size: 3143 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (s_store_sk is not null and (s_store_sk > 1000)) (type: boolean) + predicate: (s_store_sk > 1000) (type: boolean) Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: s_store_sk (type: int) @@ -666,7 +666,7 @@ STAGE PLANS: alias: s Statistics: Num rows: 12 Data size: 3143 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (s_store_sk is not null and (s_store_sk > 1000)) (type: boolean) + predicate: (s_store_sk > 1000) (type: boolean) Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: s_store_sk (type: int) @@ -677,13 +677,13 @@ STAGE PLANS: alias: ss Statistics: Num rows: 1000 Data size: 130523 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (ss_store_sk is not null and (ss_store_sk > 1000)) (type: boolean) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + predicate: (ss_store_sk > 1000) (type: boolean) + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: ss_store_sk (type: int) sort order: + Map-reduce partition columns: ss_store_sk (type: int) - Statistics: Num rows: 321 Data size: 1236 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 333 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Join Operator condition map: @@ -694,14 +694,14 @@ STAGE PLANS: 1 2 outputColumnNames: _col0 - Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 37 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 37 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 37 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat diff --git ql/src/test/results/clientpositive/lateral_view.q.out ql/src/test/results/clientpositive/lateral_view.q.out index 8b87c37..2911a12 100644 --- ql/src/test/results/clientpositive/lateral_view.q.out +++ ql/src/test/results/clientpositive/lateral_view.q.out @@ -142,10 +142,10 @@ STAGE PLANS: Statistics: Num rows: 1000 Data size: 28000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 3 - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -166,10 +166,10 @@ STAGE PLANS: Statistics: Num rows: 1000 Data size: 28000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 3 - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -208,10 +208,10 @@ STAGE PLANS: Select Operator expressions: _col5 (type: int) outputColumnNames: _col5 - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -235,7 +235,7 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -265,10 +265,10 @@ STAGE PLANS: Select Operator expressions: _col5 (type: int) outputColumnNames: _col5 - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -292,7 +292,7 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -339,14 +339,14 @@ STAGE PLANS: Lateral View Forward Statistics: Num rows: 1000 Data size: 24000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 3 Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE @@ -366,11 +366,11 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 3 Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE @@ -394,14 +394,14 @@ STAGE PLANS: Lateral View Forward Statistics: Num rows: 1000 Data size: 24000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 3 Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE @@ -421,11 +421,11 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 48000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 3 Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE diff --git ql/src/test/results/clientpositive/lateral_view_noalias.q.out ql/src/test/results/clientpositive/lateral_view_noalias.q.out index 9b6e9c4..e1445bf 100644 --- ql/src/test/results/clientpositive/lateral_view_noalias.q.out +++ ql/src/test/results/clientpositive/lateral_view_noalias.q.out @@ -28,10 +28,10 @@ STAGE PLANS: Statistics: Num rows: 1000 Data size: 192000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 2 - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -52,10 +52,10 @@ STAGE PLANS: Statistics: Num rows: 1000 Data size: 192000 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 2 - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat