diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 357a474..0d5f0db 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -150,28 +150,28 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Statistics parentStats = parent.getStatistics(); AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx; HiveConf conf = aspCtx.getConf(); + Statistics stats = null; - // SELECT (*) does not change the statistics. Just pass on the parent statistics - if (sop.getConf().isSelectStar()) { + if (parentStats != null) { try { - if (parentStats != null) { - sop.setStatistics(parentStats.clone()); - } + stats = parentStats.clone(); } catch (CloneNotSupportedException e) { throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg()); } - return null; } try { if (satisfyPrecondition(parentStats)) { - Statistics stats = parentStats.clone(); - List colStats = - StatsUtils.getColStatisticsFromExprMap(conf, parentStats, sop.getColumnExprMap(), - sop.getSchema()); - long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats); + // this will take care of mapping between input column names and output column names. The + // returned column stats will have the output column names. + List colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats, + sop.getColumnExprMap(), sop.getSchema()); stats.setColumnStats(colStats); - stats.setDataSize(setMaxIfInvalid(dataSize)); + // in case of select(*) the data size does not change + if (!sop.getConf().isSelectStar() && !sop.getConf().isSelStarNoCompute()) { + long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats); + stats.setDataSize(setMaxIfInvalid(dataSize)); + } sop.setStatistics(stats); if (isDebugEnabled) { @@ -1711,6 +1711,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { Operator op = (Operator) nd; OperatorDesc conf = op.getConf(); + AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx; + HiveConf hconf = aspCtx.getConf(); if (conf != null) { Statistics stats = conf.getStatistics(); @@ -1727,7 +1729,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, stats.addToNumRows(parentStats.getNumRows()); stats.addToDataSize(parentStats.getDataSize()); stats.updateColumnStatsState(parentStats.getColumnStatsState()); - stats.addToColumnStats(parentStats.getColumnStats()); + List colStats = StatsUtils.getColStatisticsFromExprMap(hconf, + parentStats, op.getColumnExprMap(), op.getSchema()); + stats.addToColumnStats(colStats); op.getConf().setStatistics(stats); if (isDebugEnabled) { diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 974f03b..af43d99 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -9755,14 +9755,25 @@ private Operator genLateralViewPlan(QB qb, Operator op, ASTNode lateralViewTree) throws SemanticException { RowResolver lvForwardRR = new RowResolver(); RowResolver source = opParseCtx.get(op).getRowResolver(); + Map lvfColExprMap = new HashMap(); + Map selColExprMap = new HashMap(); + List colList = new ArrayList(); + List colNames = new ArrayList(); for (ColumnInfo col : source.getColumnInfos()) { String[] tabCol = source.reverseLookup(col.getInternalName()); lvForwardRR.put(tabCol[0], tabCol[1], col); + ExprNodeDesc colExpr = new ExprNodeColumnDesc(col.getType(), col.getInternalName(), + col.getTabAlias(), false); + colList.add(colExpr); + colNames.add(colExpr.getName()); + lvfColExprMap.put(col.getInternalName(), colExpr); + selColExprMap.put(col.getInternalName(), colExpr.clone()); } Operator lvForward = putOpInsertMap(OperatorFactory.getAndMakeChild( new LateralViewForwardDesc(), new RowSchema(lvForwardRR.getColumnInfos()), op), lvForwardRR); + lvForward.setColumnExprMap(lvfColExprMap); // The order in which the two paths are added is important. The // lateral view join operator depends on having the select operator @@ -9771,9 +9782,12 @@ private Operator genLateralViewPlan(QB qb, Operator op, ASTNode lateralViewTree) // Get the all path by making a select(*). RowResolver allPathRR = opParseCtx.get(lvForward).getRowResolver(); // Operator allPath = op; + SelectDesc sDesc = new SelectDesc(colList, colNames, false); + sDesc.setSelStarNoCompute(true); Operator allPath = putOpInsertMap(OperatorFactory.getAndMakeChild( - new SelectDesc(true), new RowSchema(allPathRR.getColumnInfos()), + sDesc, new RowSchema(allPathRR.getColumnInfos()), lvForward), allPathRR); + allPath.setColumnExprMap(selColExprMap); int allColumns = allPathRR.getColumnInfos().size(); // Get the UDTF Path QB blankQb = new QB(null, null, false); diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index d10eddf..1b27c31 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1020,7 +1020,20 @@ public static long getWritableSize(ObjectInspector oi, Object value) { colStat.setColumnName(outColName); colStat.setTableAlias(outTabAlias); } - cs.add(colStat); + if (colStat != null) { + cs.add(colStat); + } + } + + return cs; + } + + // In cases where column expression map or row schema is missing, just pass on the parent column + // stats. This could happen in cases like TS -> FIL where FIL does not map input column names to + // internal names. + if (colExprMap == null || rowSchema == null) { + if (parentStats.getColumnStats() != null) { + cs.addAll(parentStats.getColumnStats()); } } return cs; diff --git ql/src/test/results/clientpositive/annotate_stats_groupby.q.out ql/src/test/results/clientpositive/annotate_stats_groupby.q.out index 1459b44..41a0083 100644 --- ql/src/test/results/clientpositive/annotate_stats_groupby.q.out +++ ql/src/test/results/clientpositive/annotate_stats_groupby.q.out @@ -177,17 +177,17 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: int) mode: mergepartial outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 4 Data size: 400 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 800 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: _col0 (type: string), _col1 (type: int), _col2 (type: bigint) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 4 Data size: 400 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 800 Basic stats: COMPLETE Column stats: PARTIAL Group By Operator aggregations: min(_col1) keys: _col0 (type: string), _col2 (type: bigint) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 4 Data size: 416 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 832 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator compressed: false table: @@ -203,7 +203,7 @@ STAGE PLANS: key expressions: _col0 (type: string), _col1 (type: bigint) sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint) - Statistics: Num rows: 4 Data size: 416 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 832 Basic stats: COMPLETE Column stats: PARTIAL value expressions: _col2 (type: int) Reduce Operator Tree: Group By Operator @@ -211,14 +211,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: bigint) mode: mergepartial outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 208 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 832 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: int) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 2 Data size: 208 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 832 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 208 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 832 Basic stats: COMPLETE Column stats: PARTIAL table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -852,14 +852,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: bigint) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 2 Data size: 172 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 4 Data size: 344 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: _col0 (type: string), _col1 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 2 Data size: 172 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 4 Data size: 344 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 172 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 4 Data size: 344 Basic stats: COMPLETE Column stats: PARTIAL table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat diff --git ql/src/test/results/clientpositive/annotate_stats_groupby2.q.out ql/src/test/results/clientpositive/annotate_stats_groupby2.q.out index f991191..2f85c92 100644 --- ql/src/test/results/clientpositive/annotate_stats_groupby2.q.out +++ ql/src/test/results/clientpositive/annotate_stats_groupby2.q.out @@ -274,25 +274,25 @@ STAGE PLANS: keys: state (type: string), votes (type: bigint) mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 10 Data size: 860 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 688 Basic stats: COMPLETE Column stats: PARTIAL Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: bigint) sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint) - Statistics: Num rows: 10 Data size: 860 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 688 Basic stats: COMPLETE Column stats: PARTIAL Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: bigint) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 5 Data size: 430 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 2 Data size: 172 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: _col0 (type: string), _col1 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 5 Data size: 430 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 2 Data size: 172 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator compressed: false - Statistics: Num rows: 5 Data size: 430 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 2 Data size: 172 Basic stats: COMPLETE Column stats: PARTIAL table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat diff --git ql/src/test/results/clientpositive/lateral_view.q.out ql/src/test/results/clientpositive/lateral_view.q.out index 8b87c37..dafbf64 100644 --- ql/src/test/results/clientpositive/lateral_view.q.out +++ ql/src/test/results/clientpositive/lateral_view.q.out @@ -139,13 +139,13 @@ STAGE PLANS: Select Operator expressions: _col5 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 1000 Data size: 28000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE Limit Number of rows: 3 - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -163,13 +163,13 @@ STAGE PLANS: Select Operator expressions: _col5 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 1000 Data size: 28000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE Limit Number of rows: 3 - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -208,10 +208,10 @@ STAGE PLANS: Select Operator expressions: _col5 (type: int) outputColumnNames: _col5 - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -235,7 +235,7 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -265,10 +265,10 @@ STAGE PLANS: Select Operator expressions: _col5 (type: int) outputColumnNames: _col5 - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -292,7 +292,7 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col5, _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col5 (type: int), _col6 (type: string) outputColumnNames: _col0, _col1 @@ -339,10 +339,10 @@ STAGE PLANS: Lateral View Forward Statistics: Num rows: 1000 Data size: 24000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 @@ -366,7 +366,7 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 @@ -394,10 +394,10 @@ STAGE PLANS: Lateral View Forward Statistics: Num rows: 1000 Data size: 24000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 @@ -421,7 +421,7 @@ STAGE PLANS: function name: explode Lateral View Join Operator outputColumnNames: _col6 - Statistics: Num rows: 2000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 2000 Data size: 268000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col6 (type: int) outputColumnNames: _col0 diff --git ql/src/test/results/clientpositive/lateral_view_noalias.q.out ql/src/test/results/clientpositive/lateral_view_noalias.q.out index 9b6e9c4..b2a22f1 100644 --- ql/src/test/results/clientpositive/lateral_view_noalias.q.out +++ ql/src/test/results/clientpositive/lateral_view_noalias.q.out @@ -25,13 +25,13 @@ STAGE PLANS: Select Operator expressions: _col5 (type: string), _col6 (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 1000 Data size: 192000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE Limit Number of rows: 2 - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -49,13 +49,13 @@ STAGE PLANS: Select Operator expressions: _col5 (type: string), _col6 (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 1000 Data size: 192000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE Limit Number of rows: 2 - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat