diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 2d0417a..42cbc14 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -19,8 +19,10 @@
 package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
 
 import java.lang.reflect.Field;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -34,7 +36,6 @@
 import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
 import org.apache.hadoop.hive.ql.exec.FilterOperator;
 import org.apache.hadoop.hive.ql.exec.GroupByOperator;
-import org.apache.hadoop.hive.ql.exec.JoinOperator;
 import org.apache.hadoop.hive.ql.exec.LimitOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.OperatorUtils;
@@ -1472,7 +1473,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       // update join statistics
       stats.setColumnStats(outColStats);
       long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom, jop);
-      updateColStats(stats, newRowCount, jop, rowCountParents);
+      updateColStats(conf, stats, newRowCount, jop, rowCountParents);
       jop.setStatistics(stats);
 
       if (isDebugEnabled) {
@@ -1766,7 +1767,7 @@ private float getSelectivityComplexTree(Operator<? extends OperatorDesc> op) {
       return result;
     }
 
-    private void updateColStats(Statistics stats, long newNumRows,
+    private void updateColStats(HiveConf conf, Statistics stats, long newNumRows,
         CommonJoinOperator<? extends JoinDesc> jop,
         Map<Integer, Long> rowCountParents) {
 
@@ -1789,7 +1790,9 @@ private void updateColStats(Statistics stats, long newNumRows,
       // stats for columns from 1st parent should be scaled down by 200/10 = 20x
       // and stats for columns from 2nd parent should be scaled down by 200x
       List<ColStatistics> colStats = stats.getColumnStats();
+      Set<String> colNameStatsAvailable = new HashSet<>();
       for (ColStatistics cs : colStats) {
+        colNameStatsAvailable.add(cs.getColumnName());
         int pos = jop.getConf().getReversedExprs().get(cs.getColumnName());
         long oldRowCount = rowCountParents.get(pos);
         double ratio = (double) newNumRows / (double) oldRowCount;
@@ -1811,6 +1814,17 @@ private void updateColStats(Statistics stats, long newNumRows,
       stats.setColumnStats(colStats);
       long newDataSize = StatsUtils
           .getDataSizeFromColumnStats(newNumRows, colStats);
+      // Add default size for columns for which stats were not available
+      List<String> neededColumns = new ArrayList<>();
+      for (String colName : jop.getSchema().getColumnNames()) {
+        if (!colNameStatsAvailable.contains(colName)) {
+          neededColumns.add(colName);
+        }
+      }
+      if (neededColumns.size() != 0) {
+        int restColumnsDefaultSize = StatsUtils.estimateRowSizeFromSchema(conf, jop.getSchema().getSignature(), neededColumns);
+        newDataSize = StatsUtils.safeAdd(newDataSize, StatsUtils.safeMult(restColumnsDefaultSize, newNumRows));
+      }
       stats.setDataSize(StatsUtils.getMaxIfOverflow(newDataSize));
     }
 
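The substance of the change above is the fallback arithmetic in updateColStats: columns with real statistics keep contributing their measured sizes to the join's data size, while every output column without statistics now contributes a schema-derived default instead of nothing. Below is a minimal self-contained sketch of that calculation in plain Java; the class, the helper names, and the 4-byte default are illustrative stand-ins, not Hive's actual StatsUtils/ColStatistics API.

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    public class JoinSizeSketch {

      // Overflow-safe helpers in the spirit of StatsUtils.safeAdd/safeMult
      // (assumed behavior: saturate at Long.MAX_VALUE instead of wrapping).
      static long safeAdd(long a, long b) {
        long r = a + b;
        return ((a ^ r) & (b ^ r)) < 0 ? Long.MAX_VALUE : r;
      }

      static long safeMult(long a, long b) {
        if (a == 0 || b == 0) {
          return 0;
        }
        long r = a * b;
        return r / b != a ? Long.MAX_VALUE : r;
      }

      // Data size = rows * (sum of known per-column average sizes)
      //           + rows * (a default size for each schema column lacking stats).
      static long estimateDataSize(long numRows, Map<String, Long> avgColSize,
          Set<String> schemaColumns, long defaultColSize) {
        long size = 0;
        for (long avg : avgColSize.values()) {
          size = safeAdd(size, safeMult(numRows, avg));
        }
        for (String col : schemaColumns) {
          if (!avgColSize.containsKey(col)) {
            // This is the branch the patch adds: previously these columns
            // contributed nothing to the estimate.
            size = safeAdd(size, safeMult(numRows, defaultColSize));
          }
        }
        return size;
      }

      public static void main(String[] args) {
        Map<String, Long> avgColSize = new HashMap<>();
        avgColSize.put("y", 4L);            // join key: column stats available
        Set<String> schemaColumns = new HashSet<>();
        schemaColumns.add("x");             // no stats: falls back to the default
        schemaColumns.add("y");
        // 4 join output rows, 4-byte default for the stat-less int column:
        System.out.println(estimateDataSize(4, avgColSize, schemaColumns, 4)); // 32
      }
    }

Saturating at Long.MAX_VALUE instead of wrapping mirrors why the patch routes the addition through StatsUtils.safeAdd/safeMult: an overly large but positive estimate is still usable by the planner, whereas an overflowed negative data size is not.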
diff --git ql/src/test/queries/clientpositive/stats_partial_size.q ql/src/test/queries/clientpositive/stats_partial_size.q
new file mode 100644
index 0000000..c42d351
--- /dev/null
+++ ql/src/test/queries/clientpositive/stats_partial_size.q
@@ -0,0 +1,8 @@
+set hive.stats.fetch.column.stats=true;
+
+create table sample_partitioned (x int) partitioned by (y int);
+insert into sample_partitioned partition(y=1) values (1),(2);
+create temporary table sample as select * from sample_partitioned;
+analyze table sample compute statistics for columns;
+
+explain select sample_partitioned.x from sample_partitioned, sample where sample.y = sample_partitioned.y;
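The test is constructed so that exactly one side of the join lacks column statistics: sample_partitioned is never analyzed, so its columns have no stats and show up as "Column stats: PARTIAL" below, while the temporary table sample is analyzed and comes out COMPLETE. The join output carries x from the un-analyzed side, so its size can only come from the new schema-default path. To regenerate the golden file after a stats change, the usual qtest flow should apply, along the lines of "mvn test -Dtest=TestCliDriver -Dqfile=stats_partial_size.q -Dtest.output.overwrite=true" run from itests/qtest (the exact profile flags depend on the branch).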
diff --git ql/src/test/results/clientpositive/stats_partial_size.q.out ql/src/test/results/clientpositive/stats_partial_size.q.out
new file mode 100644
index 0000000..31adec7
--- /dev/null
+++ ql/src/test/results/clientpositive/stats_partial_size.q.out
@@ -0,0 +1,100 @@
+PREHOOK: query: create table sample_partitioned (x int) partitioned by (y int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sample_partitioned
+POSTHOOK: query: create table sample_partitioned (x int) partitioned by (y int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sample_partitioned
+PREHOOK: query: insert into sample_partitioned partition(y=1) values (1),(2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@sample_partitioned@y=1
+POSTHOOK: query: insert into sample_partitioned partition(y=1) values (1),(2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@sample_partitioned@y=1
+POSTHOOK: Lineage: sample_partitioned PARTITION(y=1).x EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: create temporary table sample as select * from sample_partitioned
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sample_partitioned
+PREHOOK: Input: default@sample_partitioned@y=1
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sample
+POSTHOOK: query: create temporary table sample as select * from sample_partitioned
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sample_partitioned
+POSTHOOK: Input: default@sample_partitioned@y=1
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sample
+PREHOOK: query: analyze table sample compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sample
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table sample compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@sample
+#### A masked pattern was here ####
+PREHOOK: query: explain select sample_partitioned.x from sample_partitioned, sample where sample.y = sample_partitioned.y
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select sample_partitioned.x from sample_partitioned, sample where sample.y = sample_partitioned.y
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: sample_partitioned
+            Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: PARTIAL
+            Select Operator
+              expressions: x (type: int), y (type: int)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+              Reduce Output Operator
+                key expressions: _col1 (type: int)
+                sort order: +
+                Map-reduce partition columns: _col1 (type: int)
+                Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                value expressions: _col0 (type: int)
+          TableScan
+            alias: sample
+            Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: y is not null (type: boolean)
+              Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: y (type: int)
+                outputColumnNames: _col0
+                Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: int)
+                  Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          keys:
+            0 _col1 (type: int)
+            1 _col0 (type: int)
+          outputColumnNames: _col0
+          Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: PARTIAL
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: PARTIAL
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
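The key lines in the expected plan are the Join Operator's "Statistics: Num rows: 4 Data size: 16" together with "Column stats: PARTIAL": the join's only output column is x, which has no column statistics, yet the data size is non-zero. That is consistent with estimateRowSizeFromSchema supplying a fixed default for an int column (4 bytes under Hive's usual fixed-length sizing, an assumption here): 4 rows × 4 bytes = 16. Before this patch, columns without statistics contributed nothing, so the join's data size would have been underestimated.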