diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index b51f7a8..4c79275 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -38,6 +38,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; @@ -114,6 +115,7 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa // column level statistics are required only for the columns that are needed List schema = tableScanOperator.getSchema().getSignature(); List neededColumns = tableScanOperator.getNeededColumns(); + List referencedColumns = tableScanOperator.getReferencedColumns(); boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS); boolean fetchPartStats = @@ -207,7 +209,6 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa stats.getBasicStatsState().equals(State.COMPLETE)) { stats.setBasicStatsState(State.PARTIAL); } - boolean haveFullStats = fetchColStats; if (fetchColStats) { List partNames = new ArrayList(partList.getNotDeniedPartns().size()); for (Partition part : partList.getNotDeniedPartns()) { @@ -215,37 +216,89 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa } Map colToTabAlias = new HashMap(); neededColumns = processNeededColumns(schema, neededColumns, colToTabAlias); - AggrStats aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColumns, partNames); + AggrStats aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), + table.getTableName(), neededColumns, partNames); if (null == aggrStats) { - haveFullStats = false; + // There are some partitions with no state (or we didn't fetch any state). + // Update the stats with empty list to reflect that in the + // state/initialize structures. + List emptyStats = Lists.newArrayList(); + + // add partition column stats + addParitionColumnStats(neededColumns, referencedColumns, schema, + table, partList, emptyStats); + + stats.addToColumnStats(emptyStats); + stats.updateColumnStatsState( + deriveStatType(emptyStats, referencedColumns)); } else { List colStats = aggrStats.getColStats(); if (colStats.size() != neededColumns.size()) { - LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to retrieve" + LOG.debug("Column stats requested for : " + neededColumns.size() + + " columns. Able to retrieve" + " for " + colStats.size() + " columns"); } - List columnStats = convertColStats(colStats, table.getTableName(), colToTabAlias); + List columnStats = convertColStats(colStats, + table.getTableName(), colToTabAlias); + + addParitionColumnStats(neededColumns, referencedColumns, schema, + table, partList, columnStats); + stats.addToColumnStats(columnStats); - State colState = deriveStatType(columnStats, neededColumns); - if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) { - LOG.debug("Column stats requested for : " + partNames.size() +" partitions. " - + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions"); + State colState = deriveStatType(columnStats, referencedColumns); + if (aggrStats.getPartsFound() != partNames.size() && + colState != State.NONE) { + LOG.debug("Column stats requested for : " + partNames.size() + + " partitions. " + + "Able to retrieve for " + aggrStats.getPartsFound() + + " partitions"); colState = State.PARTIAL; } stats.setColumnStatsState(colState); } } - // There are some partitions with no state (or we didn't fetch any state). - // Update the stats with empty list to reflect that in the state/initialize structures. - if (!haveFullStats) { - List emptyStats = Lists.newArrayList(); - stats.addToColumnStats(emptyStats); - stats.updateColumnStatsState(deriveStatType(emptyStats, neededColumns)); - } } return stats; } + private static void addParitionColumnStats(List neededColumns, + List referencedColumns, List schema, Table table, + PrunedPartitionList partList, List colStats) + throws HiveException { + + // extra columns is difference between referenced columns vs needed + // columns. The difference could be partition columns. + List extraCols = Lists.newArrayList(referencedColumns); + if (referencedColumns.size() > neededColumns.size()) { + extraCols.removeAll(neededColumns); + for (String col : extraCols) { + for (ColumnInfo ci : schema) { + // conditions for being partition column + if (col.equals(ci.getInternalName()) && ci.getIsVirtualCol() && + !ci.isHiddenVirtualCol()) { + // currently metastore does not store column stats for + // partition column, so we calculate the NDV from pruned + // partition list + ColStatistics partCS = new ColStatistics(table.getTableName(), + ci.getInternalName(), ci.getType().getTypeName()); + long numPartitions = getNDVPartitionColumn(partList.getPartitions(), + ci.getInternalName()); + partCS.setCountDistint(numPartitions); + colStats.add(partCS); + } + } + } + } + } + + public static int getNDVPartitionColumn(Set partitions, String partColName) { + Set distinctVals = new HashSet(partitions.size()); + for (Partition partition : partitions) { + distinctVals.add(partition.getSpec().get(partColName)); + } + return distinctVals.size(); + } + private static void setUnknownRcDsToAverage( List rowCounts, List dataSizes, int avgRowSize) { if (LOG.isDebugEnabled()) { diff --git ql/src/test/queries/clientpositive/annotate_stats_part.q ql/src/test/queries/clientpositive/annotate_stats_part.q index f25776a..fcfe566 100644 --- ql/src/test/queries/clientpositive/annotate_stats_part.q +++ ql/src/test/queries/clientpositive/annotate_stats_part.q @@ -65,6 +65,9 @@ explain select zip from loc_orc; -- basicStatState: COMPLETE colStatState: PARTIAL explain select state from loc_orc; +-- basicStatState: COMPLETE colStatState: COMPLETE +explain select year from loc_orc; + -- column statistics for __HIVE_DEFAULT_PARTITION__ is not supported yet. Hence colStatState reports PARTIAL -- basicStatState: COMPLETE colStatState: PARTIAL explain select state,locid from loc_orc; diff --git ql/src/test/results/clientpositive/annotate_stats_part.q.out ql/src/test/results/clientpositive/annotate_stats_part.q.out index 6262d37..2f4f9c5 100644 --- ql/src/test/results/clientpositive/annotate_stats_part.q.out +++ ql/src/test/results/clientpositive/annotate_stats_part.q.out @@ -56,11 +56,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: PARTIAL ListSink PREHOOK: query: insert overwrite table loc_orc partition(year) select * from loc_staging @@ -98,11 +98,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 5 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 5 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 5 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 5 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: -- partition level analyze statistics for specific parition @@ -135,11 +135,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 2 Data size: 325 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2 Data size: 325 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 2 Data size: 325 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2 Data size: 325 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: -- basicStatState: PARTIAL colStatState: NONE @@ -158,11 +158,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 9 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 9 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 9 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 9 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: -- basicStatState: COMPLETE colStatState: NONE @@ -181,11 +181,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 7 Data size: 399 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 7 Data size: 399 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 7 Data size: 399 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 7 Data size: 399 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: -- partition level analyze statistics for all partitions @@ -222,11 +222,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 1 Data size: 325 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 325 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 1 Data size: 325 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 325 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: -- basicStatState: COMPLETE colStatState: NONE @@ -245,11 +245,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: -- basicStatState: COMPLETE colStatState: NONE @@ -268,11 +268,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: -- both partitions will be pruned @@ -293,14 +293,14 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: PARTIAL Filter Operator predicate: ((year = '2001') and (year = '__HIVE_DEFAULT_PARTITION__')) (type: boolean) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), '__HIVE_DEFAULT_PARTITION__' (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: PARTIAL ListSink PREHOOK: query: -- partition level partial column statistics @@ -385,6 +385,41 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: -- basicStatState: COMPLETE colStatState: COMPLETE +explain select year from loc_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- basicStatState: COMPLETE colStatState: COMPLETE +explain select year from loc_orc +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 724 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: year (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 8 Data size: 1472 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 8 Data size: 1472 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + PREHOOK: query: -- column statistics for __HIVE_DEFAULT_PARTITION__ is not supported yet. Hence colStatState reports PARTIAL -- basicStatState: COMPLETE colStatState: PARTIAL explain select state,locid from loc_orc @@ -572,7 +607,7 @@ STAGE PLANS: Statistics: Num rows: 7 Data size: 399 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid > 0) (type: boolean) - Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: locid (type: int), year (type: string) outputColumnNames: _col0, _col1 @@ -608,14 +643,14 @@ STAGE PLANS: Statistics: Num rows: 7 Data size: 399 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid > 0) (type: boolean) - Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: locid (type: int), year (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat