diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 84ee78f..e05513b 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -652,14 +652,11 @@ HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10), // to accurately compute statistics for GROUPBY map side parallelism needs to be known HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1), - // statistics annotation fetches column statistics for all required columns and for all - // required partitions which can be very expensive sometimes + // statistics annotation fetches column statistics for all required columns which can + // be very expensive sometimes HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false), - // in the absence of table/partition stats, average row size will be used to - // estimate the number of rows/data size - HIVE_STATS_AVG_ROW_SIZE("hive.stats.avg.row.size", 10000), // in the absence of column statistics, the estimated number of rows/data size that will - // emitted from join operator will depend on t factor + // be emitted from join operator will depend on this factor HIVE_STATS_JOIN_FACTOR("hive.stats.join.factor", (float) 1.1), // in the absence of uncompressed/raw data size, total file size will be used for statistics // annotation. But the file may be compressed, encoded and serialized which may be lesser in size diff --git conf/hive-default.xml.template conf/hive-default.xml.template index 66d22f9..bc66a6a 100644 --- conf/hive-default.xml.template +++ conf/hive-default.xml.template @@ -1322,6 +1322,102 @@ + hive.stats.max.variable.length + 100 + + To estimate the size of data flowing through operators in Hive/Tez(for reducer estimation etc.), + average row size is multiplied with the total number of rows coming out of each operator. 
+ Average row size is computed from average column size of all columns in the row. In the absence + of column statistics, for variable length columns (like string, bytes etc.), this value will be + used. For fixed length columns their corresponding Java equivalent sizes are used + (float - 4 bytes, double - 8 bytes etc.). + + + + + hive.stats.list.num.entries + 10 + + To estimate the size of data flowing through operators in Hive/Tez (for reducer estimation etc.), + average row size is multiplied with the total number of rows coming out of each operator. + Average row size is computed from average column size of all columns in the row. In the absence + of column statistics and for variable length complex columns like list, the average number of + entries/values can be specified using this config. + + + + + hive.stats.map.num.entries + 10 + + To estimate the size of data flowing through operators in Hive/Tez (for reducer estimation etc.), + average row size is multiplied with the total number of rows coming out of each operator. + Average row size is computed from average column size of all columns in the row. In the absence + of column statistics and for variable length complex columns like map, the average number of + entries/values can be specified using this config. + + + + + hive.stats.map.parallelism + 1 + + Hive/Tez optimizer estimates the data size flowing through each of the operators. + For GROUPBY operator, to accurately compute the data size map-side parallelism needs to + be known. By default, this value is set to 1 since optimizer is not aware of the number of + mappers during compile-time. This Hive config can be used to specify the number of mappers + to be used for data size computation of GROUPBY operator. + + + + + hive.stats.fetch.column.stats + false + + Annotation of operator tree with statistics information requires column statistics. + Column statistics are fetched from metastore. 
+ Fetching column statistics for each needed column + can be expensive when the number of columns is high. This flag can be used to disable fetching + of column statistics from metastore. + + + + + hive.stats.fetch.partition.stats + true + + Annotation of operator tree with statistics information requires partition level basic + statistics like number of rows, data size and file size. Partition statistics are fetched from + metastore. Fetching partition statistics for each needed partition can be expensive when the + number of partitions is high. This flag can be used to disable fetching of partition statistics + from metastore. When this flag is disabled, Hive will make calls to filesystem to get file sizes + and will estimate the number of rows from row schema. + + + + + hive.stats.join.factor + 1.1 + + Hive/Tez optimizer estimates the data size flowing through each of the operators. JOIN operator + uses column statistics to estimate the number of rows flowing out of it and hence the data size. + In the absence of column statistics, this factor determines the number of rows that flow out + of JOIN operator. + + + + + hive.stats.deserialization.factor + 1.0 + + Hive/Tez optimizer estimates the data size flowing through each of the operators. In the absence + of basic statistics like number of rows and data size, file size is used to estimate the number + of rows and data size. Since files in tables/partitions are serialized (and optionally + compressed) the estimates of number of rows and data size cannot be reliably determined. + This factor is multiplied with the file size to account for serialization and compression. + + + + hive.support.concurrency false Whether Hive supports concurrency or not. A ZooKeeper instance must be up and running for the default Hive lock manager to support read-write locks. 
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index d03a760..069f7ae 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -988,9 +988,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, if (limit <= parentStats.getNumRows()) { long numRows = limit; long avgRowSize = parentStats.getAvgRowSize(); - if (avgRowSize <= 0) { - avgRowSize = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_AVG_ROW_SIZE); - } long dataSize = avgRowSize * limit; wcStats.setNumRows(numRows); wcStats.setDataSize(dataSize);