diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index ed20069..346e24c 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1532,8 +1532,6 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "The Java class (implementing the StatsAggregator interface) that is used by default if hive.stats.dbclass is custom type."), HIVE_STATS_ATOMIC("hive.stats.atomic", false, "whether to update metastore stats only if all stats are available"), - HIVE_STATS_COLLECT_RAWDATASIZE("hive.stats.collect.rawdatasize", true, - "should the raw data size be collected when analyzing tables"), CLIENT_STATS_COUNTERS("hive.client.stats.counters", "", "Subset of counters that should be of interest for hive.client.stats.publishers (when one wants to limit their publishing). \n" + "Non-display names should be used"), diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 1ab914d..d29f183 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -1139,7 +1139,6 @@ spark.query.files=add_part_multiple.q, \ stats16.q, \ stats18.q, \ stats2.q, \ - stats20.q, \ stats3.q, \ stats5.q, \ stats6.q, \ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index 3ec63ee..8bfb68a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -111,7 +111,6 @@ protected transient int maxPartitions; protected transient ListBucketingCtx lbCtx; protected transient boolean isSkewedStoredAsSubDirectories; - protected transient boolean statsCollectRawDataSize; protected transient boolean[] statsFromRecordWriter; protected transient boolean isCollectRWStats; private transient FSPaths prevFsp; @@ -360,7 +359,6 @@ protected void initializeOp(Configuration hconf) throws HiveException { } isCompressed = conf.getCompressed(); parent = Utilities.toTempPath(conf.getDirName()); - statsCollectRawDataSize = conf.isStatsCollectRawDataSize(); statsFromRecordWriter = new boolean[numFiles]; serializer = (Serializer) conf.getTableInfo().getDeserializerClass().newInstance(); serializer.initialize(hconf, conf.getTableInfo().getProperties()); @@ -732,11 +730,9 @@ public void process(Object row, int tag) throws HiveException { // of gathering stats isCollectRWStats = areAllTrue(statsFromRecordWriter); if (conf.isGatherStats() && !isCollectRWStats) { - if (statsCollectRawDataSize) { - SerDeStats stats = serializer.getSerDeStats(); - if (stats != null) { - fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize()); - } + SerDeStats stats = serializer.getSerDeStats(); + if (stats != null) { + fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize()); } fpaths.stat.addToStat(StatsSetupConst.ROW_COUNT, 1); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java index abcded4..3549143 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java @@ -88,9 +88,7 @@ private VirtualColumn(String name, TypeInfo typeInfo, boolean isHidden, ObjectIn public static List getStatsRegistry(Configuration conf) { List l = new ArrayList(); - if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_RAWDATASIZE)) { - l.add(RAWDATASIZE); - } + l.add(RAWDATASIZE); return l; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index ce52d4b..5f8bbad 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -6975,10 +6975,6 @@ protected Operator genFileSinkPlan(String dest, QB qb, Operator input) } fileSinkDesc.setLbCtx(lbCtx); - // set it in plan instead of runtime in FileSinkOperator - fileSinkDesc.setStatsCollectRawDataSize(HiveConf.getBoolVar(conf, - HiveConf.ConfVars.HIVE_STATS_COLLECT_RAWDATASIZE)); - // set the stats publishing/aggregating key prefix // the same as directory name. The directory name // can be changed in the optimizer but the key should not be changed diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java index 0064fca..07ed4fd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java @@ -88,8 +88,6 @@ private ListBucketingCtx lbCtx; private String statsTmpDir; - private boolean statsCollectRawDataSize; - // Record what type of write this is. Default is non-ACID (ie old style). private AcidUtils.Operation writeType = AcidUtils.Operation.NOT_ACID; private long txnId = 0; // transaction id for this operation @@ -153,7 +151,6 @@ public Object clone() throws CloneNotSupportedException { ret.setParentDir(parentDir); ret.setLinkedFileSinkDesc(linkedFileSinkDesc); ret.setStatsReliable(statsReliable); - ret.setStatsCollectRawDataSize(statsCollectRawDataSize); ret.setDpSortState(dpSortState); ret.setWriteType(writeType); ret.setTransactionId(txnId); @@ -416,14 +413,6 @@ public void setLinkedFileSinkDesc(List linkedFileSinkDesc) { this.linkedFileSinkDesc = linkedFileSinkDesc; } - public boolean isStatsCollectRawDataSize() { - return statsCollectRawDataSize; - } - - public void setStatsCollectRawDataSize(boolean statsCollectRawDataSize) { - this.statsCollectRawDataSize = statsCollectRawDataSize; - } - public boolean isRemovedReduceSinkBucketSort() { return removedReduceSinkBucketSort; } diff --git a/ql/src/test/queries/clientpositive/stats20.q b/ql/src/test/queries/clientpositive/stats20.q deleted file mode 100644 index 79fd2b8..0000000 --- a/ql/src/test/queries/clientpositive/stats20.q +++ /dev/null @@ -1,18 +0,0 @@ -set hive.stats.autogather=true; -set datanucleus.cache.collections=false; - -set hive.stats.collect.rawdatasize=true; -CREATE TABLE stats_partitioned(key string, value string) partitioned by (ds string); -insert overwrite table stats_partitioned partition (ds='1') -select * from src; --- rawDataSize is 5312 after config is turned on -describe formatted stats_partitioned; -describe formatted stats_partitioned partition (ds='1'); - -set hive.stats.collect.rawdatasize=false; -insert overwrite table stats_partitioned partition (ds='1') -select * from src; --- rawDataSize is 0 after config is turned off -describe formatted stats_partitioned; -describe formatted stats_partitioned partition (ds='1'); - diff --git a/ql/src/test/results/clientpositive/spark/stats20.q.out b/ql/src/test/results/clientpositive/spark/stats20.q.out deleted file mode 100644 index a824bc9..0000000 --- a/ql/src/test/results/clientpositive/spark/stats20.q.out +++ /dev/null @@ -1,184 +0,0 @@ -PREHOOK: query: CREATE TABLE stats_partitioned(key string, value string) partitioned by (ds string) -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@stats_partitioned -POSTHOOK: query: CREATE TABLE stats_partitioned(key string, value string) partitioned by (ds string) -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: database:default -POSTHOOK: Output: default@stats_partitioned -PREHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- rawDataSize is 5312 after config is turned on -describe formatted stats_partitioned -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: -- rawDataSize is 5312 after config is turned on -describe formatted stats_partitioned -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Table Information -Database: default -#### A masked pattern was here #### -Retention: 0 -#### A masked pattern was here #### -Table Type: MANAGED_TABLE -Table Parameters: -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: describe formatted stats_partitioned partition (ds='1') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: describe formatted stats_partitioned partition (ds='1') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Partition Information -Partition Value: [1] -Database: default -Table: stats_partitioned -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- rawDataSize is 0 after config is turned off -describe formatted stats_partitioned -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: -- rawDataSize is 0 after config is turned off -describe formatted stats_partitioned -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Table Information -Database: default -#### A masked pattern was here #### -Retention: 0 -#### A masked pattern was here #### -Table Type: MANAGED_TABLE -Table Parameters: -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: describe formatted stats_partitioned partition (ds='1') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: describe formatted stats_partitioned partition (ds='1') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Partition Information -Partition Value: [1] -Database: default -Table: stats_partitioned -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 diff --git a/ql/src/test/results/clientpositive/stats20.q.out b/ql/src/test/results/clientpositive/stats20.q.out deleted file mode 100644 index a824bc9..0000000 --- a/ql/src/test/results/clientpositive/stats20.q.out +++ /dev/null @@ -1,184 +0,0 @@ -PREHOOK: query: CREATE TABLE stats_partitioned(key string, value string) partitioned by (ds string) -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@stats_partitioned -POSTHOOK: query: CREATE TABLE stats_partitioned(key string, value string) partitioned by (ds string) -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: database:default -POSTHOOK: Output: default@stats_partitioned -PREHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- rawDataSize is 5312 after config is turned on -describe formatted stats_partitioned -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: -- rawDataSize is 5312 after config is turned on -describe formatted stats_partitioned -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Table Information -Database: default -#### A masked pattern was here #### -Retention: 0 -#### A masked pattern was here #### -Table Type: MANAGED_TABLE -Table Parameters: -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: describe formatted stats_partitioned partition (ds='1') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: describe formatted stats_partitioned partition (ds='1') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Partition Information -Partition Value: [1] -Database: default -Table: stats_partitioned -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: query: insert overwrite table stats_partitioned partition (ds='1') -select * from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_partitioned@ds=1 -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_partitioned PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- rawDataSize is 0 after config is turned off -describe formatted stats_partitioned -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: -- rawDataSize is 0 after config is turned off -describe formatted stats_partitioned -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Table Information -Database: default -#### A masked pattern was here #### -Retention: 0 -#### A masked pattern was here #### -Table Type: MANAGED_TABLE -Table Parameters: -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: describe formatted stats_partitioned partition (ds='1') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_partitioned -POSTHOOK: query: describe formatted stats_partitioned partition (ds='1') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_partitioned -# col_name data_type comment - -key string -value string - -# Partition Information -# col_name data_type comment - -ds string - -# Detailed Partition Information -Partition Value: [1] -Database: default -Table: stats_partitioned -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1