diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java index fc6215a..8d27847 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.common.ObjectPair; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.MetaStoreUtils.UpdateTableStatsArgs; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.FieldSchema; @@ -199,7 +200,8 @@ public void alterTable(RawStore msdb, Warehouse wh, String dbname, Database db = msdb.getDatabase(newt.getDbName()); // Update table stats. For partitioned table, we update stats in // alterPartition() - MetaStoreUtils.updateUnpartitionedTableStatsFast(db, newt, wh, false, true); + MetaStoreUtils.updateUnpartitionedTableStatsFast(db, newt, wh, + new UpdateTableStatsArgs().setForceRecompute(true)); } // now finally call alter table msdb.alterTable(dbname, name, newt); @@ -291,7 +293,8 @@ public Partition alterPartition(final RawStore msdb, Warehouse wh, final String try { oldPart = msdb.getPartition(dbname, name, new_part.getValues()); if (MetaStoreUtils.requireCalStats(hiveConf, oldPart, new_part, tbl)) { - MetaStoreUtils.updatePartitionStatsFast(new_part, wh, false, true); + MetaStoreUtils.updatePartitionStatsFast(new_part, wh, + new UpdateTableStatsArgs().setForceRecompute(true)); } msdb.alterPartition(dbname, name, new_part.getValues(), new_part); } catch (InvalidObjectException e) { @@ -375,7 +378,8 @@ public Partition alterPartition(final RawStore msdb, Warehouse wh, final String } new_part.getSd().setLocation(newPartLoc); if (MetaStoreUtils.requireCalStats(hiveConf, oldPart, new_part, 
tbl)) { - MetaStoreUtils.updatePartitionStatsFast(new_part, wh, false, true); + MetaStoreUtils.updatePartitionStatsFast(new_part, wh, + new UpdateTableStatsArgs().setForceRecompute(true)); } msdb.alterPartition(dbname, name, part_vals, new_part); } @@ -441,7 +445,8 @@ public Partition alterPartition(final RawStore msdb, Warehouse wh, final String partValsList.add(tmpPart.getValues()); if (MetaStoreUtils.requireCalStats(hiveConf, oldTmpPart, tmpPart, tbl)) { - MetaStoreUtils.updatePartitionStatsFast(tmpPart, wh, false, true); + MetaStoreUtils.updatePartitionStatsFast(tmpPart, wh, + new UpdateTableStatsArgs().setForceRecompute(true)); } } msdb.alterPartitions(dbname, name, partValsList, new_parts); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java index a47619c..d6ca3bc 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java @@ -64,6 +64,7 @@ import org.apache.hadoop.hive.common.metrics.Metrics; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.MetaStoreUtils.UpdateTableStatsArgs; import org.apache.hadoop.hive.metastore.api.AbortTxnRequest; import org.apache.hadoop.hive.metastore.api.AddPartitionsRequest; import org.apache.hadoop.hive.metastore.api.AddPartitionsResult; @@ -1363,11 +1364,13 @@ private void create_table_core(final RawStore ms, final Table tbl, } if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVESTATSAUTOGATHER) && !MetaStoreUtils.isView(tbl)) { + UpdateTableStatsArgs args = new UpdateTableStatsArgs(); if (tbl.getPartitionKeysSize() == 0) { // Unpartitioned table - MetaStoreUtils.updateUnpartitionedTableStatsFast(db, tbl, wh, madeDir); + args.setMadeDir(madeDir); } else { // Partitioned table with no partitions. 
- MetaStoreUtils.updateUnpartitionedTableStatsFast(db, tbl, wh, true); + args.setMadeDir(true); } + MetaStoreUtils.updateUnpartitionedTableStatsFast(db, tbl, wh, args); } // set create time @@ -1887,7 +1890,9 @@ private Partition append_partition_common(RawStore ms, String dbName, String tab if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVESTATSAUTOGATHER) && !MetaStoreUtils.isView(tbl)) { - MetaStoreUtils.updatePartitionStatsFast(part, wh, madeDir); + UpdateTableStatsArgs args = new UpdateTableStatsArgs(); + args.setMadeDir(madeDir); + MetaStoreUtils.updatePartitionStatsFast(part, wh, args); } success = ms.addPartition(part); @@ -2309,7 +2314,8 @@ private void initializeAddedPartition( final Table tbl, final PartitionSpecProxy.PartitionIterator part, boolean madeDir) throws MetaException { if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVESTATSAUTOGATHER) && !MetaStoreUtils.isView(tbl)) { - MetaStoreUtils.updatePartitionStatsFast(part, wh, madeDir, false); + MetaStoreUtils.updatePartitionStatsFast(part, wh, + new UpdateTableStatsArgs().setMadeDir(madeDir)); } // set create time diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index 1ac5aff..5b5b654 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -161,15 +161,39 @@ public static boolean containsAllFastStats(Map partParams) { return true; } - public static boolean updateUnpartitionedTableStatsFast(Database db, Table tbl, Warehouse wh, - boolean madeDir) throws MetaException { - return updateUnpartitionedTableStatsFast(db, tbl, wh, madeDir, false); + public static class UpdateTableStatsArgs { + private boolean madeDir, forceRecompute, alterTableSetTblProperties; + + /** + * @param madeDir if true, the directory was just created and can be assumed to be empty
+ */ + public UpdateTableStatsArgs setMadeDir(boolean madeDir) { + this.madeDir = madeDir; + return this; + } + + /** + * @param forceRecompute Recompute stats even if the passed Partition already has + * these parameters set + */ + public UpdateTableStatsArgs setForceRecompute(boolean forceRecompute) { + this.forceRecompute = forceRecompute; + return this; + } + + /** + * @param alterTableSetTblProperties true if the user is setting table or partition statistics manually + */ + public UpdateTableStatsArgs setAlterTableSetTblProperties(boolean alterTableSetTblProperties) { + this.alterTableSetTblProperties = alterTableSetTblProperties; + return this; + } } public static boolean updateUnpartitionedTableStatsFast(Database db, Table tbl, Warehouse wh, - boolean madeDir, boolean forceRecompute) throws MetaException { + UpdateTableStatsArgs args) throws MetaException { return updateUnpartitionedTableStatsFast(tbl, - wh.getFileStatusesForUnpartitionedTable(db, tbl), madeDir, forceRecompute); + wh.getFileStatusesForUnpartitionedTable(db, tbl), args); } /** @@ -182,27 +206,29 @@ public static boolean updateUnpartitionedTableStatsFast(Database db, Table tbl, * these parameters set * @return true if the stats were updated, false otherwise */ - public static boolean updateUnpartitionedTableStatsFast(Table tbl, - FileStatus[] fileStatus, boolean newDir, boolean forceRecompute) throws MetaException { - + public static boolean updateUnpartitionedTableStatsFast(Table tbl, FileStatus[] fileStatus, + UpdateTableStatsArgs args) + throws MetaException { Map params = tbl.getParameters(); boolean updated = false; - if (forceRecompute || + if (args.forceRecompute || params == null || !containsAllFastStats(params)) { if (params == null) { params = new HashMap(); } - if (!newDir) { + if (!args.madeDir) { // The table location already exists and may contain data. // Let's try to populate those stats that don't require full scan. 
LOG.info("Updating table stats fast for " + tbl.getTableName()); populateQuickStats(fileStatus, params); LOG.info("Updated size of table " + tbl.getTableName() +" to "+ params.get(StatsSetupConst.TOTAL_SIZE)); if(!params.containsKey(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK)) { - // invalidate stats requiring scan since this is a regular ddl alter case - for (String stat : StatsSetupConst.statsRequireCompute) { - params.put(stat, "-1"); + if (!args.alterTableSetTblProperties) { + // invalidate stats requiring scan since this is a regular ddl alter case + for (String stat : StatsSetupConst.statsRequireCompute) { + params.put(stat, "-1"); + } } params.put(StatsSetupConst.COLUMN_STATS_ACCURATE, StatsSetupConst.FALSE); } else { @@ -267,30 +293,17 @@ public static boolean requireCalStats(Configuration hiveConf, Partition oldPart, return false; } - public static boolean updatePartitionStatsFast(Partition part, Warehouse wh) - throws MetaException { - return updatePartitionStatsFast(part, wh, false, false); - } - - public static boolean updatePartitionStatsFast(Partition part, Warehouse wh, boolean madeDir) - throws MetaException { - return updatePartitionStatsFast(part, wh, madeDir, false); - } - /** * Updates the numFiles and totalSize parameters for the passed Partition by querying * the warehouse if the passed Partition does not already have values for these parameters. 
* @param part * @param wh - * @param madeDir if true, the directory was just created and can be assumed to be empty - * @param forceRecompute Recompute stats even if the passed Partition already has - * these parameters set * @return true if the stats were updated, false otherwise */ public static boolean updatePartitionStatsFast(Partition part, Warehouse wh, - boolean madeDir, boolean forceRecompute) throws MetaException { + UpdateTableStatsArgs args) throws MetaException { return updatePartitionStatsFast(new PartitionSpecProxy.SimplePartitionWrapperIterator(part), - wh, madeDir, forceRecompute); + wh, args); } /** @@ -304,16 +317,16 @@ public static boolean updatePartitionStatsFast(Partition part, Warehouse wh, * @return true if the stats were updated, false otherwise */ public static boolean updatePartitionStatsFast(PartitionSpecProxy.PartitionIterator part, Warehouse wh, - boolean madeDir, boolean forceRecompute) throws MetaException { + UpdateTableStatsArgs args) throws MetaException { Map params = part.getParameters(); boolean updated = false; - if (forceRecompute || + if (args.forceRecompute || params == null || !containsAllFastStats(params)) { if (params == null) { params = new HashMap(); } - if (!madeDir) { + if (!args.madeDir) { // The partition location already existed and may contain data. Lets try to // populate those statistics that don't require a full scan of the data. 
LOG.warn("Updating partition stats fast for: " + part.getTableName()); @@ -321,9 +334,11 @@ public static boolean updatePartitionStatsFast(PartitionSpecProxy.PartitionItera populateQuickStats(fileStatus, params); LOG.warn("Updated size to " + params.get(StatsSetupConst.TOTAL_SIZE)); if(!params.containsKey(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK)) { - // invalidate stats requiring scan since this is a regular ddl alter case - for (String stat : StatsSetupConst.statsRequireCompute) { - params.put(stat, "-1"); + if (!args.alterTableSetTblProperties) { + // invalidate stats requiring scan since this is a regular ddl alter case + for (String stat : StatsSetupConst.statsRequireCompute) { + params.put(stat, "-1"); + } } params.put(StatsSetupConst.COLUMN_STATS_ACCURATE, StatsSetupConst.FALSE); } else { diff --git a/ql/src/test/queries/clientpositive/stats_set_stats_manually.q b/ql/src/test/queries/clientpositive/stats_set_stats_manually.q new file mode 100644 index 0000000..c2dd6e2 --- /dev/null +++ b/ql/src/test/queries/clientpositive/stats_set_stats_manually.q @@ -0,0 +1,4 @@ +create table stats1 as select * from src; +describe formatted stats1; +alter table stats1 set tblproperties ('numRows' = '12345', 'rawDataSize' = '5000'); +describe formatted stats1; diff --git a/ql/src/test/results/clientpositive/stats_set_stats_manually.q.out b/ql/src/test/results/clientpositive/stats_set_stats_manually.q.out new file mode 100644 index 0000000..f05c45d --- /dev/null +++ b/ql/src/test/results/clientpositive/stats_set_stats_manually.q.out @@ -0,0 +1,91 @@ +PREHOOK: query: create table stats1 as select * from src +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@src +PREHOOK: Output: database:default +PREHOOK: Output: default@stats1 +POSTHOOK: query: create table stats1 as select * from src +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@src +POSTHOOK: Output: database:default +POSTHOOK: Output: default@stats1 +PREHOOK: query: describe formatted 
stats1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@stats1 +POSTHOOK: query: describe formatted stats1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@stats1 +# col_name data_type comment + +key string +value string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 500 + rawDataSize 5312 + totalSize 5812 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: alter table stats1 set tblproperties ('numRows' = '12345', 'rawDataSize' = '5000') +PREHOOK: type: ALTERTABLE_PROPERTIES +PREHOOK: Input: default@stats1 +PREHOOK: Output: default@stats1 +POSTHOOK: query: alter table stats1 set tblproperties ('numRows' = '12345', 'rawDataSize' = '5000') +POSTHOOK: type: ALTERTABLE_PROPERTIES +POSTHOOK: Input: default@stats1 +POSTHOOK: Output: default@stats1 +PREHOOK: query: describe formatted stats1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@stats1 +POSTHOOK: query: describe formatted stats1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@stats1 +# col_name data_type comment + +key string +value string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false +#### A masked pattern was here #### + numFiles 1 + numRows -1 + rawDataSize -1 + totalSize 5812 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1