diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java index 0652b9d..47eef3b 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.EnvironmentContext; @@ -237,9 +238,8 @@ public void alterTable(RawStore msdb, Warehouse wh, String dbname, // alterPartition() MetaStoreUtils.updateTableStatsFast(db, newt, wh, false, true, environmentContext); } - updateTableColumnStatsForAlterTable(msdb, oldt, newt); - // now finally call alter table - msdb.alterTable(dbname, name, newt); + + alterTableUpdateTableColumnStats(msdb, oldt, newt); // commit the changes success = msdb.commitTransaction(); } catch (InvalidObjectException e) { @@ -644,48 +644,73 @@ private void updatePartColumnStats(RawStore msdb, String dbName, String tableNam } } - private void updateTableColumnStatsForAlterTable(RawStore msdb, Table oldTable, Table newTable) + private void alterTableUpdateTableColumnStats(RawStore msdb, + Table oldTable, Table newTable) throws MetaException, InvalidObjectException { - String dbName = oldTable.getDbName(); - String tableName = oldTable.getTableName(); + String dbName = HiveStringUtils.normalizeIdentifier(oldTable.getDbName()); + String tableName = HiveStringUtils.normalizeIdentifier(oldTable.getTableName()); String newDbName = HiveStringUtils.normalizeIdentifier(newTable.getDbName()); String newTableName = HiveStringUtils.normalizeIdentifier(newTable.getTableName()); try { - if (!dbName.equals(newDbName) || !tableName.equals(newTableName)) { - msdb.deleteTableColumnStatistics(dbName, tableName, null); - } else { - List oldCols = oldTable.getSd().getCols(); - List newCols = newTable.getSd().getCols(); - if (!MetaStoreUtils.areSameColumns(oldCols, newCols)) { + List oldCols = oldTable.getSd().getCols(); + List newCols = newTable.getSd().getCols(); + List newStatsObjs = new ArrayList(); + ColumnStatistics colStats = null; + boolean updateColumnStats = true; + + // Nothing to update if everything is the same + if (newDbName.equals(dbName) && + newTableName.equals(tableName) && + MetaStoreUtils.areSameColumns(oldCols, newCols)) { + updateColumnStats = false; + } + + if (updateColumnStats) { List oldColNames = new ArrayList(oldCols.size()); for (FieldSchema oldCol : oldCols) { oldColNames.add(oldCol.getName()); } - ColumnStatistics cs = msdb.getTableColumnStatistics(dbName, tableName, oldColNames); - if (cs == null) { - return; - } + // Collect column stats which need to be rewritten and remove old stats + colStats = msdb.getTableColumnStatistics(dbName, tableName, oldColNames); + if (colStats == null) { + updateColumnStats = false; + } else { + List statsObjs = colStats.getStatsObj(); + if (statsObjs != null) { + for (ColumnStatisticsObj statsObj : statsObjs) { + boolean found = false; + for (FieldSchema newCol : newCols) { + if (statsObj.getColName().equalsIgnoreCase(newCol.getName()) + && statsObj.getColType().equals(newCol.getType())) { + found = true; + break; + } + } - List statsObjs = cs.getStatsObj(); - if (statsObjs != null) { - for (ColumnStatisticsObj statsObj : statsObjs) { - boolean found = false; - for (FieldSchema newCol : newCols) { - if (statsObj.getColName().equalsIgnoreCase(newCol.getName()) - && statsObj.getColType().equals(newCol.getType())) { - found = true; - break; + if (found) { + if (!newDbName.equals(dbName) || !newTableName.equals(tableName)) { + msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName()); + newStatsObjs.add(statsObj); + } + } else { + msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName()); } } - if (!found) { - msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName()); - } } } } - } + + // Change to new table and append stats for the new table + msdb.alterTable(dbName, tableName, newTable); + if (updateColumnStats && !newStatsObjs.isEmpty()) { + ColumnStatisticsDesc statsDesc = colStats.getStatsDesc(); + statsDesc.setDbName(newDbName); + statsDesc.setTableName(newTableName); + colStats.setStatsObj(newStatsObjs); + msdb.updateTableColumnStatistics(colStats); + } } catch (NoSuchObjectException nsoe) { LOG.debug("Could not find db entry." + nsoe); } catch (InvalidInputException e) { diff --git a/ql/src/test/queries/clientpositive/rename_table_update_column_stats.q b/ql/src/test/queries/clientpositive/rename_table_update_column_stats.q new file mode 100644 index 0000000..8e5766c --- /dev/null +++ b/ql/src/test/queries/clientpositive/rename_table_update_column_stats.q @@ -0,0 +1,55 @@ +set hive.mapred.mode=nonstrict; +set hive.metastore.try.direct.sql=true; + +drop database if exists statsdb1; +create database statsdb1; +drop database if exists statsdb2; +create database statsdb2; + +create table statsdb1.testtable1 (col1 int, col2 string, col3 string); +insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10; + +use statsdb1; + +analyze table testtable1 compute statistics for columns; + +describe formatted statsdb1.testtable1 col1; +describe formatted statsdb1.testtable1 col2; +describe formatted statsdb1.testtable1 col3; + +alter table statsdb1.testtable1 rename to statsdb2.testtable2; +describe formatted statsdb2.testtable2 col1; +describe formatted statsdb2.testtable2 col2; +describe formatted statsdb2.testtable2 col3; + +use default; +drop database statsdb1 cascade; +drop database statsdb2 cascade; + + +set hive.metastore.try.direct.sql=false; + +drop database if exists statsdb1; +create database statsdb1; +drop database if exists statsdb2; +create database statsdb2; + +create table statsdb1.testtable1 (col1 int, col2 string, col3 string); +insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10; + +use statsdb1; + +analyze table testtable1 compute statistics for columns; + +describe formatted statsdb1.testtable1 col1; +describe formatted statsdb1.testtable1 col2; +describe formatted statsdb1.testtable1 col3; + +alter table statsdb1.testtable1 rename to statsdb2.testtable2; +describe formatted statsdb2.testtable2 col1; +describe formatted statsdb2.testtable2 col2; +describe formatted statsdb2.testtable2 col3; + +use default; +drop database statsdb1 cascade; +drop database statsdb2 cascade; diff --git a/ql/src/test/results/clientpositive/rename_table_update_column_stats.q.out b/ql/src/test/results/clientpositive/rename_table_update_column_stats.q.out new file mode 100644 index 0000000..c025cfa --- /dev/null +++ b/ql/src/test/results/clientpositive/rename_table_update_column_stats.q.out @@ -0,0 +1,280 @@ +PREHOOK: query: drop database if exists statsdb1 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb1 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb1 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: create database statsdb1 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database if exists statsdb2 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb2 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb2 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb2 +POSTHOOK: query: create database statsdb2 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb2 +PREHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:statsdb1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:statsdb1 +POSTHOOK: Output: statsdb1@testtable1 +PREHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Lineage: testtable1.col1 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col3 SIMPLE [] +PREHOOK: query: use statsdb1 +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:statsdb1 +POSTHOOK: query: use statsdb1 +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:statsdb1 +PREHOOK: query: analyze table testtable1 compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table testtable1 compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +PREHOOK: query: describe formatted statsdb1.testtable1 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +PREHOOK: type: ALTERTABLE_RENAME +PREHOOK: Input: statsdb1@testtable1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +POSTHOOK: type: ALTERTABLE_RENAME +POSTHOOK: Input: statsdb1@testtable1 +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Output: statsdb2@testtable2 +PREHOOK: query: describe formatted statsdb2.testtable2 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: use default +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:default +POSTHOOK: query: use default +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:default +PREHOOK: query: drop database statsdb1 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb1 +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: drop database statsdb1 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb1 +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database statsdb2 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb2 +PREHOOK: Output: database:statsdb2 +PREHOOK: Output: statsdb2@testtable2 +POSTHOOK: query: drop database statsdb2 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb2 +POSTHOOK: Output: database:statsdb2 +POSTHOOK: Output: statsdb2@testtable2 +PREHOOK: query: drop database if exists statsdb1 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb1 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb1 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: create database statsdb1 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database if exists statsdb2 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb2 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb2 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb2 +POSTHOOK: query: create database statsdb2 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb2 +PREHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:statsdb1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:statsdb1 +POSTHOOK: Output: statsdb1@testtable1 +PREHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Lineage: testtable1.col1 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col3 SIMPLE [] +PREHOOK: query: use statsdb1 +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:statsdb1 +POSTHOOK: query: use statsdb1 +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:statsdb1 +PREHOOK: query: analyze table testtable1 compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table testtable1 compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +PREHOOK: query: describe formatted statsdb1.testtable1 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +PREHOOK: type: ALTERTABLE_RENAME +PREHOOK: Input: statsdb1@testtable1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +POSTHOOK: type: ALTERTABLE_RENAME +POSTHOOK: Input: statsdb1@testtable1 +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Output: statsdb2@testtable2 +PREHOOK: query: describe formatted statsdb2.testtable2 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: use default +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:default +POSTHOOK: query: use default +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:default +PREHOOK: query: drop database statsdb1 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb1 +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: drop database statsdb1 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb1 +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database statsdb2 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb2 +PREHOOK: Output: database:statsdb2 +PREHOOK: Output: statsdb2@testtable2 +POSTHOOK: query: drop database statsdb2 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb2 +POSTHOOK: Output: database:statsdb2 +POSTHOOK: Output: statsdb2@testtable2