diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosExternalTables.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosExternalTables.java
index 72da2f1ba3..bb1263ebd5 100644
--- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosExternalTables.java
+++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosExternalTables.java
@@ -25,6 +25,7 @@
 import org.apache.hadoop.hive.metastore.InjectableBehaviourObjectStore;
 import org.apache.hadoop.hive.metastore.InjectableBehaviourObjectStore.BehaviourInjection;
 import org.apache.hadoop.hive.metastore.InjectableBehaviourObjectStore.CallerArguments;
+import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
 import org.apache.hadoop.hive.metastore.messaging.json.gzip.GzipJSONMessageEncoder;
 import org.apache.hadoop.hive.ql.ErrorMsg;
@@ -673,6 +674,46 @@ public void retryIncBootstrapExternalTablesFromDifferentDumpWithoutCleanTablesCo
         ErrorMsg.REPL_BOOTSTRAP_LOAD_PATH_NOT_VALID.getErrorCode());
   }
 
+  @Test
+  public void dynamicallyConvertManagedToExternalTable() throws Throwable {
+    List<String> dumpWithClause = Collections.singletonList(
+        "'" + HiveConf.ConfVars.REPL_INCLUDE_EXTERNAL_TABLES.varname + "'='true'"
+    );
+    List<String> loadWithClause = externalTableBasePathWithClause();
+
+    WarehouseInstance.Tuple tupleBootstrapManagedTable = primary.run("use " + primaryDbName)
+        .run("create table t1 (id int)")
+        .run("insert into table t1 values (1)")
+        .dump(primaryDbName, null, dumpWithClause);
+
+    replica.load(replicatedDbName, tupleBootstrapManagedTable.dumpLocation, loadWithClause);
+
+    Hive hiveForReplica = Hive.get(replica.hiveConf);
+    Table replicaTable = hiveForReplica.getTable(replicatedDbName + ".t1");
+    Path oldTblLoc = replicaTable.getDataLocation();
+
+    WarehouseInstance.Tuple tupleIncConvertToExternalTbl = primary.run("use " + primaryDbName)
+        .run("alter table t1 set tblproperties('EXTERNAL'='true')")
+        .dump(primaryDbName, tupleBootstrapManagedTable.lastReplicationId, dumpWithClause);
+
+    assertExternalFileInfo(Collections.singletonList("t1"),
+        new Path(tupleIncConvertToExternalTbl.dumpLocation, FILE_NAME));
+    replica.load(replicatedDbName, tupleIncConvertToExternalTbl.dumpLocation, loadWithClause)
+        .run("use " + replicatedDbName)
+        .run("select id from t1")
+        .verifyResult("1");
+
+    // Check that the table type is set correctly on the target.
+    replicaTable = hiveForReplica.getTable(replicatedDbName + ".t1");
+    assertTrue(TableType.EXTERNAL_TABLE.equals(replicaTable.getTableType()));
+
+    // Verify that the new table location is set inside the base directory for external tables.
+    assertTablePartitionLocation(primaryDbName + ".t1", replicatedDbName + ".t1");
+
+    // The old location should be removed once the table data is moved to the new location.
+    assertFalse(replica.miniDFSCluster.getFileSystem().exists(oldTblLoc));
+  }
+
   private List<String> externalTableBasePathWithClause() throws IOException, SemanticException {
     Path externalTableLocation = new Path(REPLICA_EXTERNAL_BASE);
     DistributedFileSystem fileSystem = replica.miniDFSCluster.getFileSystem();
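The test above drives the following end-to-end flow; here is a minimal HiveQL
sketch for reference, with placeholder names: src_db/tgt_db for the databases,
<last_repl_id> for the bootstrap dump's event id, and <dump_location> plus
'/replica_external_base' for the paths the WarehouseInstance harness normally
supplies. The two WITH-clause properties are the real ones the test passes via
dumpWithClause and externalTableBasePathWithClause().

    -- Source cluster: bootstrap a managed table ...
    CREATE TABLE t1 (id int);
    INSERT INTO TABLE t1 VALUES (1);
    REPL DUMP src_db WITH ('hive.repl.include.external.tables'='true');

    -- ... then convert it to external and take an incremental dump.
    ALTER TABLE t1 SET TBLPROPERTIES('EXTERNAL'='true');
    REPL DUMP src_db FROM <last_repl_id> WITH ('hive.repl.include.external.tables'='true');

    -- Target cluster: the incremental load recreates t1 as an external table
    -- under the configured base directory and deletes the old managed-table
    -- data directory (recycling it into the change manager when applicable).
    REPL LOAD tgt_db FROM '<dump_location>'
        WITH ('hive.repl.replica.external.table.base.dir'='/replica_external_base');
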
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/CreateTableOperation.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/CreateTableOperation.java
index af39c16570..250de12e48 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/CreateTableOperation.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/CreateTableOperation.java
@@ -19,10 +19,15 @@
 package org.apache.hadoop.hive.ql.ddl.table;
 
 import org.apache.commons.collections.CollectionUtils;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.FileUtils;
 import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.conf.Constants;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.ReplChangeManager;
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.ql.ddl.DDLOperation;
@@ -54,17 +59,27 @@ public int execute() throws HiveException {
     Table tbl = desc.toTable(context.getConf());
     LOG.debug("creating table {} on {}", tbl.getFullyQualifiedName(), tbl.getDataLocation());
 
-    if (desc.getReplicationSpec().isInReplicationScope() && (!desc.getReplaceMode())){
-      // if this is a replication spec, then replace-mode semantics might apply.
-      // if we're already asking for a table replacement, then we can skip this check.
-      // however, otherwise, if in replication scope, and we've not been explicitly asked
-      // to replace, we should check if the object we're looking at exists, and if so,
+    Path deleteOldDataLoc = null;
+    boolean isAutoPurge = false;
+    if (desc.getReplicationSpec().isInReplicationScope()) {
+      // If in replication scope, we should check if the object we're looking at exists, and if so,
       // trigger replace-mode semantics.
       Table existingTable = context.getDb().getTable(tbl.getDbName(), tbl.getTableName(), false);
-      if (existingTable != null){
+      if (existingTable != null) {
         if (desc.getReplicationSpec().allowEventReplacementInto(existingTable.getParameters())) {
           desc.setReplaceMode(true); // we replace existing table.
           ReplicationSpec.copyLastReplId(existingTable.getParameters(), tbl.getParameters());
+
+          // If the location of an existing managed table has changed, the old location must be deleted if it exists.
+          // This scenario occurs when a managed table is converted into an external table at the source. In that case,
+          // at the target, the table data is moved to a different location under the base directory for external tables.
+          if (existingTable.getTableType().equals(TableType.MANAGED_TABLE)
+              || existingTable.getTableType().equals(TableType.MATERIALIZED_VIEW)) {
+            if (!existingTable.getDataLocation().equals(tbl.getDataLocation())) {
+              deleteOldDataLoc = existingTable.getDataLocation();
+              isAutoPurge = "true".equalsIgnoreCase(existingTable.getProperty("auto.purge"));
+            }
+          }
         } else {
           LOG.debug("DDLTask: Create Table is skipped as table {} is newer than update", desc.getTableName());
           return 0; // no replacement, the existing table state is newer than our update.
@@ -79,6 +94,11 @@ public int execute() throws HiveException {
       createTableNonReplaceMode(tbl);
     }
 
+    // In the replication load flow, delete the old data location if one was recorded above.
+    // Also, make sure it is recycled into CM if the given DB is a source of replication.
+    if (deleteOldDataLoc != null) {
+      deleteOldDataLocation(tbl.getDbName(), tbl.getTableName(), deleteOldDataLoc, isAutoPurge);
+    }
     DDLUtils.addIfAbsentByName(new WriteEntity(tbl, WriteEntity.WriteType.DDL_NO_LOCK), context);
     return 0;
   }
@@ -134,6 +154,23 @@ private void createTableNonReplaceMode(Table tbl) throws HiveException {
     }
   }
 
+  private void deleteOldDataLocation(String dbName, String tableName, Path dataLocation, boolean isAutoPurge) {
+    try {
+      Database db = context.getDb().getDatabase(dbName);
+      FileSystem fs = dataLocation.getFileSystem(context.getConf());
+      context.getDb().cleanUpOneDirectoryForReplace(dataLocation, fs, FileUtils.HIDDEN_FILES_PATH_FILTER,
+          context.getConf(), isAutoPurge, ReplChangeManager.isSourceOfReplication(db));
+      fs.delete(dataLocation, true);
+      LOG.info("Deleted the old data location: {} for the table: {}",
+          dataLocation, dbName + "." + tableName);
+    } catch (Exception ex) {
+      // Swallow the exception, as a failure here doesn't affect the state of existing tables.
+      // The user is expected to drop this path manually, so only a warning is logged.
+      LOG.warn("Unable to delete the old data location: {} for the table: {}",
+          dataLocation, dbName + "." + tableName, ex);
+    }
+  }
+
   public static boolean doesTableNeedLocation(Table tbl) {
     // TODO: If we are ok with breaking compatibility of existing 3rd party StorageHandlers,
     // this method could be moved to the HiveStorageHandler interface.
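To sanity-check the conversion on the target warehouse by hand, a sketch
reusing the placeholder tgt_db from the note above (the exact layout of the
DESCRIBE output varies across Hive versions):

    USE tgt_db;
    -- Expect "Table Type: EXTERNAL_TABLE" and a Location under the configured
    -- external-table base directory instead of the old managed warehouse path.
    DESCRIBE FORMATTED t1;
    SELECT id FROM t1;  -- data stays readable from the new location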