diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupAdminImpl.java hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupAdminImpl.java index 30cabfd..03e9b89 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupAdminImpl.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupAdminImpl.java @@ -97,21 +97,72 @@ public class BackupAdminImpl implements BackupAdmin { int totalDeleted = 0; Map> allTablesMap = new HashMap>(); + boolean deleteSessionStarted = false; + boolean snapshotDone = false; try (final BackupSystemTable sysTable = new BackupSystemTable(conn)) { - for (int i = 0; i < backupIds.length; i++) { - BackupInfo info = sysTable.readBackupInfo(backupIds[i]); - if (info != null) { - String rootDir = info.getBackupRootDir(); - HashSet allTables = allTablesMap.get(rootDir); - if (allTables == null) { - allTables = new HashSet(); - allTablesMap.put(rootDir, allTables); + + // Step 1: Make sure there is no active session + // is running by using startBackupSession API + // If there is an active session in progress, exception will be thrown + try { + sysTable.startBackupSession(); + deleteSessionStarted = true; + } catch (IOException e) { + LOG.warn("You can not run delete command while active backup session is in progress. \n" + + "If there is no active backup session running, run backup repair utility to restore \n" + +"backup system integrity."); + return -1; + } + + // Step 2: Make sure there is no failed session + List list = sysTable.getBackupInfos(BackupState.RUNNING); + if (list.size() != 0) { + // ailed sessions found + LOG.warn("Failed backup session found. 
Run backup repair tool first."); + return -1; + } + + // Step 3: Record delete session + sysTable.startDeleteOperation(backupIds); + // Step 4: Snapshot backup system table + BackupSystemTable.snapshot(conn); + snapshotDone = true; + try { + for (int i = 0; i < backupIds.length; i++) { + BackupInfo info = sysTable.readBackupInfo(backupIds[i]); + if (info != null) { + String rootDir = info.getBackupRootDir(); + HashSet allTables = allTablesMap.get(rootDir); + if (allTables == null) { + allTables = new HashSet(); + allTablesMap.put(rootDir, allTables); + } + allTables.addAll(info.getTableNames()); + totalDeleted += deleteBackup(backupIds[i], sysTable); } - allTables.addAll(info.getTableNames()); - totalDeleted += deleteBackup(backupIds[i], sysTable); + } + finalizeDelete(allTablesMap, sysTable); + // Finish + sysTable.finishDeleteOperation(); + // delete snapshot + BackupSystemTable.deleteSnapshot(conn); + } catch (IOException e) { + // Fail delete operation + // Step 1 + if (snapshotDone) { + BackupSystemTable.restoreFromSnapshot(conn); + // delete snapshot + BackupSystemTable.deleteSnapshot(conn); + } + // We still have record with unfinished delete operation + LOG.error("Delete operation failed, please run backup repair utility to restore "+ + "backup system integrity", e); + throw e; + } finally { + if (deleteSessionStarted) { + sysTable.finishBackupSession(); } } - finalizeDelete(allTablesMap, sysTable); } return totalDeleted; } @@ -169,6 +220,7 @@ public class BackupAdminImpl implements BackupAdmin { int totalDeleted = 0; if (backupInfo != null) { LOG.info("Deleting backup " + backupInfo.getBackupId() + " ..."); + // Step 1: clean up data for backup session (idempotent) BackupUtils.cleanupBackupData(backupInfo, conn.getConfiguration()); // List of tables in this backup; List tables = backupInfo.getTableNames(); @@ -179,7 +231,7 @@ public class BackupAdminImpl implements BackupAdmin { continue; } // else - List affectedBackups = 
getAffectedBackupInfos(backupInfo, tn, sysTable); + List affectedBackups = getAffectedBackupSessions(backupInfo, tn, sysTable); for (BackupInfo info : affectedBackups) { if (info.equals(backupInfo)) { continue; @@ -189,7 +241,7 @@ public class BackupAdminImpl implements BackupAdmin { } Map map = sysTable.readBulkLoadedFiles(backupId); FileSystem fs = FileSystem.get(conn.getConfiguration()); - boolean succ = true; + boolean success = true; int numDeleted = 0; for (String f : map.values()) { Path p = new Path(f); @@ -198,20 +250,20 @@ public class BackupAdminImpl implements BackupAdmin { if (!fs.delete(p)) { if (fs.exists(p)) { LOG.warn(f + " was not deleted"); - succ = false; + success = false; } } else { numDeleted++; } } catch (IOException ioe) { LOG.warn(f + " was not deleted", ioe); - succ = false; + success = false; } } if (LOG.isDebugEnabled()) { LOG.debug(numDeleted + " bulk loaded files out of " + map.size() + " were deleted"); } - if (succ) { + if (success) { sysTable.deleteBulkLoadedFiles(map); } @@ -236,17 +288,18 @@ public class BackupAdminImpl implements BackupAdmin { LOG.debug("Delete backup info " + info.getBackupId()); sysTable.deleteBackupInfo(info.getBackupId()); + // Idempotent operation BackupUtils.cleanupBackupData(info, conn.getConfiguration()); } else { info.setTables(tables); sysTable.updateBackupInfo(info); - // Now, clean up directory for table + // Now, clean up directory for table (idempotent) cleanupBackupDir(info, tn, conn.getConfiguration()); } } } - private List getAffectedBackupInfos(BackupInfo backupInfo, TableName tn, + private List getAffectedBackupSessions(BackupInfo backupInfo, TableName tn, BackupSystemTable table) throws IOException { LOG.debug("GetAffectedBackupInfos for: " + backupInfo.getBackupId() + " table=" + tn); long ts = backupInfo.getStartTs(); diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java 
hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java index 56ace8d..2a5c959 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java @@ -47,6 +47,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.backup.BackupAdmin; import org.apache.hadoop.hbase.backup.BackupInfo; import org.apache.hadoop.hbase.backup.BackupInfo.BackupState; import org.apache.hadoop.hbase.backup.BackupRequest; @@ -148,6 +149,18 @@ public final class BackupCommands { } } } + if (requiresConsistentState()) { + // Check failed delete + try (BackupSystemTable table = new BackupSystemTable(conn);) { + String[] ids = table.getListOfBackupIdsFromDeleteOperation(); + + if(ids !=null && ids.length > 0) { + System.err.println("Found failed backup delete coommand. 
"); + System.err.println("Backup system recovery is required."); + throw new IOException("Failed backup delete found, aborted command execution"); + } + } + } } public void finish() throws IOException { @@ -165,6 +178,15 @@ public final class BackupCommands { protected boolean requiresNoActiveSession() { return false; } + /** + * Command requires consistent state of a backup system + * Backup system may become inconsistent because of an abnormal + * termination of a backup session or delete command + * @return true, if yes + */ + protected boolean requiresConsistentState() { + return false; + } } private BackupCommands() { @@ -224,6 +246,11 @@ public final class BackupCommands { } @Override + protected boolean requiresConsistentState() { + return true; + } + + @Override public void execute() throws IOException { if (cmdline == null || cmdline.getArgs() == null) { printUsage(); @@ -556,7 +583,9 @@ public final class BackupCommands { List list = sysTable.getBackupInfos(BackupState.RUNNING); if (list.size() == 0) { // No failed sessions found - System.out.println("REPAIR status: no failed sessions found."); + System.out.println("REPAIR status: no failed sessions found." 
+ +" Checking failed delete backup operation ..."); + repairFailedBackupDeletionIfAny(conn, sysTable); return; } backupInfo = list.get(0); @@ -583,6 +612,29 @@ public final class BackupCommands { } } + private void repairFailedBackupDeletionIfAny(Connection conn, BackupSystemTable sysTable) + throws IOException + { + String[] backupIds = sysTable.getListOfBackupIdsFromDeleteOperation(); + if (backupIds == null ||backupIds.length == 0) { + System.out.println("No failed backup delete operation found"); + // Delete backup table snapshot if exists + BackupSystemTable.deleteSnapshot(conn); + return; + } + System.out.println("Found failed delete operation for: " + StringUtils.join(backupIds)); + System.out.println("Running delete again ..."); + // Restore table from snapshot + BackupSystemTable.restoreFromSnapshot(conn); + // Finish previous failed session + sysTable.finishBackupSession(); + try(BackupAdmin admin = new BackupAdminImpl(conn);) { + admin.deleteBackups(backupIds); + } + System.out.println("Delete operation finished OK: "+ StringUtils.join(backupIds)); + + } + @Override protected void printUsage() { System.out.println(REPAIR_CMD_USAGE); diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupSystemTable.java hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupSystemTable.java index 2a0815f..616a7d3 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupSystemTable.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupSystemTable.java @@ -59,6 +59,7 @@ import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.SnapshotDescription; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.shaded.protobuf.generated.BackupProtos; import 
org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos; @@ -145,6 +146,8 @@ public final class BackupSystemTable implements Closeable { private final static String BULK_LOAD_PREFIX = "bulk:"; private final static byte[] BULK_LOAD_PREFIX_BYTES = BULK_LOAD_PREFIX.getBytes(); + private final static byte[] DELETE_OP_ROW = "delete_op_row".getBytes(); + final static byte[] TBL_COL = Bytes.toBytes("tbl"); final static byte[] FAM_COL = Bytes.toBytes("fam"); final static byte[] PATH_COL = Bytes.toBytes("path"); @@ -1602,6 +1605,65 @@ public final class BackupSystemTable implements Closeable { return puts; } + public static void snapshot(Connection conn) throws IOException { + + try (Admin admin = conn.getAdmin();){ + Configuration conf = conn.getConfiguration(); + admin.snapshot(BackupSystemTable.getSnapshotName(conf), + BackupSystemTable.getTableName(conf)); + } + } + + public static void restoreFromSnapshot(Connection conn) + throws IOException { + + Configuration conf = conn.getConfiguration(); + LOG.debug("Restoring " + BackupSystemTable.getTableNameAsString(conf) + + " from snapshot"); + try (Admin admin = conn.getAdmin();) { + String snapshotName = BackupSystemTable.getSnapshotName(conf); + if (snapshotExists(admin, snapshotName)) { + admin.disableTable(BackupSystemTable.getTableName(conf)); + admin.restoreSnapshot(snapshotName); + admin.enableTable(BackupSystemTable.getTableName(conf)); + LOG.debug("Done restoring backup system table"); + } else { + // Snapshot does not exists, i.e completeBackup failed after + // deleting backup system table snapshot + // In this case we log WARN and proceed + LOG.warn("Could not restore backup system table. 
Snapshot " + snapshotName+ + " does not exists."); + } + } + } + + protected static boolean snapshotExists(Admin admin, String snapshotName) throws IOException { + + List list = admin.listSnapshots(); + for (SnapshotDescription desc: list) { + if (desc.getName().equals(snapshotName)) { + return true; + } + } + return false; + } + + public static void deleteSnapshot(Connection conn) + throws IOException { + + Configuration conf = conn.getConfiguration(); + LOG.debug("Deleting " + BackupSystemTable.getSnapshotName(conf) + + " from the system"); + try (Admin admin = conn.getAdmin();) { + String snapshotName = BackupSystemTable.getSnapshotName(conf); + if (snapshotExists(admin, snapshotName)) { + admin.deleteSnapshot(snapshotName); + LOG.debug("Done deleting backup system table snapshot"); + } else { + LOG.error("Snapshot "+snapshotName+" does not exists"); + } + } + } /* * Creates Put's for bulk load resulting from running LoadIncrementalHFiles */ @@ -1626,6 +1688,7 @@ public final class BackupSystemTable implements Closeable { } return puts; } + public static List createDeleteForOrigBulkLoad(List lst) { List lstDels = new ArrayList<>(); for (TableName table : lst) { @@ -1636,6 +1699,68 @@ public final class BackupSystemTable implements Closeable { return lstDels; } + private Put createPutForDeleteOperation(String[] backupIdList) { + + byte[] value = Bytes.toBytes(StringUtils.join(backupIdList, ",")); + Put put = new Put(DELETE_OP_ROW); + put.addColumn(META_FAMILY, FAM_COL, value); + return put; + } + + private Delete createDeleteForBackupDeleteOperation() { + + Delete delete = new Delete(DELETE_OP_ROW); + delete.addFamily(META_FAMILY); + return delete; + } + + private Get createGetForDeleteOperation() { + + Get get = new Get(DELETE_OP_ROW); + get.addFamily(META_FAMILY); + return get; + } + + + public void startDeleteOperation(String[] backupIdList) throws IOException { + if (LOG.isTraceEnabled()) { + LOG.trace("Start delete operation for backups: " + 
StringUtils.join(backupIdList)); + } + Put put = createPutForDeleteOperation(backupIdList); + try (Table table = connection.getTable(tableName)) { + table.put(put); + } + } + + public void finishDeleteOperation() throws IOException { + if (LOG.isTraceEnabled()) { + LOG.trace("Finsih delete operation for backup ids "); + } + Delete delete = createDeleteForBackupDeleteOperation(); + try (Table table = connection.getTable(tableName)) { + table.delete(delete); + } + } + + public String[] getListOfBackupIdsFromDeleteOperation() throws IOException { + if (LOG.isTraceEnabled()) { + LOG.trace("Get delete operation for backup ids "); + } + Get get = createGetForDeleteOperation(); + try (Table table = connection.getTable(tableName)) { + Result res = table.get(get); + if (res.isEmpty()) { + return null; + } + Cell cell = res.listCells().get(0); + byte[] val = CellUtil.cloneValue(cell); + if (val.length == 0) { + return null; + } + return new String(val).split(","); + } + } + static Scan createScanForOrigBulkLoadedFiles(TableName table) throws IOException { Scan scan = new Scan(); byte[] startRow = rowkey(BULK_LOAD_PREFIX, table.toString(), BLK_LD_DELIM); diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/TableBackupClient.java hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/TableBackupClient.java index 4e1f277..96486ad 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/TableBackupClient.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/TableBackupClient.java @@ -40,7 +40,6 @@ import org.apache.hadoop.hbase.backup.impl.BackupManifest.BackupImage; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.Connection; -import org.apache.hadoop.hbase.client.SnapshotDescription; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSUtils; @@ -109,7 +108,7 @@ public 
abstract class TableBackupClient { protected void beginBackup(BackupManager backupManager, BackupInfo backupInfo) throws IOException { - snapshotBackupTable(); + BackupSystemTable.snapshot(conn); backupManager.setBackupInfo(backupInfo); // set the start timestamp of the overall backup long startTs = EnvironmentEdgeManager.currentTime(); @@ -269,69 +268,15 @@ public abstract class TableBackupClient { deleteSnapshots(conn, backupInfo, conf); cleanupExportSnapshotLog(conf); } - restoreBackupTable(conn, conf); - deleteBackupTableSnapshot(conn, conf); + BackupSystemTable.restoreFromSnapshot(conn); + BackupSystemTable.deleteSnapshot(conn); // clean up the uncompleted data at target directory if the ongoing backup has already entered // the copy phase // For incremental backup, DistCp logs will be cleaned with the targetDir. cleanupTargetDir(backupInfo, conf); } - protected void snapshotBackupTable() throws IOException { - try (Admin admin = conn.getAdmin();){ - admin.snapshot(BackupSystemTable.getSnapshotName(conf), - BackupSystemTable.getTableName(conf)); - } - } - - protected static void restoreBackupTable(Connection conn, Configuration conf) - throws IOException { - - LOG.debug("Restoring " + BackupSystemTable.getTableNameAsString(conf) + - " from snapshot"); - try (Admin admin = conn.getAdmin();) { - String snapshotName = BackupSystemTable.getSnapshotName(conf); - if (snapshotExists(admin, snapshotName)) { - admin.disableTable(BackupSystemTable.getTableName(conf)); - admin.restoreSnapshot(snapshotName); - admin.enableTable(BackupSystemTable.getTableName(conf)); - LOG.debug("Done restoring backup system table"); - } else { - // Snapshot does not exists, i.e completeBackup failed after - // deleting backup system table snapshot - // In this case we log WARN and proceed - LOG.error("Could not restore backup system table. 
Snapshot " + snapshotName+ - " does not exists."); - } - } - } - - protected static boolean snapshotExists(Admin admin, String snapshotName) throws IOException { - - List list = admin.listSnapshots(); - for (SnapshotDescription desc: list) { - if (desc.getName().equals(snapshotName)) { - return true; - } - } - return false; - } - - protected static void deleteBackupTableSnapshot(Connection conn, Configuration conf) - throws IOException { - LOG.debug("Deleting " + BackupSystemTable.getSnapshotName(conf) + - " from the system"); - try (Admin admin = conn.getAdmin();) { - String snapshotName = BackupSystemTable.getSnapshotName(conf); - if (snapshotExists(admin, snapshotName)) { - admin.deleteSnapshot(snapshotName); - LOG.debug("Done deleting backup system table snapshot"); - } else { - LOG.error("Snapshot "+snapshotName+" does not exists"); - } - } - } /** * Add manifest for the current backup. The manifest is stored within the table backup directory. @@ -457,7 +402,7 @@ public abstract class TableBackupClient { } else if (type == BackupType.INCREMENTAL) { cleanupDistCpLog(backupInfo, conf); } - deleteBackupTableSnapshot(conn, conf); + BackupSystemTable.deleteSnapshot(conn); backupManager.updateBackupInfo(backupInfo); // Finish active session diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestRepairAfterFailedDelete.java hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestRepairAfterFailedDelete.java new file mode 100644 index 0000000..9beb47b --- /dev/null +++ hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestRepairAfterFailedDelete.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.backup;

import static org.junit.Assert.assertTrue;

import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.util.ToolRunner;
import org.junit.Test;
import org.junit.experimental.categories.Category;

import com.google.common.collect.Lists;

/**
 * Verifies that the "repair" command detects and re-runs a backup delete
 * operation that failed mid-way, by emulating the failure with a manual
 * snapshot/restore of the backup system table.
 */
@Category(LargeTests.class)
public class TestRepairAfterFailedDelete extends TestBackupBase {

  private static final Log LOG = LogFactory.getLog(TestRepairAfterFailedDelete.class);

  @Test
  public void testRepairBackupDelete() throws Exception {
    LOG.info("test repair backup delete on a single table with data");
    List<TableName> tableList = Lists.newArrayList(table1);
    String backupId = fullTableBackup(tableList);
    assertTrue(checkSucceeded(backupId));
    LOG.info("backup complete");
    String[] backupIds = new String[] { backupId };
    Connection conn = TEST_UTIL.getConnection();
    // try-with-resources: the system table and Admin are closed even when an
    // assertion fails (the original leaked both on failure paths).
    try (BackupSystemTable table = new BackupSystemTable(conn);
        Admin admin = conn.getAdmin()) {
      BackupInfo info = table.readBackupInfo(backupId);
      Path path = new Path(info.getBackupRootDir(), backupId);
      FileSystem fs = FileSystem.get(path.toUri(), conf1);
      assertTrue(fs.exists(path));

      // Snapshot the backup system table before the delete so the failure
      // can be emulated afterwards.
      String snapshotName = "snapshot-backup";
      admin.snapshot(snapshotName, BackupSystemTable.getTableName(conf1));

      int deleted = getBackupAdmin().deleteBackups(backupIds);

      assertTrue(!fs.exists(path));
      assertTrue(fs.exists(new Path(info.getBackupRootDir())));
      assertTrue(1 == deleted);

      // Emulate a delete failure: restore the system table from the
      // snapshot, then re-create the session and delete-operation markers.
      admin.disableTable(BackupSystemTable.getTableName(conf1));
      admin.restoreSnapshot(snapshotName);
      admin.enableTable(BackupSystemTable.getTableName(conf1));
      table.startBackupSession();
      table.startDeleteOperation(backupIds);

      // Run the repair command; it must detect and replay the failed delete.
      String[] args = new String[] { "repair" };
      int ret = ToolRunner.run(conf1, new BackupDriver(), args);
      assertTrue(ret == 0);
      // Backup history must now be empty.
      assertTrue(table.getBackupHistory().size() == 0);
    }
  }
}