diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupDriver.java hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupDriver.java
index cc5cc95..e2cdb2f 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupDriver.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupDriver.java
@@ -109,6 +109,8 @@ public class BackupDriver extends AbstractHBaseTool {
       type = BackupCommand.PROGRESS;
     } else if (BackupCommand.SET.name().equalsIgnoreCase(cmd)) {
       type = BackupCommand.SET;
+    } else if (BackupCommand.REPAIR.name().equalsIgnoreCase(cmd)) {
+      type = BackupCommand.REPAIR;
     } else {
       System.out.println("Unsupported command for backup: " + cmd);
       printToolUsage();
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupRestoreConstants.java hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupRestoreConstants.java
index 80f022f..48e70a1 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupRestoreConstants.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/backup/BackupRestoreConstants.java
@@ -117,7 +117,7 @@ public interface BackupRestoreConstants {
 
   public static enum BackupCommand {
     CREATE, CANCEL, DELETE, DESCRIBE, HISTORY, STATUS, CONVERT, MERGE, STOP, SHOW, HELP, PROGRESS,
-    SET, SET_ADD, SET_REMOVE, SET_DELETE, SET_DESCRIBE, SET_LIST
+    SET, SET_ADD, SET_REMOVE, SET_DELETE, SET_DESCRIBE, SET_LIST, REPAIR
   }
 
 }
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java
index 211a706..a461295 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java
@@ -58,6 +58,7 @@ import org.apache.hadoop.hbase.backup.util.BackupUtils;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 
 import com.google.common.collect.Lists;
 
@@ -77,6 +78,7 @@ public final class BackupCommands {
       + "  history    show history of all successful backups\n"
       + "  progress   show the progress of the latest backup request\n"
       + "  set        backup set management\n"
+      + "  repair     repair backup system table\n"
       + "Run \'hbase backup COMMAND -h\' to see help message for each command\n";
 
   public static final String CREATE_CMD_USAGE =
@@ -99,6 +101,8 @@ public final class BackupCommands {
   public static final String DELETE_CMD_USAGE = "Usage: hbase backup delete <backup_id>\n"
       + "  backup_id       Backup image id\n";
 
+  public static final String REPAIR_CMD_USAGE = "Usage: hbase backup repair\n";
+
   public static final String CANCEL_CMD_USAGE = "Usage: hbase backup cancel <backup_id>\n"
       + "  backup_id       Backup image id\n";
 
@@ -191,6 +195,9 @@ public final class BackupCommands {
     case SET:
       cmd = new BackupSetCommand(conf, cmdline);
       break;
+    case REPAIR:
+      cmd = new RepairCommand(conf, cmdline);
+      break;
     case HELP:
     default:
       cmd = new HelpCommand(conf, cmdline);
@@ -519,6 +526,66 @@ public final class BackupCommands {
     }
   }
 
+  private static class RepairCommand extends Command {
+
+    RepairCommand(Configuration conf, CommandLine cmdline) {
+      super(conf);
+      this.cmdline = cmdline;
+    }
+
+    @Override
+    public void execute() throws IOException {
+      super.execute();
+
+      String[] args = cmdline == null ? null : cmdline.getArgs();
+      if (args != null && args.length > 1) {
+        System.err.println("ERROR: wrong number of arguments: " + args.length);
+        printUsage();
+        throw new IOException(INCORRECT_USAGE);
+      }
+
+      Configuration conf = getConf() != null ? getConf() : HBaseConfiguration.create();
+      try (final Connection conn = ConnectionFactory.createConnection(conf);
+          final BackupSystemTable sysTable = new BackupSystemTable(conn);) {
+
+        // Failed backup
+        BackupInfo backupInfo;
+        List<BackupInfo> list = sysTable.getBackupInfos(BackupState.RUNNING);
+        if (list.size() == 0) {
+          // No failed sessions found
+          System.err.println("No failed sessions found");
+          return;
+        }
+        backupInfo = list.get(0);
+        // If this is a cancel exception, then we've already cleaned.
+        // set the failure timestamp of the overall backup
+        backupInfo.setCompleteTs(EnvironmentEdgeManager.currentTime());
+        // set failure message
+        backupInfo.setFailedMsg("Repaired after failure:\n" + backupInfo);
+        // set overall backup status: failed
+        backupInfo.setState(BackupState.FAILED);
+        // compose the backup failed data
+        String backupFailedData =
+            "BackupId=" + backupInfo.getBackupId() + ",startts=" + backupInfo.getStartTs()
+                + ",failedts=" + backupInfo.getCompleteTs() + ",failedphase="
+                + backupInfo.getPhase() + ",failedmessage=" + backupInfo.getFailedMsg();
+        System.err.println(backupFailedData);
+        TableBackupClient.cleanupAndRestoreBackupSystem(conn, backupInfo, conf);
+        // If backup session is updated to FAILED state - means we
+        // processed recovery already.
+        sysTable.updateBackupInfo(backupInfo);
+        sysTable.finishBackupSession();
+        System.out.println("Backup session " + backupInfo.getBackupId() + " failed.");
+
+      }
+    }
+
+    @Override
+    protected void printUsage() {
+      System.out.println(REPAIR_CMD_USAGE);
+    }
+  }
+
   // TODO Cancel command
   private static class CancelCommand extends Command {
 
diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestBackupRepair.java hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestBackupRepair.java
new file mode 100644
index 0000000..ebc2d22
--- /dev/null
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestBackupRepair.java
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.backup;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.backup.BackupInfo.BackupPhase;
+import org.apache.hadoop.hbase.backup.BackupInfo.BackupState;
+import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
+import org.apache.hadoop.hbase.backup.impl.FullTableBackupClient;
+import org.apache.hadoop.hbase.backup.impl.TableBackupClient;
+import org.apache.hadoop.hbase.backup.master.LogRollMasterProcedureManager;
+import org.apache.hadoop.hbase.backup.util.BackupUtils;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.util.ToolRunner;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import com.google.common.annotations.VisibleForTesting;
+
+@Category(LargeTests.class)
+public class TestBackupRepair extends TestBackupBase {
+
+  private static final Log LOG = LogFactory.getLog(TestBackupRepair.class);
+
+  static class FullTableBackupClientForTest extends FullTableBackupClient {
+
+    public static final String BACKUP_TEST_MODE_STAGE = "backup.test.mode.stage";
+
+    public FullTableBackupClientForTest() {
+    }
+
+    public FullTableBackupClientForTest(Connection conn, String backupId, BackupRequest request)
+        throws IOException {
+      super(conn, backupId, request);
+    }
+
+    @Override
+    public void execute() throws IOException {
+      // Get the stage ID to fail on
+      try (Admin admin = conn.getAdmin();) {
+        // Begin BACKUP
+        beginBackup(backupManager, backupInfo);
+        failStageIf(0);
+        String savedStartCode = null;
+        boolean firstBackup = false;
+        // do snapshot for full table backup
+        savedStartCode = backupManager.readBackupStartCode();
+        firstBackup = savedStartCode == null || Long.parseLong(savedStartCode) == 0L;
+        if (firstBackup) {
+          // This is our first backup. Let's put some marker to system table so that we can
+          // hold the logs while we do the backup.
+          backupManager.writeBackupStartCode(0L);
+        }
+        failStageIf(1);
+        // We roll log here before we do the snapshot. It is possible there is duplicate data
+        // in the log that is already in the snapshot. But if we do it after the snapshot, we
+        // could have data loss.
+        // A better approach is to do the roll log on each RS in the same global procedure as
+        // the snapshot.
+ LOG.info("Execute roll log procedure for full backup ..."); + + Map props = new HashMap(); + props.put("backupRoot", backupInfo.getBackupRootDir()); + admin.execProcedure(LogRollMasterProcedureManager.ROLLLOG_PROCEDURE_SIGNATURE, + LogRollMasterProcedureManager.ROLLLOG_PROCEDURE_NAME, props); + failStageIf(2); + newTimestamps = backupManager.readRegionServerLastLogRollResult(); + if (firstBackup) { + // Updates registered log files + // We record ALL old WAL files as registered, because + // this is a first full backup in the system and these + // files are not needed for next incremental backup + List logFiles = BackupUtils.getWALFilesOlderThan(conf, newTimestamps); + backupManager.recordWALFiles(logFiles); + } + + // SNAPSHOT_TABLES: + backupInfo.setPhase(BackupPhase.SNAPSHOT); + for (TableName tableName : tableList) { + String snapshotName = + "snapshot_" + Long.toString(EnvironmentEdgeManager.currentTime()) + "_" + + tableName.getNamespaceAsString() + "_" + tableName.getQualifierAsString(); + + snapshotTable(admin, tableName, snapshotName); + backupInfo.setSnapshotName(tableName, snapshotName); + } + failStageIf(3); + // SNAPSHOT_COPY: + // do snapshot copy + LOG.debug("snapshot copy for " + backupId); + try { + snapshotCopy(backupInfo); + } catch (Exception e) { + throw new IOException(e); + } + // Updates incremental backup table set + backupManager.addIncrementalBackupTableSet(backupInfo.getTables()); + + // BACKUP_COMPLETE: + // set overall backup status: complete. Here we make sure to complete the backup. + // After this checkpoint, even if entering cancel process, will let the backup finished + backupInfo.setState(BackupState.COMPLETE); + // The table list in backupInfo is good for both full backup and incremental backup. + // For incremental backup, it contains the incremental backup table set. 
+        backupManager.writeRegionServerLogTimestamp(backupInfo.getTables(), newTimestamps);
+
+        HashMap<TableName, HashMap<String, Long>> newTableSetTimestampMap =
+            backupManager.readLogTimestampMap();
+
+        Long newStartCode =
+            BackupUtils.getMinValue(BackupUtils.getRSLogTimestampMins(newTableSetTimestampMap));
+        backupManager.writeBackupStartCode(newStartCode);
+        failStageIf(4);
+        // backup complete
+        completeBackup(conn, backupInfo, backupManager, BackupType.FULL, conf);
+      }
+    }
+
+    @VisibleForTesting
+    protected int getTestStageId() {
+      return conf.getInt(BACKUP_TEST_MODE_STAGE, 0);
+    }
+
+    @VisibleForTesting
+    protected void failStageIf(int stage) throws IOException {
+      int current = getTestStageId();
+      if (current == stage) {
+        throw new IOException("Failed stage " + stage + " in testing");
+      }
+    }
+  }
+
+  @Test
+  public void testFullBackupWithFailuresAndRestore() throws Exception {
+    conf1.set(TableBackupClient.BACKUP_CLIENT_IMPL_CLASS,
+      FullTableBackupClientForTest.class.getName());
+    // Fail at a random stage between 0 and 4 inclusive
+    int stage = (new Random()).nextInt(5);
+    LOG.info("Running stage " + stage);
+    runBackupAndFailAtStageWithRestore(stage);
+  }
+
+  public void runBackupAndFailAtStageWithRestore(int stage) throws Exception {
+
+    conf1.setInt(FullTableBackupClientForTest.BACKUP_TEST_MODE_STAGE, stage);
+    try (BackupSystemTable table = new BackupSystemTable(TEST_UTIL.getConnection())) {
+      int before = table.getBackupHistory().size();
+      String[] args =
+          new String[] { "create", "full", BACKUP_ROOT_DIR, "-t",
+              table1.getNameAsString() + "," + table2.getNameAsString() };
+      // Run backup
+      int ret = ToolRunner.run(conf1, new BackupDriver(), args);
+      assertFalse(ret == 0);
+
+      // Now run repair
+      args = new String[] { "repair" };
+
+      ret = ToolRunner.run(conf1, new BackupDriver(), args);
+      assertTrue(ret == 0);
+
+      List<BackupInfo> backups = table.getBackupHistory();
+      int after = table.getBackupHistory().size();
+
+      assertTrue(after == before + 1);
+      for (BackupInfo data : backups) {
+        String backupId = data.getBackupId();
+        assertFalse(checkSucceeded(backupId));
+      }
+      Set<TableName> tables = table.getIncrementalBackupTableSet(BACKUP_ROOT_DIR);
+      assertTrue(tables.size() == 0);
+    }
+  }
+
+}
\ No newline at end of file
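Usage note: because the new command is dispatched through BackupDriver, it can be run either from the shell ("hbase backup repair") or programmatically via ToolRunner, exactly as the test above does. A minimal sketch of the programmatic form, assuming a standard client classpath and site configuration; the class name BackupRepairExample is made up for illustration and is not part of this patch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.backup.BackupDriver;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver class, for illustration only.
public class BackupRepairExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    // Equivalent to "hbase backup repair": RepairCommand looks for a single
    // RUNNING session, marks it FAILED, cleans up, and finishes the session.
    // ToolRunner returns 0 on success, matching the assertions in the test.
    int ret = ToolRunner.run(conf, new BackupDriver(), new String[] { "repair" });
    System.exit(ret);
  }
}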