diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 3f296c1..1ed8b6f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -840,7 +840,7 @@ public class HMaster extends HRegionServer implements MasterServices { this.regionServerTracker.start( procedureExecutor.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure) .map(p -> ((ServerCrashProcedure) p).getServerName()).collect(Collectors.toSet()), - walManager.getLiveServersFromWALDir()); + walManager.getLiveServersFromWALDir(), walManager.getSplittingServersFromWALDir()); // This manager will be started AFTER hbase:meta is confirmed on line. // hbase.mirror.table.state.to.zookeeper is so hbase1 clients can connect. They read table // state from zookeeper while hbase2 reads it from hbase:meta. Disable if no hbase1 clients. 
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java index 2dc8918..e77f0bd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java @@ -143,19 +143,27 @@ public class MasterWalManager { return this.fsOk; } - public Set getLiveServersFromWALDir() throws IOException { + /** + * Get ServerNames from the WAL dir + * @param filter the filter used to select the wanted WAL directories + * @return a set of ServerNames parsed from the selected WAL directories + * @throws IOException IOException + */ + private Set getServerNamesFromWALDir(final PathFilter filter) + throws IOException { Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); - FileStatus[] walDirForLiveServers = FSUtils.listStatus(fs, walDirPath, - p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); + FileStatus[] walDirForLiveServers = FSUtils + .listStatus(fs, walDirPath, filter); if (walDirForLiveServers == null) { return Collections.emptySet(); } return Stream.of(walDirForLiveServers).map(s -> { - ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath()); + ServerName serverName = AbstractFSWALProvider + .getServerNameFromWALDirectoryName(s.getPath()); if (serverName == null) { LOG.warn("Log folder {} doesn't look like its name includes a " + - "region server name; leaving in place. If you see later errors about missing " + - "write ahead logs they may be saved in this location.", s.getPath()); + "region server name; leaving in place. 
If you see later errors about missing " + + "write ahead logs they may be saved in this location.", s.getPath()); return null; } return serverName; @@ -163,6 +171,26 @@ public class MasterWalManager { } /** + * Get ServerNames which are currently splitting + * @return a set of ServerNames whose WAL directories are marked as splitting + * @throws IOException IOException + */ + public Set getSplittingServersFromWALDir() throws IOException { + return getServerNamesFromWALDir( + p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); + } + + /** + * Get ServerNames which are currently alive (not splitting) + * @return a set of ServerNames whose WAL directories are not marked as splitting + * @throws IOException IOException + */ + public Set getLiveServersFromWALDir() throws IOException { + return getServerNamesFromWALDir( + p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); + } + + /** * Inspect the log directory to find dead servers which need recovery work * @return A set of ServerNames which aren't running but still have WAL files left in file system * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java index 83c8afd..e3fa34d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java @@ -119,7 +119,9 @@ public class RegionServerTracker extends ZKListener { * @param deadServersFromPE the region servers which already have SCP associated. * @param liveServersFromWALDir the live region servers from wal directory. 
*/ - public void start(Set deadServersFromPE, Set liveServersFromWALDir) + public void start(Set deadServersFromPE, + Set liveServersFromWALDir, + Set splittingServersFromWALDir) throws KeeperException, IOException { watcher.registerListener(this); synchronized (this) { @@ -136,7 +138,9 @@ public class RegionServerTracker extends ZKListener { : ServerMetricsBuilder.of(serverName); serverManager.checkAndRecordNewServer(serverName, serverMetrics); } - serverManager.findOutDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir); + // A bit strange that we handle dead servers here; does it have to be here? + serverManager.findOutDeadServersAndProcess(deadServersFromPE, + liveServersFromWALDir, splittingServersFromWALDir); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index db335c1..178b522 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -317,10 +317,13 @@ public class ServerManager { * @param liveServersFromWALDir the live region servers from wal directory. 
*/ void findOutDeadServersAndProcess(Set deadServersFromPE, - Set liveServersFromWALDir) { + Set liveServersFromWALDir, Set splittingServersFromWALDir) { deadServersFromPE.forEach(deadservers::add); liveServersFromWALDir.stream().filter(sn -> !onlineServers.containsKey(sn)) .forEach(this::expireServer); + splittingServersFromWALDir.stream() + .filter(sn -> !deadServersFromPE.contains(sn)) + .forEach(this::expireServer); } /** diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMetaInitIfAllProceduresLost.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMetaInitIfAllProceduresLost.java new file mode 100644 index 0000000..6c0c580 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMetaInitIfAllProceduresLost.java @@ -0,0 +1,79 @@ +package org.apache.hadoop.hbase.master.assignment; + +import java.util.stream.Collectors; + + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore.MASTER_PROCEDURE_LOGDIR; + +@Category({MasterTests.class, MediumTests.class}) +public class TestMetaInitIfAllProceduresLost { + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMetaInitIfAllProceduresLost.class); + + private static final Logger LOG = LoggerFactory + 
.getLogger(TestMetaInitIfAllProceduresLost.class); + + protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + @BeforeClass + public static void setupCluster() throws Exception { + UTIL.startMiniCluster(3); + } + + @AfterClass + public static void cleanupTest() throws Exception { + try { + UTIL.shutdownMiniCluster(); + } catch (Exception e) { + LOG.warn("failure shutting down cluster", e); + } + } + + @Test + public void test() throws Exception { + for (JVMClusterUtil.RegionServerThread rst : UTIL.getMiniHBaseCluster() + .getRegionServerThreads()) { + rst.getRegionServer().abort("killAll"); + } + //wait for a while, until all dirs are changed to '-splitting' + UTIL.waitFor(30000, () -> + UTIL.getMiniHBaseCluster().getMaster().getMasterWalManager() + .getLiveServersFromWALDir().size() == 0); + Thread.sleep(1000); + Path procedureWals = new Path( + UTIL.getMiniHBaseCluster().getMaster().getMasterFileSystem() + .getRootDir(), MASTER_PROCEDURE_LOGDIR); + //Kill the master + UTIL.getMiniHBaseCluster().killAll(); + //Delete all procedure logs to create an anomaly + for (FileStatus file : UTIL.getTestFileSystem().listStatus(procedureWals)) { + LOG.info("removing " + file); + UTIL.getTestFileSystem().delete(file.getPath()); + } + UTIL.getMiniHBaseCluster().startMaster(); + UTIL.getMiniHBaseCluster().startRegionServer(); + UTIL.getMiniHBaseCluster().startRegionServer(); + UTIL.getMiniHBaseCluster().startRegionServer(); + //Master should be able to finish init even if all procedures are lost + UTIL.waitFor(30000, () -> UTIL.getMiniHBaseCluster().getMaster() != null && UTIL + .getMiniHBaseCluster().getMaster().isInitialized()); + } + +}