From 32288f28bda6e375c239905367d72fdd47cc047a Mon Sep 17 00:00:00 2001 From: Ashu Pachauri Date: Thu, 12 Nov 2015 13:44:59 -0800 Subject: [PATCH] HBASE-14802: Fix handling of dead servers --- .../org/apache/hadoop/hbase/master/DeadServer.java | 34 ++++++++++++++++++---- .../master/procedure/ServerCrashProcedure.java | 12 ++++++++ .../apache/hadoop/hbase/master/TestDeadServer.java | 31 ++++++++++++++++++++ 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java index 8b16b00..c33cdcc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java @@ -59,6 +59,11 @@ public class DeadServer { private int numProcessing = 0; /** + * Whether a dead server is being processed currently. + */ + private boolean processing = false; + + /** * A dead server that comes back alive has a different start code. The new start code should be * greater than the old one, but we don't take this into account in this method. * @@ -94,9 +99,7 @@ public class DeadServer { * * @return true if any RS are being processed as dead */ - public synchronized boolean areDeadServersInProgress() { - return numProcessing != 0; - } + public synchronized boolean areDeadServersInProgress() { return processing; } public synchronized Set copyServerNames() { Set clone = new HashSet(deadServers.size()); @@ -109,15 +112,34 @@ public class DeadServer { * @param sn the server name */ public synchronized void add(ServerName sn) { - this.numProcessing++; + processing = true; if (!deadServers.containsKey(sn)){ deadServers.put(sn, EnvironmentEdgeManager.currentTime()); } } + /** + * Notify that we started processing this dead server. + * @param sn ServerName for the dead server. + */ + public synchronized void notifyServer(ServerName sn) { + if (LOG.isDebugEnabled()) { LOG.debug("Started processing " + sn); } + processing = true; + numProcessing++; + } + public synchronized void finish(ServerName sn) { - if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + this.numProcessing); - this.numProcessing--; + numProcessing--; + if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + numProcessing); + + assert numProcessing >= 0: "Number of dead servers in processing should always be non-negative"; + + if (numProcessing < 0) { + LOG.error("Number of dead servers in processing = " + numProcessing + + ". Something went wrong, this should always be non-negative."); + numProcessing = 0; + } + if (numProcessing == 0) { processing = false; } } public synchronized int size() { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 1e86a23..5c9f6f4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -110,6 +110,11 @@ implements ServerProcedureInterface { private ServerName serverName; /** + * Whether DeadServer knows that we are processing it. + */ + private boolean notifiedDeadServer = false; + + /** * Regions that were on the crashed server. */ private Set regionsOnCrashedServer; @@ -183,6 +188,13 @@ implements ServerProcedureInterface { if (!services.getAssignmentManager().isFailoverCleanupDone()) { throwProcedureYieldException("Waiting on master failover to complete"); } + // HBASE-14802 + // If we have not yet notified that we are processing a dead server, we should do now. + if (!notifiedDeadServer) { + services.getServerManager().getDeadServers().notifyServer(serverName); + notifiedDeadServer = true; + } + try { switch (state) { case SERVER_CRASH_START: diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java index 40d26f4..596612e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java @@ -17,13 +17,19 @@ */ package org.apache.hadoop.hbase.master; +import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.ManualEnvironmentEdge; import org.apache.hadoop.hbase.util.Pair; +import org.junit.AfterClass; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -35,24 +41,39 @@ import static org.junit.Assert.assertTrue; @Category({MasterTests.class, MediumTests.class}) public class TestDeadServer { + private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + final ServerName hostname123 = ServerName.valueOf("127.0.0.1", 123, 3L); final ServerName hostname123_2 = ServerName.valueOf("127.0.0.1", 123, 4L); final ServerName hostname1234 = ServerName.valueOf("127.0.0.2", 1234, 4L); final ServerName hostname12345 = ServerName.valueOf("127.0.0.2", 12345, 4L); + @BeforeClass + public static void setupBeforeClass() throws Exception { + TEST_UTIL.startMiniCluster(); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + TEST_UTIL.shutdownMiniCluster(); + } + @Test public void testIsDead() { DeadServer ds = new DeadServer(); ds.add(hostname123); + ds.notifyServer(hostname123); assertTrue(ds.areDeadServersInProgress()); ds.finish(hostname123); assertFalse(ds.areDeadServersInProgress()); ds.add(hostname1234); + ds.notifyServer(hostname1234); assertTrue(ds.areDeadServersInProgress()); ds.finish(hostname1234); assertFalse(ds.areDeadServersInProgress()); ds.add(hostname12345); + ds.notifyServer(hostname12345); assertTrue(ds.areDeadServersInProgress()); ds.finish(hostname12345); assertFalse(ds.areDeadServersInProgress()); @@ -75,6 +96,16 @@ public class TestDeadServer { assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive)); } + @Test(timeout = 15000) + public void testCrashProcedureReplay() { + HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); + ProcedureExecutor pExecutor = master.getMasterProcedureExecutor(); + ServerCrashProcedure proc = new ServerCrashProcedure(hostname123, false, false); + + ProcedureTestingUtility.submitAndWait(pExecutor, proc); + + assertFalse(master.getServerManager().getDeadServers().areDeadServersInProgress()); + } @Test public void testSortExtract(){ -- 1.9.5