From 8fcecadd678337fed6ca5dea8830a60c9877bdea Mon Sep 17 00:00:00 2001 From: Ashu Pachauri Date: Thu, 12 Nov 2015 13:44:59 -0800 Subject: [PATCH] HBASE-14802: Fix handling of dead servers --- .../org/apache/hadoop/hbase/master/DeadServer.java | 38 ++++++++++++++++++---- .../master/procedure/ServerCrashProcedure.java | 3 +- .../apache/hadoop/hbase/master/TestDeadServer.java | 28 ++++++++++++++-- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java index 8b16b00..df59652 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java @@ -54,11 +54,21 @@ public class DeadServer { private final Map deadServers = new HashMap(); /** + * Set of procedures' IDs currently operating the dead servers. + */ + private final Set procSet = new HashSet<>(); + + /** * Number of dead servers currently being processed */ private int numProcessing = 0; /** + * Whether a dead server is being processed currently. + */ + private boolean processing = false; + + /** * A dead server that comes back alive has a different start code. The new start code should be * greater than the old one, but we don't take this into account in this method. * @@ -94,9 +104,7 @@ public class DeadServer { * * @return true if any RS are being processed as dead */ - public synchronized boolean areDeadServersInProgress() { - return numProcessing != 0; - } + public synchronized boolean areDeadServersInProgress() { return processing; } public synchronized Set copyServerNames() { Set clone = new HashSet(deadServers.size()); @@ -109,15 +117,33 @@ public class DeadServer { * @param sn the server name */ public synchronized void add(ServerName sn) { - this.numProcessing++; + processing = true; if (!deadServers.containsKey(sn)){ deadServers.put(sn, EnvironmentEdgeManager.currentTime()); } } - public synchronized void finish(ServerName sn) { + /** + * Starting to process the dead server using the given procedure. + * @param sn ServerName for the dead server. + * @param procId ID of the procedure processing the dead server. + */ + public synchronized void process(ServerName sn, long procId) { + if (!this.procSet.contains(procId)) { + processing = true; + this.numProcessing++; + procSet.add(procId); + } + } + + public synchronized void finish(ServerName sn, long procId) { + if (procSet.contains(procId)) { + this.numProcessing--; + procSet.remove(procId); + } if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + this.numProcessing); - this.numProcessing--; + + if (numProcessing == 0) { processing = false; } } public synchronized int size() { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 1e86a23..1ad6675 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -183,6 +183,7 @@ implements ServerProcedureInterface { if (!services.getAssignmentManager().isFailoverCleanupDone()) { throwProcedureYieldException("Waiting on master failover to complete"); } + services.getServerManager().getDeadServers().process(serverName, this.getProcId()); try { switch (state) { case SERVER_CRASH_START: @@ -275,7 +276,7 @@ implements ServerProcedureInterface { case SERVER_CRASH_FINISH: LOG.info("Finished processing of crashed " + serverName); - services.getServerManager().getDeadServers().finish(serverName); + services.getServerManager().getDeadServers().finish(serverName, this.getProcId()); return Flow.NO_MORE_STATE; default: diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java index 40d26f4..075cde2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java @@ -17,7 +17,10 @@ */ package org.apache.hadoop.hbase.master; +import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; @@ -43,18 +46,21 @@ public class TestDeadServer { @Test public void testIsDead() { DeadServer ds = new DeadServer(); ds.add(hostname123); + ds.process(hostname123, 0); assertTrue(ds.areDeadServersInProgress()); - ds.finish(hostname123); + ds.finish(hostname123, 0); assertFalse(ds.areDeadServersInProgress()); ds.add(hostname1234); + ds.process(hostname1234, 1); assertTrue(ds.areDeadServersInProgress()); - ds.finish(hostname1234); + ds.finish(hostname1234, 1); assertFalse(ds.areDeadServersInProgress()); ds.add(hostname12345); + ds.process(hostname12345, 2); assertTrue(ds.areDeadServersInProgress()); - ds.finish(hostname12345); + ds.finish(hostname12345, 2); assertFalse(ds.areDeadServersInProgress()); // Already dead = 127.0.0.1,9090,112321 @@ -75,6 +81,22 @@ public class TestDeadServer { assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive)); } + @Test(timeout = 15000) + public void testCrashProcedureReplay() throws Exception { + HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + TEST_UTIL.startMiniCluster(); + HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); + ProcedureExecutor pExecutor = master.getMasterProcedureExecutor(); + + ServerCrashProcedure proc = new ServerCrashProcedure(hostname123, false, false); + + pExecutor.submitProcedure(proc); + while(!pExecutor.isFinished(proc.getProcId())) { + Thread.sleep(500); + } + + assertFalse(master.getServerManager().getDeadServers().areDeadServersInProgress()); + } @Test public void testSortExtract(){ -- 1.9.5