From 40fc8ae86d2fcd8d941264216e87b44899de9acb Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Mon, 5 Mar 2018 21:20:23 -0800 Subject: [PATCH] HBASE-20137 TestRSGroups is flakey On failed RPC we expire the server and suspend expecting the resultant ServerCrashProcedure to wake us back up again. In tests, TestRSGroups hung because it failed to schedule a server expiration because the server was already expired undergoing processing (the test was shutting down). Deal with this case by having expireServer return false if unable to expire. Callers will then know whether a ServerCrashProcedure has been scheduled or not. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Have expireServer return true if successful. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java The log that included an exception whose message was the current procedure as a String totally baffled me. Make it more obvious what exception is. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java If failed expire of a server, wake our procedure -- do not suspend -- and presume ok to move region to CLOSED state (because going down or concurrent crashed server processing ongoing). --- .../apache/hadoop/hbase/master/ServerManager.java | 17 ++++---- .../assignment/RegionTransitionProcedure.java | 8 ++-- .../hbase/master/assignment/UnassignProcedure.java | 46 +++++++++++++--------- 3 files changed, 41 insertions(+), 30 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 06d6c8b727..e2f0b6b309 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -555,15 +555,17 @@ public class ServerManager { } /* - * Expire the passed server. 
Add it to list of dead servers and queue a - * shutdown processing. + * Expire the passed server. Add it to list of dead servers and queue a shutdown processing. + * @return True if we expired passed serverName else false if we failed to schedule + * an expire (and attendant ServerCrashProcedure -- some clients are dependent on + * server crash procedure being queued and need to know if has not been queued). */ - public synchronized void expireServer(final ServerName serverName) { + public synchronized boolean expireServer(final ServerName serverName) { if (serverName.equals(master.getServerName())) { if (!(master.isAborted() || master.isStopped())) { master.stop("We lost our znode?"); } - return; + return false; } if (!master.isServerCrashProcessingEnabled()) { LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " @@ -573,13 +575,13 @@ public class ServerManager { // the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287 master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName); this.queuedDeadServers.add(serverName); - return; + return false; } if (this.deadservers.isDeadServer(serverName)) { // TODO: Can this happen? It shouldn't be online in this case? 
LOG.warn("Expiration of " + serverName + " but server shutdown already in progress"); - return; + return false; } moveFromOnlineToDeadServers(serverName); @@ -591,7 +593,7 @@ public class ServerManager { if (this.onlineServers.isEmpty()) { master.stop("Cluster shutdown set; onlineServer=0"); } - return; + return false; } LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName()); master.getAssignmentManager().submitServerCrash(serverName, true); @@ -602,6 +604,7 @@ public class ServerManager { listener.serverRemoved(serverName); } } + return true; } @VisibleForTesting diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index 6c63cb83be..25816d8db5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -182,10 +182,8 @@ public abstract class RegionTransitionProcedure public void remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { final RegionStateNode regionNode = getRegionState(env); - String msg = exception.getMessage() == null? exception.getClass().getSimpleName(): - exception.getMessage(); - LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() + - "; exception=" + msg); + LOG.warn("Remote call failed {}; rit={}, exception={}", this, regionNode.getState(), + exception.toString()); if (remoteCallFailed(env, regionNode, exception)) { // NOTE: This call to wakeEvent puts this Procedure back on the scheduler. // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond @@ -222,7 +220,7 @@ public abstract class RegionTransitionProcedure // wake to undo the above suspend. 
if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) { remoteCallFailed(env, targetServer, - new FailedRemoteDispatchException(this + " to " + targetServer)); + new FailedRemoteDispatchException(targetServer.toShortString())); return false; } return true; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java index 3454d96487..8d60318e80 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java @@ -249,17 +249,12 @@ public class UnassignProcedure extends RegionTransitionProcedure { final IOException exception) { // TODO: Is there on-going rpc to cleanup? if (exception instanceof ServerCrashException) { - // This exception comes from ServerCrashProcedure after log splitting. - // SCP found this region as a RIT. Its call into here says it is ok to let this procedure go - // on to a complete close now. This will release lock on this region so subsequent action on - // region can succeed; e.g. the assign that follows this unassign when a move (w/o wait on SCP - // the assign could run w/o logs being split so data loss). - try { - reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM); - } catch (UnexpectedStateException e) { - // Should never happen. - throw new RuntimeException(e); - } + // This exception comes from ServerCrashProcedure after it is done with log splitting. + // SCP found this region as a Region-In-Transition (RIT). Its call into here says it is ok to + // let this procedure go on to a complete close now. This will release lock on this region so + // subsequent action on region can succeed; e.g. the assign that follows this unassign when + // a move (w/o wait on SCP the assign could run w/o logs being split so data loss). 
+ reportTransitionCLOSED(env, regionNode); } else if (exception instanceof RegionServerAbortedException || exception instanceof RegionServerStoppedException || exception instanceof ServerNotRunningYetException) { @@ -273,17 +268,32 @@ public class UnassignProcedure extends RegionTransitionProcedure { exception); setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH); } else { - LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() + - ", exception=" + exception); - env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation()); - // Return false so this procedure stays in suspended state. It will be woken up by a - // ServerCrashProcedure when it notices this RIT. - // TODO: Add a SCP as a new subprocedure that we now come to depend on. - return false; + LOG.warn("Expiring server {}; rit={}, exception={}", this, regionNode.getState(), + exception.toString()); + if (env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation())) { + // Return false so this procedure stays in suspended state. It will be woken up by + // ServerCrashProcedure when it notices this RIT and calls this method again but with + // a ServerCrashException -- see above. + // TODO: Add a SCP as a new subprocedure that we now come to depend on. + return false; + } else { + LOG.warn("Failed expire {}; presumed in crash processing; moving region to CLOSED state", regionNode.getRegionLocation()); + reportTransitionCLOSED(env, regionNode); + } } return true; } + private void reportTransitionCLOSED(final MasterProcedureEnv env, + final RegionStateNode regionNode) { + try { + reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM); + } catch (UnexpectedStateException e) { + // Should never happen. + throw new RuntimeException(e); + } + } + @Override public void toStringClassDetails(StringBuilder sb) { super.toStringClassDetails(sb); -- 2.11.0 (Apple Git-81)