diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java index d0b8280d62..d068bc3cb3 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java @@ -233,7 +233,7 @@ public abstract class RemoteProcedureDispatcher { RemoteOperation remoteCallBuild(TEnv env, TRemote remote); void remoteCallCompleted(TEnv env, TRemote remote, RemoteOperation response); - void remoteCallFailed(TEnv env, TRemote remote, IOException exception); + boolean remoteCallFailed(TEnv env, TRemote remote, IOException exception); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index a0297b3ac3..f2b967afa7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hbase.master.TableStateManager; import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation; +import org.apache.hadoop.hbase.master.procedure.ServerCrashException; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; @@ -388,6 +389,17 @@ public class AssignProcedure extends RegionTransitionProcedure { @Override protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode, final IOException exception) { + RegionTransitionState tState = getTransitionState(); + if (tState == RegionTransitionState.REGION_TRANSITION_FINISH + && exception instanceof ServerCrashException) { + // if we found that AssignProcedure is at this stage, then ServerCerash handling may/may not + // have any effect + // depending upon the race between handling of the failure and execution at + // REGION_TRANSITION_FINISH state + LOG.warn("Assign Procedure is at state:" + tState + + ", so Handling of Server Crash may not have any affect"); + return false; + } handleFailure(env, regionNode); return true; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 04a25c7748..00b70d7e38 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -42,7 +42,6 @@ import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.YouAreDeadException; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; -import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; @@ -64,7 +63,6 @@ import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler; import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait; -import org.apache.hadoop.hbase.master.procedure.ServerCrashException; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureEvent; @@ -88,7 +86,6 @@ import org.slf4j.LoggerFactory; import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; -import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; @@ -1858,42 +1855,6 @@ public class AssignmentManager implements ServerListener { } } - /** - * Handle RIT of meta region against crashed server. - * Only used when ServerCrashProcedure is not enabled. - * See handleRIT in ServerCrashProcedure for similar function. - * - * @param serverName Server that has already crashed - */ - public void handleMetaRITOnCrashedServer(ServerName serverName) { - RegionInfo hri = RegionReplicaUtil - .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, - RegionInfo.DEFAULT_REPLICA_ID); - RegionState regionStateNode = getRegionStates().getRegionState(hri); - if (regionStateNode == null) { - LOG.warn("RegionStateNode is null for " + hri); - return; - } - ServerName rsnServerName = regionStateNode.getServerName(); - if (rsnServerName != null && !rsnServerName.equals(serverName)) { - return; - } else if (rsnServerName == null) { - LOG.warn("Empty ServerName in RegionStateNode; proceeding anyways in case latched " + - "RecoverMetaProcedure so meta latch gets cleaned up."); - } - // meta has been assigned to crashed server. - LOG.info("Meta assigned to crashed " + serverName + "; reassigning..."); - // Handle failure and wake event - RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri); - // Do not need to consider for REGION_TRANSITION_QUEUE step - if (rtp != null && rtp.isMeta() && - rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) { - LOG.debug("Failing " + rtp.toString()); - rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName, - new ServerCrashException(rtp.getProcId(), serverName)); - } - } - @VisibleForTesting MasterServices getMaster() { return master; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index 00ace72dbe..b8fda062cc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -205,7 +205,7 @@ public abstract class RegionTransitionProcedure this.transitionState = state; } - RegionTransitionState getTransitionState() { + protected RegionTransitionState getTransitionState() { return transitionState; } @@ -245,7 +245,7 @@ public abstract class RegionTransitionProcedure } @Override - public synchronized void remoteCallFailed(final MasterProcedureEnv env, + public synchronized boolean remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { final RegionStateNode regionNode = getRegionState(env); LOG.warn("Remote call failed {} {}", this, regionNode.toShortString(), exception); @@ -254,7 +254,9 @@ public abstract class RegionTransitionProcedure // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond // this method. Just get out of this current processing quickly. regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); + return true; } + return false; // else leave the procedure in suspended state; it is waiting on another call to this callback } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 6ec0079875..9fe5545407 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -419,13 +419,15 @@ public class ServerCrashProcedure if (sce == null) { sce = new ServerCrashException(getProcId(), getServerName()); } - rtp.remoteCallFailed(env, this.serverName, sce); - // If an assign, remove from passed-in list of regions so we subsequently do not create - // a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate - // and assign to a server other than the crashed one; no need to create new assign. - // If an unassign, do not return this region; the above cancel will wake up the unassign and - // it will complete. Done. - it.remove(); + if(rtp.remoteCallFailed(env, this.serverName, sce)) { + // If an assign, remove from passed-in list of regions so we subsequently do not create + // a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate + // and assign to a server other than the crashed one; no need to create new assign. + // If an unassign, do not return this region; the above cancel will wake up the unassign and + // it will complete. Done. + it.remove(); + } + } return toAssign; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java index 8836c9f75e..5067fc361c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java @@ -108,11 +108,12 @@ public class TestAssignProcedure { } @Override - public void remoteCallFailed(final MasterProcedureEnv env, + public boolean remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { // Just skip this remoteCallFailed. Its too hard to mock. Assert it is called though. // Happens after the code we are testing has been called. this.remoteCallFailedWasCalled.set(true); + return true; } };