From 64b753657e7e54f3f1adb58e7ce2ca962a7f16e8 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Wed, 22 Aug 2018 18:40:08 +0800 Subject: [PATCH] HBASE-21095 The timeout retry logic for several procedures are broken after master restarts --- .../TransitRegionStateProcedure.java | 19 ++++++++++++++++--- .../procedure/ServerCrashProcedure.java | 15 +++++++++++---- .../TestCloseRegionWhileRSCrash.java | 8 +++++++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index e853b9bbac..daa9059906 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.procedure2.ProcedureEvent; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; @@ -41,6 +42,7 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionState; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; @@ -99,6 +101,8 @@ public class TransitRegionStateProcedure private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class); + private final ProcedureEvent retryingEvent = new ProcedureEvent<>(this); + private RegionStateTransitionState initialState; private RegionStateTransitionState lastState; @@ -326,8 +330,8 @@ public class TransitRegionStateProcedure "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " + "by other Procedure or operator intervention", backoff / 1000, this, regionNode.toShortString(), e); - regionNode.getProcedureEvent().suspend(); - if (regionNode.getProcedureEvent().suspendIfNotReady(this)) { + retryingEvent.suspend(); + if (retryingEvent.suspendIfNotReady(this)) { setTimeout(Math.toIntExact(backoff)); setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); throw new ProcedureSuspendedException(); @@ -342,10 +346,19 @@ public class TransitRegionStateProcedure @Override protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { setState(ProcedureProtos.ProcedureState.RUNNABLE); - getRegionStateNode(env).getProcedureEvent().wake(env.getProcedureScheduler()); + retryingEvent.wake(env.getProcedureScheduler()); return false; // 'false' means that this procedure handled the timeout } + @Override + protected void afterReplay(MasterProcedureEnv env) { + if (getState() == ProcedureState.WAITING_TIMEOUT) { + // restore the suspend state + retryingEvent.suspend(); + retryingEvent.suspendIfNotReady(this); + } + } + private void reportTransitionOpened(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName, TransitionCode code, long openSeqNum) throws IOException { switch (code) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index db7a872c38..1fcc6eb6b9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -114,6 +114,17 @@ public class ServerCrashProcedure notifiedDeadServer = true; } + switch (state) { + case SERVER_CRASH_START: + case SERVER_CRASH_SPLIT_META_LOGS: + case SERVER_CRASH_ASSIGN_META: + break; + default: + // If hbase:meta is not assigned, yield. + if (env.getAssignmentManager().waitMetaLoaded(this)) { + throw new ProcedureSuspendedException(); + } + } try { switch (state) { case SERVER_CRASH_START: @@ -134,10 +145,6 @@ public class ServerCrashProcedure setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS); break; case SERVER_CRASH_GET_REGIONS: - // If hbase:meta is not assigned, yield. - if (env.getAssignmentManager().waitMetaLoaded(this)) { - throw new ProcedureSuspendedException(); - } this.regionsOnCrashedServer = services.getAssignmentManager().getRegionStates().getServerRegionInfoSet(serverName); // Where to go next? Depends on whether we should split logs at all or diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java index 9b1f4ca52e..88778a623d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface; @@ -149,6 +150,7 @@ public class TestCloseRegionWhileRSCrash { @BeforeClass public static void setUp() throws Exception { + UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); UTIL.startMiniCluster(3); UTIL.createTable(TABLE_NAME, CF); UTIL.getAdmin().balancerSwitch(false, true); @@ -174,7 +176,7 @@ public class TestCloseRegionWhileRSCrash { HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs); ProcedureExecutor procExec = UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); - procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName())); + long dummyProcId = procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName())); ARRIVE.await(); UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName()); UTIL.waitFor(30000, @@ -207,6 +209,10 @@ public class TestCloseRegionWhileRSCrash { Thread.sleep(1000); } RESUME.countDown(); + UTIL.waitFor(30000, () -> procExec.isFinished(dummyProcId)); + // here we restart + UTIL.getMiniHBaseCluster().stopMaster(0).join(); + UTIL.getMiniHBaseCluster().startMaster(); t.join(); // Make sure that the region is online, it may not on the original target server, as we will set // forceNewPlan to true if there is a server crash -- 2.17.1