From 72f6dd010dc669961d02637d5d57addacd1849cd Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 27 Jun 2018 21:44:58 -0700 Subject: [PATCH] HBASE-20796 STUCK RIT though region successfully assigned On assign, we can get multiple handleFailure calls; once from the SCP and then later, if the RPC is stuck on this server, as part of the RPC cleanup. Add checks so we drop the second call on the floor rather than 'process' it to 'wake' a Procedure that has already handled the failure (If the Procedure is done, logs get filled with notice of the STUCK 'zombie' Procedure). UnassignProcedure, while it has a different form, already had protection against double running of failure handling; here we are adding it to AssignProcedure. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java Have handleFailure return true if it 'processed' the failure, else false if the call comes in and regionNode is not in expected state; if the latter, just ignore the call. Bug fix! We were calling offline of regionNode BEFORE we asked am to undoRegionAsOpening. This looks like it should be AFTER the am has done its cleanup; in fact, let am set the regionNode to OFFLINE as it has a lock on regionNode already. Only set Procedure to go back to the start if we 'processed' the failure; else just exit after logging we're ignoring the invocation. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java Return true if we ran an undo of OPENING state. Added setting node into OFFLINE state in here. M hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedure.java This test seemed to be in wrong package. Moved it. And then added test that we don't do handleFailure twice. --- .../hbase/master/assignment/AssignProcedure.java | 16 +++++--- .../hbase/master/assignment/AssignmentManager.java | 4 +- .../TestAssignProcedure.java | 45 ++++++++++++++++++++-- 3 files changed, 54 insertions(+), 11 deletions(-) rename hbase-server/src/test/java/org/apache/hadoop/hbase/master/{snapshot => assignment}/TestAssignProcedure.java (83%) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index 86f0a3ff59..73500736a4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -319,18 +319,23 @@ public class AssignProcedure extends RegionTransitionProcedure { * inline dispatch call or later by the ServerCrashProcedure. Our state is * generally OPENING. Cleanup and reset to OFFLINE and put our Procedure * State back to REGION_TRANSITION_QUEUE so the Assign starts over. + * @return True is handled, false if ignored because unexpected state. */ - private void handleFailure(final MasterProcedureEnv env, final RegionStateNode regionNode) { + private boolean handleFailure(final MasterProcedureEnv env, final RegionStateNode regionNode) { if (incrementAndCheckMaxAttempts(env, regionNode)) { aborted.set(true); } this.forceNewPlan = true; this.targetServer = null; - regionNode.offline(); // We were moved to OPENING state before dispatch. Undo. It is safe to call // this method because it checks for OPENING first. - env.getAssignmentManager().undoRegionAsOpening(regionNode); - setTransitionState(RegionTransitionState.REGION_TRANSITION_QUEUE); + if (env.getAssignmentManager().undoRegionAsOpening(regionNode)) { + setTransitionState(RegionTransitionState.REGION_TRANSITION_QUEUE); + return true; + } else { + LOG.warn("Ignored; unexpected state {}", regionNode); + return false; + } } private boolean incrementAndCheckMaxAttempts(final MasterProcedureEnv env, @@ -353,8 +358,7 @@ public class AssignProcedure extends RegionTransitionProcedure { @Override protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode, final IOException exception) { - handleFailure(env, regionNode); - return true; + return handleFailure(env, regionNode); } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 27a1b74186..b7c01077a5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1503,18 +1503,20 @@ public class AssignmentManager implements ServerListener { metrics.incrementOperationCounter(); } - public void undoRegionAsOpening(final RegionStateNode regionNode) { + public boolean undoRegionAsOpening(final RegionStateNode regionNode) { boolean opening = false; synchronized (regionNode) { if (regionNode.isInState(State.OPENING)) { opening = true; regionStates.removeRegionFromServer(regionNode.getRegionLocation(), regionNode); + regionNode.offline(); } // Should we update hbase:meta? } if (opening) { // TODO: Metrics. Do opposite of metrics.incrementOperationCounter(); } + return opening; } public void markRegionAsOpened(final RegionStateNode regionNode) throws IOException { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedure.java similarity index 83% rename from hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java rename to hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedure.java index 8836c9f75e..572c01328e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestAssignProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedure.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hbase.master.snapshot; +package org.apache.hadoop.hbase.master.assignment; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -29,16 +29,16 @@ import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HBaseIOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.ServerManager; -import org.apache.hadoop.hbase.master.assignment.AssignProcedure; -import org.apache.hadoop.hbase.master.assignment.AssignmentManager; -import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.testclassification.RegionServerTests; @@ -213,4 +213,41 @@ public class TestAssignProcedure { } } } + + @Test + public void testDoubleHandleFailureCall() throws IOException { + RegionInfo ri = + RegionInfoBuilder.newBuilder(TableName.valueOf(this.name.getMethodName())).build(); + ServerName sn = ServerName.valueOf("server.example.org", 0, 0); + + ServerManager sm = Mockito.mock(ServerManager.class); + Mockito.when(sm.isServerOnline(Mockito.any())).thenReturn(true); + MasterServices ms = Mockito.mock(MasterServices.class); + Mockito.when(ms.getServerManager()).thenReturn(sm); + Configuration configuration = HBaseConfiguration.create(); + Mockito.when(ms.getConfiguration()).thenReturn(configuration); + AssignmentManager am = new AssignmentManager(ms); + + MasterProcedureEnv env = Mockito.mock(MasterProcedureEnv.class); + MasterProcedureScheduler ps = Mockito.mock(MasterProcedureScheduler.class); + Mockito.when(env.getProcedureScheduler()).thenReturn(ps); + Mockito.when(env.getAssignmentManager()).thenReturn(am); + Mockito.when(env.getMasterServices()).thenReturn(ms); + RSProcedureDispatcher rsd = new RSProcedureDispatcher(ms); + Mockito.when(env.getRemoteDispatcher()).thenReturn(rsd); + + AssignProcedure ap = new AssignProcedure(ri, sn); + RegionStates.RegionStateNode rsn = am.getRegionStates().getOrCreateRegionStateNode(ri); + rsn.setRegionLocation(sn); + rsn.setState(RegionState.State.OPENING); + IOException ioe = new HBaseIOException(); + ap.remoteCallFailed(env, sn, ioe); + // Assert that we handled the failure because state was OPENING. State should be OFFLINE now. + assertTrue(rsn.isInState(RegionState.State.OFFLINE)); + // Set an unexpected state. + rsn.setState(RegionState.State.OPEN); + ap.remoteCallFailed(env, sn, ioe); + // Should be in same unexpected state because we did nothing. + assertTrue(rsn.isInState(RegionState.State.OPEN)); + } } -- 2.16.3