diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java index 4bf6a78..cd8dbc0 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java @@ -52,6 +52,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.RMCriticalThreadUncaughtExceptionHandler; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.RMFatalEvent; +import org.apache.hadoop.yarn.server.resourcemanager.RMFatalEventType; import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServer; import org.apache.hadoop.yarn.webapp.YarnWebParams; import org.junit.After; @@ -181,7 +183,8 @@ public void testAutomaticFailover() // so it transitions to standby. ResourceManager rm = cluster.getResourceManager( cluster.getActiveRMIndex()); - rm.handleTransitionToStandByInNewThread(); + rm.getRMContext().getDispatcher().getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_FENCED, "test")); int maxWaitingAttempts = 2000; while (maxWaitingAttempts-- > 0 ) { if (rm.getRMContext().getHAServiceState() == HAServiceState.STANDBY) { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java index c5c6087..09c1930 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java @@ -47,12 +47,8 @@ public RMCriticalThreadUncaughtExceptionHandler(RMContext rmContext) { public void uncaughtException(Thread t, Throwable e) { LOG.fatal("Critical thread " + t.getName() + " crashed!", e); - if (HAUtil.isHAEnabled(rmContext.getYarnConfiguration())) { - rmContext.getResourceManager().handleTransitionToStandByInNewThread(); - } else { - rmContext.getDispatcher().getEventHandler().handle( - new RMFatalEvent(RMFatalEventType.CRITICAL_THREAD_CRASH, - new Exception(e))); - } + rmContext.getDispatcher().getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.CRITICAL_THREAD_CRASH, + new Exception(e))); } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java index b6f6b3c..69b285f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java @@ -23,6 +23,7 @@ @InterfaceAudience.Private public enum RMFatalEventType { // Source <- Store + STATE_STORE_FENCED, STATE_STORE_OP_FAILED, // Source <- Embedded Elector diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 58e4077..dc81e74 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -812,15 +812,50 @@ protected void createPolicyMonitors() { } @Private - public static class RMFatalEventDispatcher - implements EventHandler { - + private class RMFatalEventDispatcher implements EventHandler { @Override public void handle(RMFatalEvent event) { - LOG.fatal("Received a " + RMFatalEvent.class.getName() + " of type " + - event.getType().name() + ". Cause:\n" + event.getCause()); + LOG.warn("Received an RMFatalEvent of type " + event.getType().name() + + ". Cause: " + event.getCause()); + + switch(event.getType()) { + case STATE_STORE_FENCED: + if (HAUtil.isHAEnabled(getConfig())) { + LOG.warn("State store fenced. Transitioning the resource manager " + + "to standby."); + handleTransitionToStandByInNewThread(); + } else { + LOG.fatal("State store fenced even though the resource manager " + + "is not configured for high availability. Shutting down this " + + "resource manager to protect the integrity of the state store."); + ExitUtil.terminate(1, event.getCause()); + } + break; + case CRITICAL_THREAD_CRASH: + if (HAUtil.isHAEnabled(getConfig())) { + LOG.warn("A critical thread has exited unexpectedly. " + + "Transitioning resource manager to standby."); + handleTransitionToStandByInNewThread(); + } else { + LOG.fatal("A critical thread has exited unexpectedly. " + + "Shutting down the resource manager."); + ExitUtil.terminate(1, event.getCause()); + } + default: + if (YarnConfiguration.shouldRMFailFast(getConfig())) { + LOG.fatal("State store operation error. Shutting down the " + + "resource manager because YARN is configured to fail fast. " + + "See the yarn.fail-fast and yarn.resourcemanager.fail-fast " + + "properties."); + ExitUtil.terminate(1, event.getCause()); + } else { + LOG.warn("State store operation error. Transitioning the " + + "resource manager to standby."); + handleTransitionToStandByInNewThread(); + } - ExitUtil.terminate(1, event.getCause()); + break; + } } } @@ -828,7 +863,7 @@ public void handle(RMFatalEvent event) { * Transition to standby state in a new thread. The transition operation is * asynchronous to avoid deadlock caused by cyclic dependency. */ - public void handleTransitionToStandByInNewThread() { + private void handleTransitionToStandByInNewThread() { Thread standByTransitionThread = new Thread(activeServices.standByTransitionRunnable); standByTransitionThread.setName("StandByTransitionThread"); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index 5e3cf22..975847c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -1129,18 +1129,18 @@ private boolean notifyStoreOperationFailedInternal( Exception failureCause) { boolean isFenced = false; LOG.error("State store operation failed ", failureCause); + if (HAUtil.isHAEnabled(getConfig())) { - LOG.warn("State-store fenced ! Transitioning RM to standby"); + rmDispatcher.getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_FENCED, + failureCause)); isFenced = true; - resourceManager.handleTransitionToStandByInNewThread(); - } else if (YarnConfiguration.shouldRMFailFast(getConfig())) { - LOG.fatal("Fail RM now due to state-store error!"); + } else { rmDispatcher.getEventHandler().handle( new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause)); - } else { - LOG.warn("Skip the state-store error."); } + return isFenced; } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java index 89b9e2b..cb278c0 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java @@ -43,6 +43,7 @@ public synchronized void removeRMDelegationTokenState( store.init(conf); ResourceManager mockRM = mock(ResourceManager.class); store.setResourceManager(mockRM); + store.setRMDispatcher(new RMStateStoreTestBase.TestDispatcher()); RMDelegationTokenIdentifier mockTokenId = mock(RMDelegationTokenIdentifier.class); store.removeRMDelegationToken(mockTokenId); @@ -58,6 +59,7 @@ public synchronized void removeRMDelegationToken( }; store.init(conf); store.setResourceManager(mockRM); + store.setRMDispatcher(new RMStateStoreTestBase.TestDispatcher()); store.removeRMDelegationToken(mockTokenId); assertTrue("RMStateStore should have been in fenced state", store.isFencedState());