diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java index 75c285d9a35..d31416d3c1b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java @@ -373,14 +373,19 @@ static String getRefreshURL(String url) { /** * Throw {@link RuntimeException} inside a thread of - * {@link ResourceManager} with HA enabled and check if the - * {@link ResourceManager} is transited to standby state. + * {@link ResourceManager} with HA enabled and check that the + * {@link RMCriticalThreadUncaughtExceptionHandler} instance is invoked. + * + * Uses the {@link ExitUtil} class to avoid a JVM exit through + * {@code System.exit(-1)}. * * @throws InterruptedException if any */ @Test public void testUncaughtExceptionHandlerWithHAEnabled() throws InterruptedException { + ExitUtil.disableSystemHalt(); + conf.set(YarnConfiguration.RM_CLUSTER_ID, "yarn-test-cluster"); conf.set(YarnConfiguration.RM_ZK_ADDRESS, hostPort); cluster.init(conf); @@ -393,6 +398,8 @@ public void testUncaughtExceptionHandlerWithHAEnabled() final RMCriticalThreadUncaughtExceptionHandler exHandler = new RMCriticalThreadUncaughtExceptionHandler( resourceManager.getRMContext()); + final RMCriticalThreadUncaughtExceptionHandler spyRTEHandler = + spy(exHandler); // Create a thread and throw a RTE inside it final RuntimeException rte = new RuntimeException("TestRuntimeException"); @@ -403,11 +410,12 @@ public void run() { } }); testThread.setName("TestThread"); - testThread.setUncaughtExceptionHandler(exHandler); + testThread.setUncaughtExceptionHandler(spyRTEHandler); + assertSame(spyRTEHandler, testThread.getUncaughtExceptionHandler()); testThread.start(); testThread.join(); - 
verifyRMTransitionToStandby(resourceManager); + verify(spyRTEHandler).uncaughtException(testThread, rte); } /** @@ -423,7 +431,7 @@ public void run() { @Test public void testUncaughtExceptionHandlerWithoutHA() throws InterruptedException { - ExitUtil.disableSystemExit(); + ExitUtil.disableSystemHalt(); // Create a MockRM and start it ResourceManager resourceManager = new MockRM(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/CuratorBasedElectorService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/CuratorBasedElectorService.java index d7485f531b5..e10e643025e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/CuratorBasedElectorService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/CuratorBasedElectorService.java @@ -79,6 +79,7 @@ protected void serviceStop() throws Exception { super.serviceStop(); } + @SuppressWarnings(value = "unchecked") @Override public void rejoinElection() { try { @@ -86,7 +87,11 @@ public void rejoinElection() { Thread.sleep(1000); initAndStartLeaderLatch(); } catch (Exception e) { - LOG.info("Fail to re-join election.", e); + String errorMessage = "Fail to re-join election."; + LOG.error(errorMessage, e); + rm.getRMContext().getDispatcher().getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.EMBEDDED_ELECTOR_FAILED, e, + errorMessage)); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 3427667a33a..337f3a106dc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -901,10 +901,30 @@ public void handle(RMFatalEvent event) { LOG.error("Received " + event); if (HAUtil.isHAEnabled(getConfig())) { - // If we're in an HA config, the right answer is always to go into - // standby. - LOG.warn("Transitioning the resource manager to standby."); - handleTransitionToStandByInNewThread(); + // If we're in an HA config, to be conservative, shut down by default, + // and only transition to standby for known safe failures which will + // never leave the RM unable to work in standby state. + // For other failures, it may be unsafe to transition to standby, e.g. + // when the RM failed to rejoin the leader election in a previous + // standby transition (EMBEDDED_ELECTOR_FAILED). If we then tried to + // transition to standby again, the transition would be skipped because + // standByTransitionRunnable has already run, so the RM would never + // try to rejoin the leader election again and would hang forever + // (neither retrying nor crashing). + switch (event.getType()) { + case STATE_STORE_FENCED: + case STATE_STORE_OP_FAILED: + case TRANSITION_TO_ACTIVE_FAILED: + LOG.warn("Transitioning the resource manager to standby. " + + "Caused by: " + event); + handleTransitionToStandByInNewThread(); + break; + default: + LOG.fatal("Shutting down the resource manager because it may not " + + "continue to work even in standby state. 
" + + "Caused by: " + event); + ExitUtil.halt(-1, event.toString()); + } } else { // If we're stand-alone, we probably want to shut down, but the if and // how depends on the event. @@ -913,7 +933,7 @@ public void handle(RMFatalEvent event) { LOG.fatal("State store fenced even though the resource manager " + "is not configured for high availability. Shutting down this " + "resource manager to protect the integrity of the state store."); - ExitUtil.terminate(1, event.getExplanation()); + ExitUtil.halt(-1, event.toString()); break; case STATE_STORE_OP_FAILED: if (YarnConfiguration.shouldRMFailFast(getConfig())) { @@ -921,7 +941,7 @@ public void handle(RMFatalEvent event) { "store operation failed, and the resource manager is " + "configured to fail fast. See the yarn.fail-fast and " + "yarn.resourcemanager.fail-fast properties."); - ExitUtil.terminate(1, event.getExplanation()); + ExitUtil.halt(-1, event.toString()); } else { LOG.warn("Ignoring state store operation failure because the " + "resource manager is not configured to fail fast. See the " + @@ -931,7 +951,7 @@ public void handle(RMFatalEvent event) { break; default: LOG.fatal("Shutting down the resource manager."); - ExitUtil.terminate(1, event.getExplanation()); + ExitUtil.halt(-1, event.toString()); } } }