diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java index 7c4f9d7..836fa5c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java @@ -338,6 +338,8 @@ public void recover(RMState state) throws Exception { } else { maxAppAttempts = individualMaxAppAttempts; } + // TODO: for work-preserving restart, still recover the app when + // attemptCount == maxAttempts; the last attempt may still be running. if(appState.getAttemptCount() >= maxAppAttempts) { LOG.info("Not recovering application " + appState.getAppId() + " due to recovering attempt is beyond maxAppAttempt limit"); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java index 78adf79..db08d0b 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java @@ -360,7 +360,6 @@ public void testRMRestartOnMaxAppAttempts() throws Exception { Assert.assertNotNull(attemptState); 
Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1), attemptState.getMasterContainer().getId()); - rm1.stop(); // start new RM MockRM rm2 = new MockRM(conf, memStore); @@ -378,7 +377,12 @@ public void testRMRestartOnMaxAppAttempts() throws Exception { Assert.assertNull(rm2.getRMContext().getRMApps() .get(app1.getApplicationId())); - // stop the RM + // verify that app2 is stored, app1 is removed + Assert.assertNotNull(rmAppState.get(app2.getApplicationId())); + Assert.assertNull(rmAppState.get(app1.getApplicationId())); + + // stop the RM + rm1.stop(); rm2.stop(); } }