Always restore the first stored attempt id when recovering an application.

RMAppImpl.recover() previously read firstAttemptIdInStateStore (and reset
nextAttemptId to it) only when the submission context's attempt-failures
validity interval was > 0. This patch performs that restore unconditionally
and adds TestResourceManager.testRMRestartOnMissingAttempts, which starts a
second MockRM on the state store left behind by a first one.

Paths carry no a/ b/ prefix (apply with -p0). NOTE(review): context-line
indentation was reconstructed; use --ignore-whitespace if a hunk is rejected.

diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
index f1d55a4..93e6d40 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
@@ -811,11 +811,9 @@ public void recover(RMState state) {
     this.storedFinishTime = appState.getFinishTime();
     this.startTime = appState.getStartTime();
     this.callerContext = appState.getCallerContext();
-    // If interval > 0, some attempts might have been deleted.
-    if (submissionContext.getAttemptFailuresValidityInterval() > 0) {
-      this.firstAttemptIdInStateStore = appState.getFirstAttemptId();
-      this.nextAttemptId = firstAttemptIdInStateStore;
-    }
+    // Some attempts might have been deleted.
+    this.firstAttemptIdInStateStore = appState.getFirstAttemptId();
+    this.nextAttemptId = firstAttemptIdInStateStore;
 
     // send the ATS create Event
     sendATSCreateEvent(this, this.startTime);
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java
index 9ceeffb..4398fe4 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java
@@ -37,8 +37,12 @@
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
+import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
@@ -316,4 +320,51 @@ protected void doSecureLogin() throws IOException {
     }
   }
 
+  /**
+   * Verifies that a second RM sharing the first RM's state store starts
+   * cleanly after an application has cycled through more AM attempts than
+   * RM_AM_MAX_ATTEMPTS allows.
+   */
+  @Test(timeout = 60000)
+  public void testRMRestartOnMissingAttempts() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
+    // NOTE(review): RECOVERY_ENABLED is not set on conf; confirm that rm2
+    // really replays memStore, otherwise recover() is never exercised.
+    MemoryRMStateStore memStore = new MemoryRMStateStore();
+    memStore.init(conf);
+    MockRM rm1 = new MockRM(conf, memStore);
+    MockRM rm2 = null;
+    try {
+      rm1.start();
+      MockNM nm1 =
+          new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
+      nm1.registerNode();
+      // create app and launch the AM
+      RMApp app0 = rm1.submitApp(1024);
+      CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+      // Exceed RM_AM_MAX_ATTEMPTS by killing each AM container.
+      for (int i = 0; i < 5; i++) {
+        MockAM am0 = MockRM.launchAM(app0, rm1, nm1);
+        am0.registerAppAttempt();
+        // get scheduler app
+        FiCaSchedulerApp schedulerAppAttempt = cs.getSchedulerApplications()
+            .get(app0.getApplicationId()).getCurrentAppAttempt();
+        // kill app0-attempt
+        cs.killContainer(schedulerAppAttempt.getRMContainer(
+            app0.getCurrentAppAttempt().getMasterContainer().getId()));
+      }
+      rm1.killApp(app0.getApplicationId());
+      // Start RM2 on the same store; any exception propagates and fails the
+      // test with its original stack trace.
+      rm2 = new MockRM(conf, memStore);
+      rm2.start();
+    } finally {
+      if (rm2 != null) {
+        rm2.stop();
+      }
+      rm1.stop();
+    }
+  }
+
 }