diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index 399e16e531b..2d942e7d64e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -1220,11 +1220,18 @@ private void updateSchedulerHealth(long now, FiCaSchedulerNode node, } @VisibleForTesting - protected synchronized void allocateContainersToNode(FiCaSchedulerNode node) { + public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext.isSchedulerReadyForAllocatingContainers()) { return; } + + if (!nodes.containsKey(node.getNodeID())) { + LOG.info("Skipping scheduling as the node " + node.getNodeID() + + " has been removed"); + return; + } + // reset allocation and reservation stats before we start doing any work updateSchedulerHealth(lastNodeUpdateTime, node, new CSAssignment(Resources.none(), NodeType.NODE_LOCAL)); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java index a663bdb9775..e7eb788e286 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java @@ -3875,4 +3875,42 @@ private void waitforNMRegistered(ResourceScheduler scheduler, int nodecount, } } } + + @Test + public void testSchedulingOnRemovedNode() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, + ResourceScheduler.class); + conf.setBoolean( + CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE, + false); + + MockRM rm = new MockRM(conf); + rm.start(); + RMApp app = rm.submitApp(100); + rm.drainEvents(); + + MockNM nm1 = rm.registerNode("127.0.0.1:1234", 10240, 10); + MockAM am = MockRM.launchAndRegisterAM(app, rm, nm1); + + //remove nm2 to keep am alive + MockNM nm2 = rm.registerNode("127.0.0.1:1235", 10240, 10); + + am.allocate(ResourceRequest.ANY, 2048, 1, null); + + CapacityScheduler scheduler = + (CapacityScheduler) rm.getRMContext().getScheduler(); + FiCaSchedulerNode node = scheduler.getAllNodes().get(nm2.getNodeId()); + scheduler.handle(new NodeRemovedSchedulerEvent( + rm.getRMContext().getRMNodes().get(nm2.getNodeId()))); + // schedulerNode is removed, try allocate a container + scheduler.allocateContainersToNode(node); + + AppAttemptRemovedSchedulerEvent appRemovedEvent1 = + new AppAttemptRemovedSchedulerEvent( + am.getApplicationAttemptId(), + RMAppAttemptState.FINISHED, false); + scheduler.handle(appRemovedEvent1); + rm.stop(); + } }