diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 56e72d3..f0722d2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -854,7 +854,13 @@ protected synchronized void completedContainer(RMContainer rmContainer, application.unreserve(rmContainer.getReservedPriority(), node); } else { application.containerCompleted(rmContainer, containerStatus, event); - node.releaseContainer(container); + // node could be null if the node was removed while this thread was waiting for the synchronized lock + if (node != null) { + node.releaseContainer(container); + } else { + LOG.info("Skipping container release on removed node: " + + container.getNodeId()); + } updateRootQueueMetrics(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index 6248e09..d2164de 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -85,11 +85,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; @@ -4959,4 +4961,62 @@ public void testUserAsDefaultQueueWithLeadingTrailingSpaceUserName() assertEquals("root.user1", resourceManager.getRMContext().getRMApps() .get(attId3.getApplicationId()).getQueue()); } + + @Test + public void testCompletedContainerOnBlacklistedNode() throws IOException { + scheduler.init(conf); + scheduler.start(); + scheduler.reinitialize(conf, resourceManager.getRMContext()); + + // Add a node + RMNode node = MockNodes.newNodeInfo(1, Resources.createResource(2048), 2, + "127.0.0.2"); + scheduler.handle(new NodeAddedSchedulerEvent(node)); + + // Create application attempt + ApplicationAttemptId appAttemptId = createAppAttemptId(1, 1); + 
createMockRMApp(appAttemptId); + scheduler.addApplication(appAttemptId.getApplicationId(), "root.queue1", + "user1", false); + scheduler.addApplicationAttempt(appAttemptId, false, false); + + // Create container request that goes to a specific node. + // Without the 2nd and 3rd request we do not get live containers + List<ResourceRequest> ask1 = new ArrayList<>(); + ResourceRequest request1 = + createResourceRequest(1024, node.getHostName(), 1, 1, true); + ask1.add(request1); + ResourceRequest request2 = + createResourceRequest(1024, node.getRackName(), 1, 1, false); + ask1.add(request2); + ResourceRequest request3 = + createResourceRequest(1024, ResourceRequest.ANY, 1, 1, false); + ask1.add(request3); + + // Perform allocation + scheduler.allocate(appAttemptId, ask1, new ArrayList<ContainerId>(), null, + null, null, null); + scheduler.update(); + scheduler.handle(new NodeUpdateSchedulerEvent(node)); + + // Get the allocated containers for the application (list cannot be null) + Collection<RMContainer> clist = scheduler.getSchedulerApp(appAttemptId) + .getLiveContainers(); + Assert.assertEquals(1, clist.size()); + + // Make sure that we remove the correct node (should never fail) + RMContainer rmc = clist.iterator().next(); + NodeId containerNodeID = rmc.getAllocatedNode(); + assertEquals(node.getNodeID(), containerNodeID); + + // Blacklist node + scheduler.handle(new NodeRemovedSchedulerEvent(node)); + + // Call completedContainer() should not fail even if the node has been + // removed + scheduler.completedContainer(rmc, + SchedulerUtils.createAbnormalContainerStatus(rmc.getContainerId(), + SchedulerUtils.COMPLETED_APPLICATION), + RMContainerEventType.EXPIRE); + } }