diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java
index e5e5537..5f7d01e 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java
@@ -25,13 +25,46 @@
 public class CMgrCompletedContainersEvent extends ContainerManagerEvent {
 
   private final List containerToCleanup;
-  
-  public CMgrCompletedContainersEvent(List containersToCleanup) {
+  private final Reason reason;
+
+  public CMgrCompletedContainersEvent(List containersToCleanup,
+      Reason reason) {
     super(ContainerManagerEventType.FINISH_CONTAINERS);
     this.containerToCleanup = containersToCleanup;
+    this.reason = reason;
   }
 
   public List getContainersToCleanup() {
     return this.containerToCleanup;
   }
+
+  /**
+   * @return why the containers listed in this event are being killed
+   */
+  public Reason getReason() {
+    return reason;
+  }
+
+  /**
+   * Why the containers carried by a {@link CMgrCompletedContainersEvent} are
+   * being killed.
+   */
+  public static enum Reason {
+    /**
+     * Container is killed as NodeManager is shutting down
+     */
+    ON_SHUTDOWN,
+
+    /**
+     * Container is killed as the NodeManager is re-syncing with the
+     * ResourceManager
+     */
+    ON_NODEMANAGER_RESYNC,
+
+    /**
+     * Container is killed on request by the ResourceManager
+     */
+    BY_RESOURCEMANAGER
+  }
+
 }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
index 79b9d7a..6353b66 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
@@ -226,7 +226,8 @@ protected void resyncWithRM() {
       public void run() {
         LOG.info("Notifying ContainerManager to block new container-requests");
         containerManager.setBlockNewContainerRequests(true);
-        containerManager.cleanUpApplications(NodeManagerEventType.RESYNC);
+        LOG.info("Cleaning up running containers on resync");
+        containerManager.cleanupContainers(NodeManagerEventType.RESYNC);
         ((NodeStatusUpdaterImpl) nodeStatusUpdater ).rebootNodeStatusUpdater();
       }
     }.start();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
index b52f9d1..d6af3fe 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
@@ -501,7 +501,8 @@ public void run() {
                 .getContainersToCleanup();
             if (!containersToCleanup.isEmpty()) {
               dispatcher.getEventHandler().handle(
-                  new CMgrCompletedContainersEvent(containersToCleanup));
+                  new CMgrCompletedContainersEvent(containersToCleanup,
+                    CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER));
             }
             List appsToCleanup =
                 response.getApplicationsToCleanup();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
index d158b43..557d82b 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
@@ -374,6 +374,81 @@ public void cleanUpApplications(NodeManagerEventType eventType) {
     }
   }
 
+  /**
+   * Asks the container manager to kill every container currently tracked in
+   * the NodeManager context and waits for them to be removed from it.
+   * <p>
+   * On {@code SHUTDOWN} the wait is bounded by
+   * {@code waitForContainersOnShutdownMillis}; on {@code RESYNC} it continues
+   * until the context holds no containers. Returns immediately when the
+   * context has no containers.
+   *
+   * @param eventType the NodeManager event that triggered the cleanup
+   * @throws YarnRuntimeException if containers are present and
+   *           {@code eventType} is neither {@code SHUTDOWN} nor {@code RESYNC}
+   */
+  @SuppressWarnings("unchecked")
+  public void cleanupContainers(NodeManagerEventType eventType) {
+    Map containers = context.getContainers();
+    if (containers.isEmpty()) {
+      return;
+    }
+    LOG.info("Containers still running on " + eventType + " : "
+        + containers.keySet());
+
+    List containerIds =
+      new ArrayList(containers.keySet());
+
+    LOG.info("Waiting for containers to be killed");
+
+    switch (eventType) {
+      case SHUTDOWN:
+        this.handle(
+            new CMgrCompletedContainersEvent(containerIds,
+                CMgrCompletedContainersEvent.Reason.ON_SHUTDOWN));
+        long waitStartTime = System.currentTimeMillis();
+        while (!containers.isEmpty()
+            && System.currentTimeMillis() - waitStartTime
+            < waitForContainersOnShutdownMillis) {
+          try {
+            //To remove done containers in NM context
+            nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext();
+            Thread.sleep(1000);
+          } catch (InterruptedException ex) {
+            LOG.warn("Interrupted while sleeping on container kill on shutdown",
+              ex);
+          }
+        }
+        break;
+      case RESYNC:
+        this.handle(
+            new CMgrCompletedContainersEvent(containerIds,
+                CMgrCompletedContainersEvent.Reason.ON_NODEMANAGER_RESYNC));
+        // Unlike shutdown, this wait has no timeout: it loops until the NM
+        // context holds no containers.
+        while (!containers.isEmpty()) {
+          try {
+            Thread.sleep(1000);
+            nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext();
+          } catch (InterruptedException ex) {
+            LOG.warn("Interrupted while sleeping on container kill on resync",
+              ex);
+          }
+        }
+        break;
+      default:
+        throw new YarnRuntimeException("Got an unknown NodeManagerEventType: "
+            + eventType);
+    }
+
+    // Report whether every container was cleaned up before we stopped waiting
+    if (containers.isEmpty()) {
+      LOG.info("All containers in DONE state");
+    } else {
+      LOG.info("Done waiting for containers to be killed. Still alive: " +
+        containers.keySet());
+    }
+  }
+
   // Get the remoteUGI corresponding to the api call.
   protected UserGroupInformation getRemoteUgi()
       throws YarnException {
@@ -850,7 +925,7 @@ public void handle(ContainerManagerEvent event) {
       break;
     default:
         throw new YarnRuntimeException(
-            "Get an unknown ContainerManagerEvent type: " + event.getType());
+            "Got an unknown ContainerManagerEvent type: " + event.getType());
     }
   }
 
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java
index 3e0846b..440f331 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java
@@ -102,7 +102,7 @@ public void testKillContainersOnResync() throws IOException,
     } catch (BrokenBarrierException e) {
     }
     Assert.assertEquals(2, ((TestNodeManager1) nm).getNMRegistrationCount());
-    
+    Assert.assertFalse(nm.getNMContext().getApplications().isEmpty());
     Assert.assertFalse(assertionFailedInThread.get());
 
     nm.stop();