diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java index 945e7cbac02..c6746b1a396 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java @@ -129,6 +129,8 @@ .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING, RMContainerEventType.ACQUIRED) .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING, + RMContainerEventType.LAUNCHED) + .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING, RMContainerEventType.RESERVED, new ContainerReservedTransition()) .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING, RMContainerEventType.ACQUIRE_UPDATED_CONTAINER, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index 0acfca79110..69a662e8e08 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -87,6 +87,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeResourceUpdateEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ApplicationSchedulingConfig; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ContainerRequest; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement; @@ -672,6 +673,16 @@ public void completedContainer(RMContainer rmContainer, return; } + if (event == RMContainerEventType.KILL && containerStatus.getDiagnostics() + .equals(SchedulerUtils.LOST_CONTAINER)) { + // Delegate to AM if requested + RMApp rmApp = this.rmContext.getRMApps().get(rmContainer.getContainerId() + .getApplicationAttemptId().getApplicationId()); + if (amHandlesNMLoss(rmApp)) { + return; + } + } + if (rmContainer.getExecutionType() == ExecutionType.GUARANTEED) { completedContainerInternal(rmContainer, containerStatus, event); completeOustandingUpdatesWhichAreReserved( @@ -735,10 +746,12 @@ protected abstract void completedContainerInternal(RMContainer rmContainer, protected void releaseContainers(List containers, SchedulerApplicationAttempt attempt) { for (ContainerId containerId : containers) { + RMApp rmApp = this.rmContext.getRMApps().get(attempt + .getApplicationAttemptId().getApplicationId()); RMContainer rmContainer = getRMContainer(containerId); if (rmContainer == null) { if (System.currentTimeMillis() - ResourceManager.getClusterTimeStamp() - < nmExpireInterval) { + < nmExpireInterval || amHandlesNMLoss(rmApp)) { LOG.info(containerId + " doesn't exist. Add the container" + " to the release request cache as it maybe on recovery."); attempt.getPendingRelease().add(containerId); @@ -749,6 +762,9 @@ protected void releaseContainers(List containers, "Trying to release container not owned by app or with invalid id.", attempt.getApplicationId(), containerId, null); } + } else if (amHandlesNMLoss(rmApp)) { + LOG.debug("Adding " + containerId + " to the release request cache."); + attempt.getPendingRelease().add(containerId); } completedContainer(rmContainer, SchedulerUtils.createAbnormalContainerStatus(containerId, @@ -756,6 +772,13 @@ protected void releaseContainers(List containers, } } + private static boolean amHandlesNMLoss(RMApp rmApp) { + return rmApp != null && !"true".equals(rmApp.getApplicationSchedulingEnvs() + .getOrDefault( + ApplicationSchedulingConfig.ENV_KILL_CONTAINER_ON_NM_LOSS, + ApplicationSchedulingConfig.DEFAULT_KILL_CONTAINER_ON_NM_LOSS)); + } + @Override public N getSchedulerNode(NodeId nodeId) { return nodeTracker.getNode(nodeId); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java index f9df2b829b7..905e6c1b6b8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java @@ -70,6 +70,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFinishedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerReservedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; @@ -640,6 +641,17 @@ public void containerLaunchedOnNode(ContainerId containerId, new RMNodeCleanContainerEvent(nodeId, containerId)); return; } + if (getPendingRelease().remove(containerId)) { + // release the container + rmContainer.handle( + new RMContainerFinishedEvent(containerId, + SchedulerUtils + .createAbnormalContainerStatus(containerId, + SchedulerUtils.RELEASED_CONTAINER), + RMContainerEventType.RELEASED)); + LOG.info(containerId + " is released by application."); + return; + } rmContainer.handle( new RMContainerEvent(containerId, RMContainerEventType.LAUNCHED)); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java index 06f74de96bc..99741f7d9ca 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java @@ -36,4 +36,11 @@ @InterfaceAudience.Private public static final String ENV_MULTI_NODE_SORTING_POLICY_CLASS = "MULTI_NODE_SORTING_POLICY_CLASS"; + + @InterfaceAudience.Private + public static final String ENV_KILL_CONTAINER_ON_NM_LOSS = + "KILL_CONTAINER_ON_NM_LOSS"; + + @InterfaceAudience.Private + public static final String DEFAULT_KILL_CONTAINER_ON_NM_LOSS = "true"; }