diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java
index 945e7cbac02..c6746b1a396 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java
@@ -129,6 +129,8 @@
     .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING,
         RMContainerEventType.ACQUIRED)
     .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING,
+        RMContainerEventType.LAUNCHED)
+    .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING,
         RMContainerEventType.RESERVED, new ContainerReservedTransition())
     .addTransition(RMContainerState.RUNNING, RMContainerState.RUNNING,
         RMContainerEventType.ACQUIRE_UPDATED_CONTAINER,
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java
index 0acfca79110..29c3700a10b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java
@@ -87,6 +87,7 @@
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeResourceUpdateEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesManager;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ApplicationSchedulingConfig;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ContainerRequest;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement;
 
@@ -672,10 +673,25 @@ public void completedContainer(RMContainer rmContainer,
       return;
     }
 
+    if (event == RMContainerEventType.KILL && SchedulerUtils.LOST_CONTAINER
+        .equals(containerStatus.getDiagnostics())) {
+      // Delegate to AM if requested
+      RMApp rmApp = this.rmContext.getRMApps().get(rmContainer.getContainerId()
+          .getApplicationAttemptId().getApplicationId());
+      if (amHandlesNMLoss(rmApp)) {
+        return;
+      }
+    }
+
     if (rmContainer.getExecutionType() == ExecutionType.GUARANTEED) {
       completedContainerInternal(rmContainer, containerStatus, event);
       completeOustandingUpdatesWhichAreReserved(
           rmContainer, containerStatus, event);
+      SchedulerApplicationAttempt schedulerAttempt =
+          getCurrentAttemptForContainer(rmContainer.getContainerId());
+      if (schedulerAttempt != null) {
+        schedulerAttempt.removeRMContainer(rmContainer.getContainerId());
+      }
     } else {
       ContainerId containerId = rmContainer.getContainerId();
       // Inform the container
@@ -756,6 +772,22 @@ protected void releaseContainers(List containers,
     }
   }
 
+  /**
+   * Tells whether the application opted out of having its containers killed
+   * by the RM when their NodeManager is lost, leaving that decision to the AM.
+   *
+   * @param rmApp application owning the container, may be {@code null}
+   * @return {@code true} unless the application is unknown or its scheduling
+   *         env {@code KILL_CONTAINER_ON_NM_LOSS} parses (case-insensitively)
+   *         to {@code true}, which is also the default
+   */
+  private static boolean amHandlesNMLoss(RMApp rmApp) {
+    return rmApp != null && !Boolean.parseBoolean(
+        rmApp.getApplicationSchedulingEnvs().getOrDefault(
+            ApplicationSchedulingConfig.ENV_KILL_CONTAINER_ON_NM_LOSS,
+            ApplicationSchedulingConfig.DEFAULT_KILL_CONTAINER_ON_NM_LOSS));
+  }
+
   @Override
   public N getSchedulerNode(NodeId nodeId) {
     return nodeTracker.getNode(nodeId);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java
index 06f74de96bc..99741f7d9ca 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/ApplicationSchedulingConfig.java
@@ -36,4 +36,16 @@
   @InterfaceAudience.Private
   public static final String ENV_MULTI_NODE_SORTING_POLICY_CLASS =
       "MULTI_NODE_SORTING_POLICY_CLASS";
+
+  /**
+   * Scheduling env key; unless its value is {@code true}, the scheduler leaves
+   * the containers of a lost node to the AM instead of completing them.
+   */
+  @InterfaceAudience.Private
+  public static final String ENV_KILL_CONTAINER_ON_NM_LOSS =
+      "KILL_CONTAINER_ON_NM_LOSS";
+
+  /** Value used when {@code KILL_CONTAINER_ON_NM_LOSS} is not set. */
+  @InterfaceAudience.Private
+  public static final String DEFAULT_KILL_CONTAINER_ON_NM_LOSS = "true";
 }