.../apache/hadoop/yarn/conf/YarnConfiguration.java | 29 +++++++++++++++++++- .../src/main/resources/yarn-default.xml | 16 +++++++++++ .../resourcemanager/RMActiveServiceContext.java | 15 ++++++++++ .../yarn/server/resourcemanager/RMContext.java | 3 ++ .../yarn/server/resourcemanager/RMContextImpl.java | 13 +++++++++ .../server/resourcemanager/ResourceManager.java | 10 +++++++ .../server/resourcemanager/rmapp/RMAppImpl.java | 3 ++ .../rmapp/attempt/RMAppAttemptImpl.java | 3 ++ .../server/resourcemanager/rmnode/RMNodeImpl.java | 32 ++++++++++++++++++++++ .../rmapp/attempt/TestRMAppAttemptTransitions.java | 8 ++++-- 10 files changed, 129 insertions(+), 3 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index a8a87ad8c9c..24eb89c5e66 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -558,7 +558,19 @@ public static boolean isAclEnabled(Configuration conf) { public static final String RM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS = RM_PREFIX + "rm.container-allocation.expiry-interval-ms"; public static final int DEFAULT_RM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS = 600000; - + + /** How long to wait for AM container allocation default 15 minutes */ + public static final String AM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS = RM_PREFIX + + "am.container-allocation.expiry-interval-ms"; + public static final int DEFAULT_AM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS = 900000; + + /** + * AM Container Allocation Expire Enabled or not. By default enabled = false. + */ + public static final String AMCONTAINER_ALLOCATIONEXPIRY_ENABLED = RM_PREFIX + + "am.container-allocation.expiry.enabled"; + public static final boolean DEFAULT_AMCONTAINER_ALLOCATIONEXPIRY_ENABLED = false; + /** Path to file with nodes to include.*/ public static final String RM_NODES_INCLUDE_FILE_PATH = RM_PREFIX + "nodes.include-path"; @@ -4786,4 +4798,19 @@ public static long getSkipNodeInterval(Configuration conf) { public static void main(String[] args) throws Exception { new YarnConfiguration(new Configuration()).writeXml(System.out); } + + /** + * Returns true if am allocation expiry is enabled else false + * @param conf + * @return boolean + */ + public static boolean isAMContainerAllocationExpiryEnabled( + Configuration conf) { + if (null == conf) { + return false; + } + return conf.getBoolean( + YarnConfiguration.AMCONTAINER_ALLOCATIONEXPIRY_ENABLED, + YarnConfiguration.DEFAULT_AMCONTAINER_ALLOCATIONEXPIRY_ENABLED); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index f7d9fc1d2b0..fb502d45c83 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1138,6 +1138,22 @@ yarn.resourcemanager.rm.container-allocation.expiry-interval-ms 600000 + + + + AM container is not allocated for specified time then kill the application + + yarn.resourcemanager.am.container-allocation.expiry.enabled + false + + + + + The expiry interval for AM container + + yarn.resourcemanager.am.container-allocation.expiry-interval-ms + 900000 + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMActiveServiceContext.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMActiveServiceContext.java index f1b0c794031..c19528b58ee 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMActiveServiceContext.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMActiveServiceContext.java @@ -30,6 +30,7 @@ import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirer; import org.apache.hadoop.yarn.nodelabels.NodeAttributesManager; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.SystemCredentialsForAppsProto; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMDelegatedNodeLabelsUpdater; @@ -91,6 +92,7 @@ private AMLivelinessMonitor amFinishingMonitor; private RMStateStore stateStore = null; private ContainerAllocationExpirer containerAllocationExpirer; + private AMContainerAllocationExpirer amContainerAllocationExpirer; private DelegationTokenRenewer delegationTokenRenewer; private AMRMTokenSecretManager amRMTokenSecretManager; private RMContainerTokenSecretManager containerTokenSecretManager; @@ -604,4 +606,17 @@ public Long getTokenSequenceNo() { public void incrTokenSequenceNo() { this.tokenSequenceNo.incrementAndGet(); } + + @Private + @Unstable + public AMContainerAllocationExpirer getAMContainerAllocationExpirer() { + return this.amContainerAllocationExpirer; + } + + @Private + @Unstable + void setAMContainerAllocationExpirer( + AMContainerAllocationExpirer amContainerAllocationExpirer) { + this.amContainerAllocationExpirer = amContainerAllocationExpirer; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContext.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContext.java index 55420bd9270..c7e0aa31f69 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContext.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContext.java @@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.AMLivelinessMonitor; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.monitor.RMAppLifetimeMonitor; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirer; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; @@ -203,4 +204,6 @@ void setMultiNodeSortingManager( long getTokenSequenceNo(); void incrTokenSequenceNo(); + + AMContainerAllocationExpirer getAMContainerAllocationExpirer(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContextImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContextImpl.java index 7f10138494e..7675f5a51d4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContextImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMContextImpl.java @@ -47,6 +47,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.AMLivelinessMonitor; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.monitor.RMAppLifetimeMonitor; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirer; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; @@ -678,4 +679,16 @@ public long getTokenSequenceNo() { public void incrTokenSequenceNo() { this.activeServiceContext.incrTokenSequenceNo(); } + + @VisibleForTesting + public void setAMContainerAllocationExpirer( + AMContainerAllocationExpirer amContainerAllocationExpirer) { + activeServiceContext + .setAMContainerAllocationExpirer(amContainerAllocationExpirer); + } + + @Override + public AMContainerAllocationExpirer getAMContainerAllocationExpirer() { + return activeServiceContext.getAMContainerAllocationExpirer(); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index c315b335415..3f34b713082 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -95,6 +95,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.monitor.RMAppLifetimeMonitor; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirer; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType; @@ -655,6 +656,7 @@ protected static void validateConfigs(Configuration conf) { private EventHandler schedulerDispatcher; private ApplicationMasterLauncher applicationMasterLauncher; private ContainerAllocationExpirer containerAllocationExpirer; + private AMContainerAllocationExpirer amContainerAllocationExpirer; private ResourceManager rm; private boolean fromActive = false; private StandByTransitionRunnable standByTransitionRunnable; @@ -676,6 +678,14 @@ protected void serviceInit(Configuration configuration) throws Exception { addService(containerAllocationExpirer); rmContext.setContainerAllocationExpirer(containerAllocationExpirer); + if (YarnConfiguration.isAMContainerAllocationExpiryEnabled(conf)) { + amContainerAllocationExpirer = new AMContainerAllocationExpirer( + rmContext); + addService(amContainerAllocationExpirer); + ((RMContextImpl) rmContext) + .setAMContainerAllocationExpirer(amContainerAllocationExpirer); + } + AMLivelinessMonitor amLivelinessMonitor = createAMLivelinessMonitor(); addService(amLivelinessMonitor); rmContext.setAMLivelinessMonitor(amLivelinessMonitor); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 94f7bb97bfe..ff97bf72afc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -77,6 +77,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirerUtil; import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistManager; import org.apache.hadoop.yarn.server.resourcemanager.blacklist.DisabledBlacklistManager; import org.apache.hadoop.yarn.server.resourcemanager.blacklist.SimpleBlacklistManager; @@ -1339,6 +1340,8 @@ public FinalSavingTransition(Object transitionToDo, @Override public void transition(RMAppImpl app, RMAppEvent event) { + AMContainerAllocationExpirerUtil.getInstance().unregister(app.rmContext, + app); app.rememberTargetTransitionsAndStoreState(event, transitionToDo, targetedFinalState, stateToBeStored); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 2d6de3750e9..ae9e6b6af6f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -72,6 +72,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.ClusterMetrics; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirerUtil; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEvent; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEventType; import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistManager; @@ -1232,6 +1233,8 @@ public RMAppAttemptState transition(RMAppAttemptImpl appAttempt, ClusterMetrics.getMetrics().addAMContainerAllocationDelay( allocationDelay); appAttempt.storeAttempt(); + AMContainerAllocationExpirerUtil.getInstance() + .unregister(appAttempt.rmContext, appAttempt); return RMAppAttemptState.ALLOCATED_SAVING; } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index cec9915e0d1..2540865a68a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -74,6 +74,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.NodesListManagerEventType; import org.apache.hadoop.yarn.server.resourcemanager.NodesListManager; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirerUtil; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl; @@ -1365,6 +1366,18 @@ public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) { LOG.info("Node " + rmNode.nodeId + " reported UNHEALTHY with details: " + remoteNodeHealthStatus.getHealthReport()); + List runningApps = rmNode.getRunningApps(); + Iterator iterator = runningApps.iterator(); + while (iterator.hasNext()) { + ApplicationId appId = iterator.next(); + if (!isAMLaunchedInSameRMNode(rmNode, appId)) { + iterator.remove(); + } + } + AMContainerAllocationExpirerUtil.getInstance() + .registerWithAMContainerAllocationExpirer(rmNode.context, + runningApps); + // if a node in decommissioning receives an unhealthy report, // it will stay in decommissioning. if (isNodeDecommissioning) { @@ -1401,6 +1414,25 @@ public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) { return initialState; } + + /** + * Return true if AM launched in the same rmNode Else return false + * + * @param rmNode + * @param appId + * @return + */ + private boolean isAMLaunchedInSameRMNode(RMNodeImpl rmNode, + ApplicationId appId) { + RMApp rmApp = rmNode.context.getRMApps().get(appId); + if (null != rmApp + && null != rmApp.getCurrentAppAttempt().getMasterContainer() + && rmNode.getNodeID().equals( + rmApp.getCurrentAppAttempt().getMasterContainer().getNodeId())) { + return true; + } + return false; + } } public static class StatusUpdateWhenUnHealthyTransition implements diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java index 4e5ff3f7687..eff69392689 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java @@ -75,6 +75,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMContextImpl; import org.apache.hadoop.yarn.server.resourcemanager.ahs.RMApplicationHistoryWriter; +import org.apache.hadoop.yarn.server.resourcemanager.amcontainer.AMContainerAllocationExpirer; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEvent; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEventType; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.ApplicationMasterLauncher; @@ -245,7 +246,9 @@ public void setUp() throws Exception { SecurityUtil.setAuthenticationMethod(authMethod, conf); UserGroupInformation.setConfiguration(conf); InlineDispatcher rmDispatcher = new InlineDispatcher(); - + + AMContainerAllocationExpirer amContainerAllocationExpirer = mock( + AMContainerAllocationExpirer.class); ContainerAllocationExpirer containerAllocationExpirer = mock(ContainerAllocationExpirer.class); amLivelinessMonitor = mock(AMLivelinessMonitor.class); @@ -260,7 +263,8 @@ public void setUp() throws Exception { new RMContainerTokenSecretManager(conf), nmTokenManager, clientToAMTokenManager); - + ((RMContextImpl) rmContext) + .setAMContainerAllocationExpirer(amContainerAllocationExpirer); store = mock(RMStateStore.class); ((RMContextImpl) rmContext).setStateStore(store); publisher = mock(SystemMetricsPublisher.class);