diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index d3cec140264..6cc44000170 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1168,6 +1168,15 @@ public static boolean isAclEnabled(Configuration conf) {
public static final int
DEFAULT_RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC = 60000;
+ /**
+ * Whether to enable RM to mark inactive nodes as untracked and removed from
+ * nodes list for the YARN cluster without configured include path.
+ */
+ public static final String RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH =
+ RM_PREFIX + "enable-node-untracked-without-include-path";
+ public static final boolean
+ DEFAULT_RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH = false;
+
/**
* RM proxy users' prefix
*/
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index b64efba5dca..b3d86d07601 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -4779,4 +4779,25 @@
yarn.resourcemanager.application-tag-based-placement.force-lowercase
true
+
+
+
+ Whether to enable RM to mark inactive nodes as untracked after the timeout
+ specified by yarn.resourcemanager.node-removal-untracked.timeout-ms and
+ then remove them from nodes list for the YARN cluster without configured
+ include path, which means RM can periodically clear inactive nodes to
+ avoid increasing memory to store these data when enabled, most desired by
+ elastic cloud environment with frequent auto-scaling operations.
+ It works only when the YARN cluster doesn't utilize include file, the key
+ configurations are as follows:
+ yarn.resourcemanager.nodes.exclude-path=/path-to-exclude-file
+ yarn.resourcemanager.nodes.include-path=
+ yarn.resourcemanager.node-removal-untracked.timeout-ms=60000
+ In this situation, the inactive nodes will never be marked as untracked
+ and removed from the nodes list unless this configuration is enabled:
+ yarn.resourcemanager.enable-node-untracked-without-include-path=true
+
+ yarn.resourcemanager.enable-node-untracked-without-include-path
+ false
+
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
index 07d78cb1c1c..ae601b50712 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
@@ -85,6 +85,7 @@
private Timer removalTimer;
private int nodeRemovalCheckInterval;
private Set gracefulDecommissionableNodes;
+ private boolean enableNodeUntrackedWithoutIncludePath;
public NodesListManager(RMContext rmContext) {
super(NodesListManager.class.getName());
@@ -124,6 +125,9 @@ protected void serviceInit(Configuration conf) throws Exception {
disableHostsFileReader(ioe);
}
+ enableNodeUntrackedWithoutIncludePath = conf.getBoolean(
+ YarnConfiguration.RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH,
+ YarnConfiguration.DEFAULT_RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH);
final int nodeRemovalTimeout =
conf.getInt(
YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC,
@@ -605,7 +609,10 @@ public boolean isUntrackedNode(String hostName) {
Set hostsList = hostDetails.getIncludedHosts();
Set excludeList = hostDetails.getExcludedHosts();
- return !hostsList.isEmpty() && !hostsList.contains(hostName)
+ return (!hostsList.isEmpty() || (enableNodeUntrackedWithoutIncludePath
+ && (hostDetails.getIncludesFile() == null
+ || hostDetails.getIncludesFile().isEmpty())))
+ && !hostsList.contains(hostName)
&& !hostsList.contains(ip) && !excludeList.contains(hostName)
&& !excludeList.contains(ip);
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java
index 5e3e67e6eca..6ec0d053547 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java
@@ -18,12 +18,14 @@
package org.apache.hadoop.yarn.server.resourcemanager;
+import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap;
import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet;
import org.apache.hadoop.net.ServerSocketUtil;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.delegation.web.DelegationTokenIdentifier;
+import org.apache.hadoop.util.Sets;
import org.apache.hadoop.yarn.nodelabels.NodeAttributeStore;
import org.apache.hadoop.yarn.nodelabels.NodeLabelUtil;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
@@ -122,6 +124,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent;
@@ -3063,4 +3066,86 @@ public void testSystemCredentialsAfterTokenSequenceNoChange()
resourceTrackerService.close();
}
+
+ /**
+ * Decommissioning without pre-configured include hosts file.
+ */
+ @Test
+ public void testDecommissionWithoutIncludeFile() throws Exception {
+ // clear exclude hosts
+ writeToHostsFile(excludeHostFile, "");
+ // init conf:
+ // (1) set untracked removal timeout to 500ms
+ // (2) set exclude path (no include path)
+ // (3) enable node untracked without pre-configured include path
+ Configuration conf = new Configuration();
+ conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC,
+ 500);
+ conf.setBoolean(
+ YarnConfiguration.RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH, true);
+ conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
+ excludeHostFile.getAbsolutePath());
+
+ rm = new MockRM(conf);
+ rm.start();
+ MockNM nm1 = rm.registerNode("host1:1234", 10240);
+ MockNM nm2 = rm.registerNode("host2:1234", 10240);
+ MockNM nm3 = rm.registerNode("host3:1234", 10240);
+ MockNM nm4 = rm.registerNode("host4:1234", 10240);
+ assertEquals(4, rm.getRMContext().getRMNodes().size());
+ assertEquals(0, rm.getRMContext().getInactiveRMNodes().size());
+
+ // decommission nm1 via adding nm1 into exclude hosts
+ RMNode rmNode1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
+ writeToHostsFile(excludeHostFile, "host1");
+ rm.getNodesListManager().refreshNodes(conf);
+ rm.drainEvents();
+ assertEquals(rmNode1.getState(), NodeState.DECOMMISSIONED);
+ assertEquals(3, rm.getRMContext().getRMNodes().size());
+ assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+ assertEquals(Sets.newHashSet(nm1.getNodeId()),
+ rm.getRMContext().getInactiveRMNodes().keySet());
+
+ // remove nm1 from exclude hosts, so that it will be marked as untracked
+ // and removed from inactive nodes after the timeout
+ writeToHostsFile(excludeHostFile, "");
+ rm.getNodesListManager().refreshNodes(conf);
+ // confirmed that nm1 should be removed from inactive nodes in 1 second
+ GenericTestUtils.waitFor(
+ () -> rm.getRMContext().getInactiveRMNodes().size() == 0, 100, 1000);
+
+ // lost nm2
+ RMNode rmNode2 = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
+ rm.getRMContext().getDispatcher().getEventHandler()
+ .handle(new RMNodeEvent(nm2.getNodeId(), RMNodeEventType.EXPIRE));
+ rm.drainEvents();
+ assertEquals(rmNode2.getState(), NodeState.LOST);
+ assertEquals(2, rm.getRMContext().getRMNodes().size());
+ assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+ assertEquals(Sets.newHashSet(nm2.getNodeId()),
+ rm.getRMContext().getInactiveRMNodes().keySet());
+ // confirmed that nm2 should be removed from inactive nodes in 1 second
+ GenericTestUtils.waitFor(
+ () -> rm.getRMContext().getInactiveRMNodes().size() == 0, 100, 1000);
+
+ // shutdown nm3
+ RMNode rmNode3 = rm.getRMContext().getRMNodes().get(nm3.getNodeId());
+ rm.getRMContext().getDispatcher().getEventHandler()
+ .handle(new RMNodeEvent(nm3.getNodeId(), RMNodeEventType.SHUTDOWN));
+ rm.drainEvents();
+ assertEquals(rmNode3.getState(), NodeState.SHUTDOWN);
+ assertEquals(1, rm.getRMContext().getRMNodes().size());
+ assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+ assertEquals(Sets.newHashSet(nm3.getNodeId()),
+ rm.getRMContext().getInactiveRMNodes().keySet());
+ // confirmed that nm3 should be removed from inactive nodes in 1 second
+ GenericTestUtils.waitFor(
+ () -> rm.getRMContext().getInactiveRMNodes().size() == 0, 100, 1000);
+
+ // nm4 is still active node at last
+ assertEquals(Sets.newHashSet(nm4.getNodeId()),
+ rm.getRMContext().getRMNodes().keySet());
+
+ rm.close();
+ }
}