diff --git hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java index dbea90f..3b54e15 100644 --- hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java +++ hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java @@ -168,6 +168,15 @@ public String getNodeManagerVersion() { public Set getNodeLabels() { return RMNodeLabelsManager.EMPTY_STRING_SET; } + + @Override + public long getUntrackedTimeStamp() { + return 0; + } + + @Override + public void setUntrackedTimeStamp(final long timeStamp) { + } } public static RMNode newNodeInfo(String rackName, String hostName, diff --git hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java index 356b8bd..d95841f 100644 --- hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java +++ hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java @@ -158,4 +158,13 @@ public String getNodeManagerVersion() { public Set getNodeLabels() { return RMNodeLabelsManager.EMPTY_STRING_SET; } + + @Override + public final long getUntrackedTimeStamp() { + return 0; + } + + @Override + public void setUntrackedTimeStamp(final long timeStamp) { + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 3f85642..6163e59 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -621,6 +621,15 @@ private static void addDeprecatedKeys() { "NONE"; /** + * Timeout(msec) for an untracked node to remain in shutdown or decommissioned + * state. + */ + public static final String RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC = + RM_PREFIX + "node-removal-untracked.timeout-ms"; + public static final int DEFAULT_RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC + = 60000; + + /** * RM proxy users' prefix */ public static final String RM_PROXY_USER_PREFIX = RM_PREFIX + "proxyuser."; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 55abd12..20e478c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1694,4 +1694,17 @@ yarn.nodemanager.webapp.cross-origin.enabled false + + + + The least amount of time(msec.) an inactive (decommissioned or shutdown) node can + stay in the nodes list of the resourcemanager after being declared untracked. + A node is marked untracked if and only if it is absent from both include and + exclude nodemanager lists on the RM. All inactive nodes are checked twice per + timeout interval or every 10 minutes, whichever is lesser, and marked appropriately. + The same is done when refreshNodes command (graceful or otherwise) is invoked. + + yarn.resourcemanager.node-removal-untracked.timeout-ms + 60000 + diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java index 46fb3af..30c02e6 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java @@ -21,8 +21,11 @@ import java.io.IOException; import java.util.Collection; import java.util.Collections; +import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import java.util.Timer; +import java.util.TimerTask; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -31,6 +34,7 @@ import org.apache.hadoop.net.Node; import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.HostsFileReader; +import org.apache.hadoop.util.Time; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.EventHandler; @@ -51,6 +55,7 @@ EventHandler { private static final Log LOG = LogFactory.getLog(NodesListManager.class); + public static final int MAX_REMOVAL_CHECK_INTERVAL = 600000; private HostsFileReader hostsReader; private Configuration conf; @@ -61,6 +66,8 @@ private String includesFile; private String excludesFile; + private Timer removalTimer; + private int nodeRemovalCheckInterval; public NodesListManager(RMContext rmContext) { super(NodesListManager.class.getName()); @@ -87,9 +94,52 @@ protected void serviceInit(Configuration conf) throws Exception { } catch (IOException ioe) { disableHostsFileReader(ioe); } + + final int nodeRemovalTimeout = + conf.getInt( + YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, + YarnConfiguration. + DEFAULT_RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC); + nodeRemovalCheckInterval = (Math.min(nodeRemovalTimeout/ 2, + MAX_REMOVAL_CHECK_INTERVAL)); + removalTimer = new Timer("Node Removal Timer"); + + removalTimer.schedule(new TimerTask() { + @Override + public void run() { + long now = Time.monotonicNow(); + for (Map.Entry entry + : rmContext.getInactiveRMNodes().entrySet()) { + String nodeId = entry.getKey(); + RMNode rmNode = entry.getValue(); + if (isUntrackedNode(rmNode.getHostName())) { + if (rmNode.getUntrackedTimeStamp() == 0) { + rmNode.setUntrackedTimeStamp(now); + } else if (now - rmNode.getUntrackedTimeStamp() + > nodeRemovalTimeout) { + RMNode result = rmContext.getInactiveRMNodes().remove(nodeId); + if (result != null) { + ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics(); + clusterMetrics.decrDecommisionedNMs(); + LOG.info("Removed " + result.getHostName() + + " from inactive nodes list"); + } + } + } else { + rmNode.setUntrackedTimeStamp(0); + } + } + } + }, nodeRemovalCheckInterval, nodeRemovalCheckInterval); + super.serviceInit(conf); } + @Override + protected void serviceStop() { + removalTimer.cancel(); + } + private void printConfiguredHosts() { if (!LOG.isDebugEnabled()) { return; @@ -135,6 +185,7 @@ public void refreshNodes(Configuration yarnConf) throws IOException, new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION)); } } + updateInactiveNodes(); } private void setDecomissionedNMs() { @@ -287,6 +338,42 @@ protected void build() { } } + public final int getNodeRemovalCheckInterval() { + return nodeRemovalCheckInterval; + } + + public final boolean isUntrackedNode(final String hostName) { + boolean untracked; + String ip = NetUtils.normalizeHostName(hostName); + + synchronized (hostsReader) { + Set hostsList = hostsReader.getHosts(); + Set excludeList = hostsReader.getExcludedHosts(); + untracked = !hostsList.isEmpty() + && !hostsList.contains(hostName) && !hostsList.contains(ip) + && !excludeList.contains(hostName) && !excludeList.contains(ip); + } + return untracked; + } + + @VisibleForTesting + public void setNodeRemovalCheckInterval(final int interval) { + this.nodeRemovalCheckInterval = interval; + } + + private void updateInactiveNodes() { + long now = Time.monotonicNow(); + for (Map.Entry entry + : rmContext.getInactiveRMNodes().entrySet()) { + String nodeId = entry.getKey(); + RMNode rmNode = entry.getValue(); + if (isUntrackedNode(nodeId) + && rmNode.getUntrackedTimeStamp() == 0) { + rmNode.setUntrackedTimeStamp(now); + } + } + } + /** * A Node instance needed upon startup for populating RMNode Map. * It only knows its hostname/ip. diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java index fd17153..9b8adaf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java @@ -79,7 +79,7 @@ acceptedStates.contains(NodeState.LOST) || acceptedStates.contains(NodeState.REBOOTED)) { for (RMNode rmNode : context.getInactiveRMNodes().values()) { - if (acceptedStates.contains(rmNode.getState())) { + if ((rmNode != null) && acceptedStates.contains(rmNode.getState())) { results.add(rmNode); } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index b859a7c..818585a 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ -273,7 +273,8 @@ public RegisterNodeManagerResponse registerNodeManager( } // Check if this node is a 'valid' node - if (!this.nodesListManager.isValidNode(host)) { + if (!this.nodesListManager.isValidNode(host) + || this.nodesListManager.isUntrackedNode(host)) { String message = "Disallowed NodeManager from " + host + ", Sending SHUTDOWN signal to the NodeManager."; @@ -366,7 +367,8 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) NodeId nodeId = remoteNodeStatus.getNodeId(); // 1. Check if it's a valid (i.e. not excluded) node - if (!this.nodesListManager.isValidNode(nodeId.getHost())) { + if (!this.nodesListManager.isValidNode(nodeId.getHost()) + || this.nodesListManager.isUntrackedNode(nodeId.getHost())) { String message = "Disallowed NodeManager nodeId: " + nodeId + " hostname: " + nodeId.getHost(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java index ed6875b..95ba1e4 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java @@ -147,4 +147,8 @@ * @return labels in this node */ public Set getNodeLabels(); + + long getUntrackedTimeStamp(); + + void setUntrackedTimeStamp(long timer); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index 3210524..868d5b9 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -35,6 +35,7 @@ import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.net.Node; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.util.Time; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerState; @@ -105,6 +106,7 @@ private String healthReport; private long lastHealthReportTime; private String nodeManagerVersion; + private long timeStamp; /* set of containers that have just launched */ private final Set launchedContainers = @@ -238,6 +240,7 @@ public RMNodeImpl(NodeId nodeId, RMContext context, String hostName, this.healthReport = "Healthy"; this.lastHealthReportTime = System.currentTimeMillis(); this.nodeManagerVersion = nodeManagerVersion; + this.timeStamp = 0; this.latestNodeHeartBeatResponse.setResponseId(0); @@ -767,7 +770,11 @@ public void transition(RMNodeImpl rmNode, RMNodeEvent event) { LOG.info("Deactivating Node " + rmNode.nodeId + " as it is now " + finalState); rmNode.context.getInactiveRMNodes().put(rmNode.nodeId.getHost(), rmNode); - + if (rmNode.getState() == NodeState.DECOMMISSIONED + && rmNode.context.getNodesListManager().isUntrackedNode( + rmNode.hostName)) { + rmNode.setUntrackedTimeStamp(Time.monotonicNow()); + } //Update the metrics rmNode.updateMetricsForDeactivatedNode(initialState, finalState); } @@ -816,7 +823,7 @@ public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) { // needed for log-aggregation to finish long after the apps are gone. if (UserGroupInformation.isSecurityEnabled()) { rmNode.context.getDelegationTokenRenewer().updateKeepAliveApplications( - statusEvent.getKeepAliveAppIds()); + statusEvent.getKeepAliveAppIds()); } return NodeState.RUNNING; @@ -936,4 +943,13 @@ private void handleContainerStatus(List containerStatuses) { } } + @Override + public final long getUntrackedTimeStamp() { + return this.timeStamp; + } + + @Override + public void setUntrackedTimeStamp(final long ts) { + this.timeStamp = ts; + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java index 003318c..a967005 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java @@ -226,6 +226,15 @@ public long getLastHealthReportTime() { } return CommonNodeLabelsManager.EMPTY_STRING_SET; } + + @Override + public long getUntrackedTimeStamp() { + return 0; + } + + @Override + public void setUntrackedTimeStamp(long timeStamp) { + } }; private static RMNode buildRMNode(int rack, final Resource perNode, diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java index 83a3934..738062b 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java @@ -29,6 +29,8 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.IOUtils; @@ -45,8 +47,6 @@ import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.event.Dispatcher; -import org.apache.hadoop.yarn.event.DrainDispatcher; import org.apache.hadoop.yarn.event.Event; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; @@ -146,6 +146,7 @@ public void testDecommissionWithIncludeHosts() throws Exception { Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); Assert.assertEquals(metricCount, ClusterMetrics.getMetrics() .getNumDecommisionedNMs()); + rm.stop(); } /** @@ -735,6 +736,170 @@ public void testInitDecommMetricNoRegistration() throws Exception { rm1.stop(); } + /** + * Remove a node from all lists and check if its forgotten + */ + @Test + public void testNodeRemoval() throws Exception { + testNodeRemovalUtil(); + } + + public void refreshNodesOption(Configuration conf) + throws Exception { + rm.getNodesListManager().refreshNodes(conf); + } + + public void testNodeRemovalUtil() throws Exception { + Configuration conf = new Configuration(); + int timeoutValue = 500; + File excludeHostFile = new File(TEMP_DIR + File.separator + + "excludeHostFile.txt"); + conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, ""); + conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, ""); + conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, + timeoutValue); + CountDownLatch latch = new CountDownLatch(1); + rm = new MockRM(conf); + rm.init(conf); + rm.start(); + RMContext rmContext = rm.getRMContext(); + refreshNodesOption(conf); + MockNM nm1 = rm.registerNode("host1:1234", 5120); + MockNM nm2 = rm.registerNode("host2:5678", 10240); + MockNM nm3 = rm.registerNode("localhost:4433", 1024); + ClusterMetrics metrics = ClusterMetrics.getMetrics(); + assert (metrics != null); + + //check all 3 nodes joined in as NORMAL + NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + nodeHeartbeat = nm2.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + nodeHeartbeat = nm3.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + rm.drainEvents(); + Assert.assertEquals("All 3 nodes should be active", + metrics.getNumActiveNMs(), 3); + + //Remove nm2 from include list, should now be shutdown with timer test + String ip = NetUtils.normalizeHostName("localhost"); + writeToHostsFile("host1", ip); + conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile + .getAbsolutePath()); + refreshNodesOption(conf); + nm1.nodeHeartbeat(true); + rm.drainEvents(); + Assert.assertTrue("Node should not be in active node list", + !rmContext.getRMNodes().containsKey(nm2.getNodeId())); + + RMNode rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId().getHost()); + Assert.assertEquals("Node should be in inactive node list", + rmNode.getState(), NodeState.DECOMMISSIONED); + Assert.assertEquals("Active nodes should be 2", + metrics.getNumActiveNMs(), 2); + Assert.assertEquals("Shutdown nodes should be 1", + metrics.getNumDecommisionedNMs(), 1); + + int nodeRemovalTimeout = + conf.getInt( + YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, + YarnConfiguration. + DEFAULT_RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC); + int nodeRemovalInterval = + rmContext.getNodesListManager().getNodeRemovalCheckInterval(); + long maxThreadSleeptime = nodeRemovalInterval + nodeRemovalTimeout; + latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS); + + rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId()); + Assert.assertEquals("Node should have been forgotten!", + rmNode, null); + Assert.assertEquals("Shutdown nodes should be 0 now", + metrics.getNumDecommisionedNMs(), 0); + + //Check node removal and re-addition before timer expires + writeToHostsFile("host1", ip, "host2"); + refreshNodesOption(conf); + nm2 = rm.registerNode("host2:5678", 10240); + rm.drainEvents(); + writeToHostsFile("host1", ip); + refreshNodesOption(conf); + rm.drainEvents(); + rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId().getHost()); + Assert.assertEquals("Node should be Decommissioned", + rmNode.getState(), NodeState.DECOMMISSIONED); + Assert.assertEquals("Active nodes should be 2", + metrics.getNumActiveNMs(), 2); + Assert.assertEquals("Shutdown nodes should be 1", + metrics.getNumDecommisionedNMs(), 1); + + //add back the node before timer expires + latch.await(maxThreadSleeptime - 2000, TimeUnit.MILLISECONDS); + writeToHostsFile("host1", ip, "host2"); + refreshNodesOption(conf); + nm2 = rm.registerNode("host2:5678", 10240); + nodeHeartbeat = nm2.nodeHeartbeat(true); + rm.drainEvents(); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + Assert.assertEquals("Shutdown nodes should be 0 now", + metrics.getNumDecommisionedNMs(), 0); + Assert.assertEquals("All 3 nodes should be active", + metrics.getNumActiveNMs(), 3); + + //Decommission this node, check timer doesn't remove it + writeToHostsFile("host1", "host2", ip); + writeToHostsFile(excludeHostFile, "host2"); + conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile + .getAbsolutePath()); + refreshNodesOption(conf); + rm.drainEvents(); + rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId().getHost()); + Assert.assertTrue("Node should be DECOMMISSIONED", + (rmNode.getState() == NodeState.DECOMMISSIONED)); + if (rmNode.getState() == NodeState.DECOMMISSIONED) { + Assert.assertEquals("Decommissioned/ing nodes should be 1 now", + metrics.getNumDecommisionedNMs(), 1); + } + latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS); + + rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId().getHost()); + Assert.assertTrue("Node should be DECOMMISSIONED", + (rmNode.getState() == NodeState.DECOMMISSIONED)); + if (rmNode.getState() == NodeState.DECOMMISSIONED) { + Assert.assertEquals("Decommissioned/ing nodes should be 1 now", + metrics.getNumDecommisionedNMs(), 1); + } + + //Test decommed/ing node that transitions to untracked,timer should remove + writeToHostsFile("host1", ip, "host2"); + writeToHostsFile(excludeHostFile, "host2"); + refreshNodesOption(conf); + nm1.nodeHeartbeat(true); + //nm2.nodeHeartbeat(true); + nm3.nodeHeartbeat(true); + latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS); + rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId().getHost()); + Assert.assertNotEquals("Timer for this node was not canceled!", + rmNode, null); + Assert.assertTrue("Node should be DECOMMISSIONED", + (rmNode.getState() == NodeState.DECOMMISSIONED)); + + writeToHostsFile("host1", ip); + writeToHostsFile(excludeHostFile, ""); + refreshNodesOption(conf); + latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS); + rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId().getHost()); + Assert.assertEquals("Node should have been forgotten!", + rmNode, null); + Assert.assertEquals("Shutdown nodes should be 0 now", + metrics.getNumDecommisionedNMs(), 0); + Assert.assertEquals("Shutdown nodes should be 0 now", + metrics.getNumDecommisionedNMs(), 0); + Assert.assertEquals("Active nodes should be 2", + metrics.getNumActiveNMs(), 2); + + rm.stop(); + } + private void writeToHostsFile(String... hosts) throws IOException { writeToHostsFile(hostFile, hosts); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java index c837450..d5616e2 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java @@ -133,7 +133,6 @@ public void testNMExpiry() throws Exception { String hostname2 = "localhost2"; String hostname3 = "localhost3"; Resource capability = BuilderUtils.newResource(1024, 1); - RegisterNodeManagerRequest request1 = recordFactory .newRecordInstance(RegisterNodeManagerRequest.class); NodeId nodeId1 = NodeId.newInstance(hostname1, 0); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java index f507e17..6dc92b3 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java @@ -267,8 +267,10 @@ public void testNodesQueryStateLost() throws JSONException, Exception { RMNode rmNode = rm.getRMContext().getInactiveRMNodes().get(host); WebServicesTestUtils.checkStringMatch("nodeHTTPAddress", "", info.getString("nodeHTTPAddress")); - WebServicesTestUtils.checkStringMatch("state", rmNode.getState() - .toString(), info.getString("state")); + if (rmNode != null) { + WebServicesTestUtils.checkStringMatch("state", + rmNode.getState().toString(), info.getString("state")); + } } } @@ -298,8 +300,10 @@ public void testSingleNodeQueryStateLost() throws JSONException, Exception { RMNode rmNode = rm.getRMContext().getInactiveRMNodes().get("h2"); WebServicesTestUtils.checkStringMatch("nodeHTTPAddress", "", info.getString("nodeHTTPAddress")); - WebServicesTestUtils.checkStringMatch("state", - rmNode.getState().toString(), info.getString("state")); + if (rmNode != null) { + WebServicesTestUtils.checkStringMatch("state", + rmNode.getState().toString(), info.getString("state")); + } } @Test