diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java index d7797cc..90d7b51 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java @@ -121,7 +121,6 @@ public void refreshNodes(Configuration yarnConf) throws IOException, this.conf, includesFile), excludesFile.isEmpty() ? null : this.rmContext.getConfigurationProvider() .getConfigurationInputStream(this.conf, excludesFile)); - setDecomissionedNMsMetrics(); printConfiguredHosts(); } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index f0ae826..1123a98 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -460,22 +460,9 @@ private void updateMetricsForDeactivatedNode(NodeState initialState, break; } - // Decomissioned NMs equals to the nodes missing in include list (if - // include list not empty) or the nodes listed in excluded list. - // DecomissionedNMs as per exclude list is set upfront when the - // exclude list is read so that RM restart can also reflect the - // decomissionedNMs. Note that RM is still not able to know decomissionedNMs - // as per include list after it restarts as they are known when those nodes - // come for registration. - // DecomissionedNMs as per include list is incremented in this transition. switch (finalState) { case DECOMMISSIONED: - Set ecludedHosts = - context.getNodesListManager().getHostsReader().getExcludedHosts(); - if (!ecludedHosts.contains(hostName) - && !ecludedHosts.contains(NetUtils.normalizeHostName(hostName))) { metrics.incrDecommisionedNMs(); - } break; case LOST: metrics.incrNumLostNMs(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java index caa5647..0b0a7cb 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java @@ -77,6 +77,8 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.YarnApplicationState; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.event.DrainDispatcher; import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; import org.apache.hadoop.yarn.security.client.RMDelegationTokenIdentifier; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; @@ -1833,10 +1835,16 @@ public void testDecomissionedNMsMetricsOnRMRestart() throws Exception { conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath()); writeToHostsFile(""); - MockRM rm1 = new MockRM(conf); + final DrainDispatcher dispatcher = new DrainDispatcher(); + MockRM rm1 = new MockRM(conf) { + @Override + protected Dispatcher createDispatcher() { + return dispatcher; + } + }; rm1.start(); - rm1.registerNode("localhost:1234", 8000); - rm1.registerNode("host2:1234", 8000); + MockNM nm1 = rm1.registerNode("localhost:1234", 8000); + MockNM nm2 = rm1.registerNode("host2:1234", 8000); Assert .assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); String ip = NetUtils.normalizeHostName("localhost"); @@ -1845,6 +1853,15 @@ public void testDecomissionedNMsMetricsOnRMRestart() throws Exception { // refresh nodes rm1.getNodesListManager().refreshNodes(conf); + + NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); + Assert + .assertTrue(NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); + nodeHeartbeat = nm2.nodeHeartbeat(true); + Assert.assertTrue("The decommisioned metrics are not updated", + NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); + + dispatcher.await(); Assert .assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java index 877a122..3733aee 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java @@ -157,25 +157,32 @@ public void testDecommissionWithExcludeHosts() throws Exception { .getAbsolutePath()); writeToHostsFile(""); - rm = new MockRM(conf); + final DrainDispatcher dispatcher = new DrainDispatcher(); + rm = new MockRM(conf) { + @Override + protected Dispatcher createDispatcher() { + return dispatcher; + } + }; rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); MockNM nm3 = rm.registerNode("localhost:4433", 1024); + dispatcher.await(); int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + dispatcher.await(); // To test that IPs also work String ip = NetUtils.normalizeHostName("localhost"); writeToHostsFile("host2", ip); rm.getNodesListManager().refreshNodes(conf); - checkDecommissionedNMCount(rm, metricCount + 2); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); @@ -186,6 +193,20 @@ public void testDecommissionWithExcludeHosts() throws Exception { nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue("The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); + dispatcher.await(); + checkDecommissionedNMCount(rm, metricCount + 2); + + writeToHostsFile(""); + rm.getNodesListManager().refreshNodes(conf); + + nm3 = rm.registerNode("localhost:4433", 1024); + dispatcher.await(); + nodeHeartbeat = nm3.nodeHeartbeat(true); + dispatcher.await(); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + // decommissined node is 1 since 1 node is rejoined after updating exclude + // file + checkDecommissionedNMCount(rm, metricCount + 1); } /**