diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java index 5c94ef4..942ec81 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java @@ -91,7 +91,11 @@ public int getNumDecommisionedNMs() { public void incrDecommisionedNMs() { numDecommissionedNMs.incr(); } - + + public void setDecommisionedNMs(int num) { + numDecommissionedNMs.set(num); + } + public void decrDecommisionedNMs() { numDecommissionedNMs.decr(); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java index 4249980..bdb4fc5 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java @@ -70,6 +70,7 @@ protected void serviceInit(Configuration conf) throws Exception { conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH) ); + setDecomissionedNMsMetrics(); printConfiguredHosts(); } catch (IOException ioe) { LOG.warn("Failed to init hostsReader, disabling", ioe); @@ -114,10 +115,16 @@ public void refreshNodes(Configuration yarnConf) throws IOException { YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH)); hostsReader.refresh(); + setDecomissionedNMsMetrics(); printConfiguredHosts(); } } + private void setDecomissionedNMsMetrics() { + Set excludeList = hostsReader.getExcludedHosts(); + ClusterMetrics.getMetrics().setDecommisionedNMs(excludeList.size()); + } + public boolean isValidNode(String hostName) { synchronized (hostsReader) { Set hostsList = hostsReader.getHosts(); @@ -140,6 +147,10 @@ public int getUnusableNodes(Collection unUsableNodes) { return unusableRMNodesConcurrentSet.size(); } + public HostsFileReader getHostsReader() { + return hostsReader; + } + @Override public void handle(NodesListManagerEvent event) { RMNode eventNode = event.getNode(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index 52bc285..91b345d 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -424,9 +424,19 @@ private void updateMetricsForDeactivatedNode(NodeState initialState, break; } + // Decomissioned NMs equals to the nodes missing in include list (if + // include list not empty) or the nodes listed in excluded list. + // Decomissioned node metrics as per exclude list is set upfront when the + // exclude list is read so that RM restart can also reflect the + // decomissioned metrics. + // Decomissioned node metrics as per include list is incremented in this + // transition. switch (finalState) { case DECOMMISSIONED: - metrics.incrDecommisionedNMs(); + if (!context.getNodesListManager().getHostsReader().getExcludedHosts() + .contains(hostName)) { + metrics.incrDecommisionedNMs(); + } break; case LOST: metrics.incrNumLostNMs(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java index d50f0d7..3dfe01b 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java @@ -23,6 +23,8 @@ import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.net.InetSocketAddress; import java.net.UnknownHostException; @@ -38,7 +40,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Text; +import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; import org.apache.hadoop.security.SecurityUtil; @@ -85,6 +89,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.log4j.Level; @@ -96,6 +102,9 @@ public class TestRMRestart { + private final static File TEMP_DIR = new File(System.getProperty( + "test.build.data", "/tmp"), "decommision"); + private File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt"); private YarnConfiguration conf; // Fake rmAddr for token-renewal @@ -1666,6 +1675,48 @@ private void assertQueueMetrics(QueueMetrics qm, int appsSubmitted, appsCompleted + appsCompletedCarryOn); } + @Test + public void testDecomissionedNMsMetricsOnRMRestart() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile + .getAbsolutePath()); + writeToHostsFile(""); + MockRM rm1 = new MockRM(conf); + rm1.start(); + Assert + .assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); + String ip = NetUtils.normalizeHostName("localhost"); + // Add hosts to exclude list. + writeToHostsFile("host2", ip); + MockRM rm2 = new MockRM(conf); + // restart RM. + rm2.start(); + Assert + .assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); + rm1.stop(); + rm2.stop(); + } + + private void writeToHostsFile(String... hosts) throws IOException { + if (!hostFile.exists()) { + TEMP_DIR.mkdirs(); + hostFile.createNewFile(); + } + FileOutputStream fStream = null; + try { + fStream = new FileOutputStream(hostFile); + for (int i = 0; i < hosts.length; i++) { + fStream.write(hosts[i].getBytes()); + fStream.write("\n".getBytes()); + } + } finally { + if (fStream != null) { + IOUtils.closeStream(fStream); + fStream = null; + } + } + } + public class TestMemoryRMStateStore extends MemoryRMStateStore { int count = 0; public int updateApp = 0; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java index abb21ed..803e95f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java @@ -150,7 +150,6 @@ public void testDecommissionWithExcludeHosts() throws Exception { MockNM nm3 = rm.registerNode("localhost:4433", 1024); int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); - NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); @@ -161,18 +160,17 @@ public void testDecommissionWithExcludeHosts() throws Exception { writeToHostsFile("host2", ip); rm.getNodesListManager().refreshNodes(conf); + checkDecommissionedNMCount(rm, metricCount + 2); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue("The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); - checkDecommissionedNMCount(rm, ++metricCount); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue("The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); - checkDecommissionedNMCount(rm, ++metricCount); } /**