diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java index 647dfa333fe..8b58a10fb82 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java @@ -114,6 +114,7 @@ protected void serviceInit(Configuration conf) throws Exception { this.hostsReader = createHostsFileReader(this.includesFile, this.excludesFile); setDecomissionedNMs(); + setUnregisteredNMs(); printConfiguredHosts(); } catch (YarnException ex) { disableHostsFileReader(ex); @@ -257,6 +258,20 @@ private void setDecomissionedNMs() { } } + private void setUnregisteredNMs() { + Set<String> includeList = hostsReader.getHosts(); + for (final String host : includeList) { + if (!hostsReader.getExcludedHosts().contains(host)) { + NodeId nodeId = createUnknownNodeId(host); + RMNodeImpl rmNode = new RMNodeImpl(nodeId, + rmContext, host, -1, -1, new UnknownNode(host), + Resource.newInstance(0, 0), "unknown"); + rmContext.getInactiveRMNodes().put(nodeId, rmNode); + rmNode.handle(new RMNodeEvent(nodeId, RMNodeEventType.SHUTDOWN)); + } + } + } + // Handle excluded nodes based on following rules: // Recommission DECOMMISSIONED or DECOMMISSIONING nodes no longer excluded; // Gracefully decommission excluded nodes that are not already @@ -527,6 +542,7 @@ private void disableHostsFileReader(Exception ex) { this.hostsReader = createHostsFileReader(this.includesFile, this.excludesFile); setDecomissionedNMs(); + setUnregisteredNMs(); } catch (IOException ioe2) 
{ // Should *never* happen this.hostsReader = null; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index d33ee44de4d..0d03e1b1b90 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -214,6 +214,9 @@ .addTransition(NodeState.NEW, NodeState.DECOMMISSIONED, RMNodeEventType.DECOMMISSION, new DeactivateNodeTransition(NodeState.DECOMMISSIONED)) + .addTransition(NodeState.NEW, NodeState.SHUTDOWN, + RMNodeEventType.SHUTDOWN, + new DeactivateNodeTransition(NodeState.SHUTDOWN)) //Transitions from RUNNING state .addTransition(NodeState.RUNNING, @@ -362,6 +365,8 @@ .addTransition(NodeState.SHUTDOWN, NodeState.SHUTDOWN, RMNodeEventType.RESOURCE_UPDATE, new UpdateNodeResourceWhenUnusableTransition()) + .addTransition(NodeState.SHUTDOWN, NodeState.RUNNING, + RMNodeEventType.STARTED, new AddNodeTransition()) .addTransition(NodeState.SHUTDOWN, NodeState.SHUTDOWN, RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM, new AddContainersToBeRemovedFromNMTransition()) @@ -853,7 +858,11 @@ public void transition(RMNodeImpl rmNode, RMNodeEvent event) { previousRMNode = rmNode.context.getInactiveRMNodes().remove(unknownNodeId); if (previousRMNode != null) { - ClusterMetrics.getMetrics().decrDecommisionedNMs(); + if (previousRMNode.getState() == NodeState.SHUTDOWN) { + ClusterMetrics.getMetrics().decrNumShutdownNMs(); + } else { + ClusterMetrics.getMetrics().decrDecommisionedNMs(); + } } // 
Increment activeNodes explicitly because this is a new node. ClusterMetrics.getMetrics().incrNumActiveNodes(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java index 78828f2b823..1e99cdc50de 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java @@ -236,6 +236,64 @@ public void testDecommissionWithIncludeHosts() throws Exception { rm.stop(); } + @Test + public void testIncludeHostsWithNoRegister() throws Exception { + + writeToHostsFile("localhost", "host1", "host2"); + Configuration conf = new Configuration(); + conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile + .getAbsolutePath()); + + rm = new MockRM(conf); + rm.start(); + rm.drainEvents(); + + Assert.assertEquals("NEW nodes should be in SHUTDOWN State first!", + 3, ClusterMetrics.getMetrics().getNumShutdownNMs()); + MockNM nm1 = rm.registerNode("host1:1234", 5120); + MockNM nm2 = rm.registerNode("host2:5678", 10240); + MockNM nm3 = rm.registerNode("localhost:4433", 1024); + + ClusterMetrics metrics = ClusterMetrics.getMetrics(); + Assert.assertNotNull(metrics); + int metricCount = metrics.getNumDecommisionedNMs(); + + NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + nodeHeartbeat = nm2.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + 
nodeHeartbeat = nm3.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + Assert.assertEquals( + "NEW/SHUTDOWN nodes should be in RUNNING State after registration!", + 3, ClusterMetrics.getMetrics().getNumActiveNMs()); + Assert.assertEquals( + "There should be no SHUTDOWN nodes", + 0, ClusterMetrics.getMetrics().getNumShutdownNMs()); + // To test that IPs also work + String ip = NetUtils.normalizeHostName("localhost"); + writeToHostsFile("host1", ip); + + rm.getNodesListManager().refreshNodes(conf); + + checkShutdownNMCount(rm, ++metricCount); + + nodeHeartbeat = nm1.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + Assert + .assertEquals(1, ClusterMetrics.getMetrics().getNumShutdownNMs()); + + nodeHeartbeat = nm2.nodeHeartbeat(true); + Assert.assertTrue("Node is not decommissioned.", NodeAction.SHUTDOWN + .equals(nodeHeartbeat.getNodeAction())); + + nodeHeartbeat = nm3.nodeHeartbeat(true); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); + Assert.assertEquals(metricCount, ClusterMetrics.getMetrics() + .getNumShutdownNMs()); + rm.stop(); + } + /** * Decommissioning using a pre-configured exclude hosts file */