diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/NodeState.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/NodeState.java index ff1ca48..c9b110c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/NodeState.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/NodeState.java @@ -37,13 +37,16 @@ UNHEALTHY, /** Node is out of service */ - DECOMMISSIONED, + DECOMMISSIONED, /** Node has not sent a heartbeat for some configured time threshold*/ LOST, /** Node has rebooted */ - REBOOTED; + REBOOTED, + + /** Node decommission is in progress */ + DECOMMISSION_IN_PROGRESS; public boolean isUnusable() { return (this == UNHEALTHY || this == DECOMMISSIONED || this == LOST); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RefreshNodesRequest.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RefreshNodesRequest.java index 66f0605..7350280 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RefreshNodesRequest.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RefreshNodesRequest.java @@ -20,6 +20,7 @@ import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceAudience.Public; +import org.apache.hadoop.classification.InterfaceStability.Evolving; import org.apache.hadoop.classification.InterfaceStability.Stable; import org.apache.hadoop.yarn.util.Records; @@ -32,4 +33,16 @@ public static RefreshNodesRequest newInstance() { RefreshNodesRequest request = Records.newRecord(RefreshNodesRequest.class); return request; } + + @Public + @Evolving + public static RefreshNodesRequest newInstance(long timeout) { + RefreshNodesRequest request = Records.newRecord(RefreshNodesRequest.class); + request.setTimeout(timeout); + return request; + } + + public abstract long getTimeout(); + + public abstract void setTimeout(long timeout); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto index 900e349..f3af1eb 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto @@ -36,6 +36,7 @@ message RefreshQueuesResponseProto { } message RefreshNodesRequestProto { + optional int64 timeout = 1; } message RefreshNodesResponseProto { } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto index 4e29d2f..61e039e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto @@ -213,6 +213,7 @@ enum NodeStateProto { NS_DECOMMISSIONED = 4; NS_LOST = 5; NS_REBOOTED = 6; + NS_DECOMMISSION_IN_PROGRESS = 7; } message NodeIdProto { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java index 4642add..68ec376 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java @@ -75,6 +75,8 @@ "No cluster node-labels are specified"; private static final String NO_MAPPING_ERR_MSG = "No node-to-labels mappings are specified"; + private static final String INVALID_TIMEOUT_ERR_MSG = + "Invalid timeout specified : "; protected final static Map ADMIN_USAGE = ImmutableMap.builder() @@ -82,7 +84,7 @@ "Reload the queues' acls, states and scheduler specific " + "properties. \n\t\tResourceManager will reload the " + "mapred-queues configuration file.")) - .put("-refreshNodes", new UsageInfo("", + .put("-refreshNodes", new UsageInfo("[-g [timeout in ms]]", "Refresh the hosts information at the ResourceManager.")) .put("-refreshSuperUserGroupsConfiguration", new UsageInfo("", "Refresh superuser proxy groups mappings")) @@ -201,7 +203,7 @@ private static void printHelp(String cmd, boolean isHAEnabled) { summary.append("The full syntax is: \n\n" + "yarn rmadmin" + " [-refreshQueues]" + - " [-refreshNodes]" + + " [-refreshNodes [-g [timeout in ms]]]" + " [-refreshSuperUserGroupsConfiguration]" + " [-refreshUserToGroupsMappings]" + " [-refreshAdminAcls]" + @@ -271,11 +273,12 @@ private int refreshQueues() throws IOException, YarnException { return 0; } - private int refreshNodes() throws IOException, YarnException { + private int refreshNodes(long timeout) throws IOException, YarnException { // Refresh the nodes ResourceManagerAdministrationProtocol adminProtocol = createAdminProtocol(); RefreshNodesRequest request = recordFactory.newRecordInstance(RefreshNodesRequest.class); + request.setTimeout(timeout); adminProtocol.refreshNodes(request); return 0; } @@ -517,7 +520,7 @@ public int run(String[] args) throws Exception { // verify that we have enough command line parameters // if ("-refreshAdminAcls".equals(cmd) || "-refreshQueues".equals(cmd) || - "-refreshNodes".equals(cmd) || "-refreshServiceAcl".equals(cmd) || + "-refreshServiceAcl".equals(cmd) || "-refreshUserToGroupsMappings".equals(cmd) || "-refreshSuperUserGroupsConfiguration".equals(cmd)) { if (args.length != 1) { @@ -530,7 +533,21 @@ public int run(String[] args) throws Exception { if ("-refreshQueues".equals(cmd)) { exitCode = refreshQueues(); } else if ("-refreshNodes".equals(cmd)) { - exitCode = refreshNodes(); + if (args.length == 1) { + exitCode = refreshNodes(0); + } else if (args.length == 3) { + // if the graceful timeout specified + if ("-g".equals(args[1])) { + long timeout = validateTimeout(args[2]); + exitCode = refreshNodes(timeout); + } else { + printUsage(cmd, isHAEnabled); + return -1; + } + } else { + printUsage(cmd, isHAEnabled); + return -1; + } } else if ("-refreshUserToGroupsMappings".equals(cmd)) { exitCode = refreshUserToGroupsMappings(); } else if ("-refreshSuperUserGroupsConfiguration".equals(cmd)) { @@ -598,6 +615,19 @@ public int run(String[] args) throws Exception { return exitCode; } + private long validateTimeout(String strTimeout) { + long timeout; + try { + timeout = Long.parseLong(strTimeout); + } catch (NumberFormatException ex) { + throw new IllegalArgumentException(INVALID_TIMEOUT_ERR_MSG + strTimeout); + } + if (timeout < 0) { + throw new IllegalArgumentException(INVALID_TIMEOUT_ERR_MSG + timeout); + } + return timeout; + } + @Override public void setConf(Configuration conf) { if (conf != null) { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRMAdminCLI.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRMAdminCLI.java index c22494c..fec933e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRMAdminCLI.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRMAdminCLI.java @@ -176,6 +176,25 @@ public void testRefreshNodes() throws Exception { verify(admin).refreshNodes(any(RefreshNodesRequest.class)); } + @Test(timeout = 500) + public void testRefreshNodesWithGracefulTimeout() throws Exception { + String[] args = { "-refreshNodes", "-g", "1000" }; + assertEquals(0, rmAdminCLI.run(args)); + verify(admin).refreshNodes(any(RefreshNodesRequest.class)); + + //invalid graceful timeout parameter + String[] invalidArgs = { "-refreshNodes", "-ginvalid", "invalid" }; + assertEquals(-1, rmAdminCLI.run(invalidArgs)); + + //invalid timeout + String[] invalidTimeoutArgs = { "-refreshNodes", "-g", "invalid" }; + assertEquals(-1, rmAdminCLI.run(invalidTimeoutArgs)); + + // negative timeout + String[] negativeTimeoutArgs = { "-refreshNodes", "-g", "-1000" }; + assertEquals(-1, rmAdminCLI.run(negativeTimeoutArgs)); + } + @Test(timeout=500) public void testGetGroups() throws Exception { when(admin.getGroupsForUser(eq("admin"))).thenReturn( @@ -284,7 +303,7 @@ public void testHelp() throws Exception { assertTrue(dataOut .toString() .contains( - "yarn rmadmin [-refreshQueues] [-refreshNodes] [-refreshSuper" + + "yarn rmadmin [-refreshQueues] [-refreshNodes [-g [timeout in ms]]] [-refreshSuper" + "UserGroupsConfiguration] [-refreshUserToGroupsMappings] " + "[-refreshAdminAcls] [-refreshServiceAcl] [-getGroup" + " [username]] [[-addToClusterNodeLabels [label1,label2,label3]]" + @@ -299,7 +318,7 @@ public void testHelp() throws Exception { assertTrue(dataOut .toString() .contains( - "-refreshNodes: Refresh the hosts information at the " + + "-refreshNodes [-g [timeout in ms]]: Refresh the hosts information at the " + "ResourceManager.")); assertTrue(dataOut.toString().contains( "-refreshUserToGroupsMappings: Refresh user-to-groups mappings")); @@ -327,7 +346,7 @@ public void testHelp() throws Exception { testError(new String[] { "-help", "-refreshQueues" }, "Usage: yarn rmadmin [-refreshQueues]", dataErr, 0); testError(new String[] { "-help", "-refreshNodes" }, - "Usage: yarn rmadmin [-refreshNodes]", dataErr, 0); + "Usage: yarn rmadmin [-refreshNodes [-g [timeout in ms]]]", dataErr, 0); testError(new String[] { "-help", "-refreshUserToGroupsMappings" }, "Usage: yarn rmadmin [-refreshUserToGroupsMappings]", dataErr, 0); testError( @@ -364,7 +383,7 @@ public void testHelp() throws Exception { assertEquals(0, rmAdminCLIWithHAEnabled.run(args)); oldOutPrintStream.println(dataOut); String expectedHelpMsg = - "yarn rmadmin [-refreshQueues] [-refreshNodes] [-refreshSuper" + + "yarn rmadmin [-refreshQueues] [-refreshNodes [-g [timeout in ms]]] [-refreshSuper" + "UserGroupsConfiguration] [-refreshUserToGroupsMappings] " + "[-refreshAdminAcls] [-refreshServiceAcl] [-getGroup" + " [username]] [[-addToClusterNodeLabels [label1,label2,label3]]" + diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RefreshNodesRequestPBImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RefreshNodesRequestPBImpl.java index 2cea95a..96900e8 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RefreshNodesRequestPBImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RefreshNodesRequestPBImpl.java @@ -49,6 +49,16 @@ public RefreshNodesRequestProto getProto() { } @Override + public long getTimeout() { + return getProto().getTimeout(); + } + + @Override + public void setTimeout(long timeout) { + builder.setTimeout(timeout); + } + + @Override public int hashCode() { return getProto().hashCode(); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java index 6180995..343e3c8 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java @@ -381,7 +381,12 @@ public RefreshNodesResponse refreshNodes(RefreshNodesRequest request) Configuration conf = getConfiguration(new Configuration(false), YarnConfiguration.YARN_SITE_CONFIGURATION_FILE); - rmContext.getNodesListManager().refreshNodes(conf); + long decommissionTimeout = request.getTimeout(); + if (decommissionTimeout > 0) { + rmContext.getNodesListManager().refreshNodes(conf, decommissionTimeout); + } else { + rmContext.getNodesListManager().refreshNodes(conf); + } RMAuditLogger.logSuccess(user.getShortUserName(), argName, "AdminService"); return recordFactory.newRecordInstance(RefreshNodesResponse.class); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java index 786bf8c..022092f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java @@ -107,6 +107,30 @@ private void printConfiguredHosts() { public void refreshNodes(Configuration yarnConf) throws IOException, YarnException { + refreshHostsReader(yarnConf); + for (NodeId nodeId: rmContext.getRMNodes().keySet()) { + if (!isValidNode(nodeId.getHost())) { + this.rmContext.getDispatcher().getEventHandler().handle( + new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION)); + } + } + } + + public void refreshNodes(Configuration yarnConf, long timeout) + throws IOException, YarnException { + refreshHostsReader(yarnConf); + for (NodeId nodeId : rmContext.getRMNodes().keySet()) { + if (!isValidNode(nodeId.getHost())) { + RMNodeEvent event = new RMNodeEvent(nodeId, + RMNodeEventType.DECOMMISSION_WITH_TIMEOUT); + event.setTimeout(timeout); + this.rmContext.getDispatcher().getEventHandler().handle(event); + } + } + } + + private void refreshHostsReader(Configuration yarnConf) throws IOException, + YarnException { synchronized (hostsReader) { if (null == yarnConf) { yarnConf = new YarnConfiguration(); @@ -126,13 +150,6 @@ public void refreshNodes(Configuration yarnConf) throws IOException, .getConfigurationInputStream(this.conf, excludesFile)); printConfiguredHosts(); } - - for (NodeId nodeId: rmContext.getRMNodes().keySet()) { - if (!isValidNode(nodeId.getHost())) { - this.rmContext.getDispatcher().getEventHandler().handle( - new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION)); - } - } } private void setDecomissionedNMsMetrics() { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEvent.java index 9ecb366..90c37cd 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEvent.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEvent.java @@ -24,6 +24,7 @@ public class RMNodeEvent extends AbstractEvent { private final NodeId nodeId; + private long timeout; public RMNodeEvent(NodeId nodeId, RMNodeEventType type) { super(type); @@ -33,4 +34,12 @@ public RMNodeEvent(NodeId nodeId, RMNodeEventType type) { public NodeId getNodeId() { return this.nodeId; } + + public void setTimeout(long timeout) { + this.timeout = timeout; + } + + public long getTimeout() { + return timeout; + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEventType.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEventType.java index b4d0b8b..8340423 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEventType.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeEventType.java @@ -24,6 +24,7 @@ // Source: AdminService DECOMMISSION, + DECOMMISSION_WITH_TIMEOUT, // Source: AdminService, ResourceTrackerService RESOURCE_UPDATE, diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index c556b80..30b3aa2 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -149,6 +149,9 @@ .addTransition(NodeState.RUNNING, NodeState.DECOMMISSIONED, RMNodeEventType.DECOMMISSION, new DeactivateNodeTransition(NodeState.DECOMMISSIONED)) + .addTransition(NodeState.RUNNING, NodeState.DECOMMISSION_IN_PROGRESS, + RMNodeEventType.DECOMMISSION_WITH_TIMEOUT, + new DecommissionInProgressNodeTransition()) .addTransition(NodeState.RUNNING, NodeState.LOST, RMNodeEventType.EXPIRE, new DeactivateNodeTransition(NodeState.LOST)) @@ -740,6 +743,15 @@ public void transition(RMNodeImpl rmNode, RMNodeEvent event) { } } + public static class DecommissionInProgressNodeTransition implements + SingleArcTransition { + + @Override + public void transition(RMNodeImpl operand, RMNodeEvent event) { + + } + } + public static class StatusUpdateWhenHealthyTransition implements MultipleArcTransition { @Override diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java index 62713cf..3d675dc 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java @@ -40,7 +40,7 @@ public class TestNodesPage { final int numberOfRacks = 2; - final int numberOfNodesPerRack = 6; + final int numberOfNodesPerRack = 7; // The following is because of the way TestRMWebApp.mockRMContext creates // nodes. final int numberOfLostNodesPerRack = numberOfNodesPerRack