diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java index b2fd70f..6b839b1 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java @@ -45,4 +45,8 @@ long getNextHeartBeatInterval(); void setNextHeartBeatInterval(long nextHeartBeatInterval); + + String getDiagnosticsMessage(); + + void setDiagnosticsMessage(String diagnosticsMessage); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java index 11b0211..0a87acf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java @@ -33,4 +33,9 @@ long getRMIdentifier(); void setRMIdentifier(long rmIdentifier); + + String getDiagnosticsMessage(); + + void setDiagnosticsMessage(String diagnosticsMessage); + } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java index 080a79c..e096a4e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java @@ -146,6 +146,25 @@ public void setNodeAction(NodeAction nodeAction) { } @Override + public String getDiagnosticsMessage() { + NodeHeartbeatResponseProtoOrBuilder p = viaProto ? proto : builder; + if (!p.hasDiagnosticsMessage()) { + return null; + } + return p.getDiagnosticsMessage(); + } + + @Override + public void setDiagnosticsMessage(String diagnosticsMessage) { + maybeInitBuilder(); + if (diagnosticsMessage == null) { + builder.clearDiagnosticsMessage(); + return; + } + builder.setDiagnosticsMessage((diagnosticsMessage)); + } + + @Override public List getContainersToCleanup() { initContainersToCleanup(); return this.containersToCleanup; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java index 43451dc..69d7374 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java @@ -102,6 +102,25 @@ public void setMasterKey(MasterKey masterKey) { } @Override + public String getDiagnosticsMessage() { + RegisterNodeManagerResponseProtoOrBuilder p = viaProto ? proto : builder; + if (!p.hasDiagnosticsMessage()) { + return null; + } + return p.getDiagnosticsMessage(); + } + + @Override + public void setDiagnosticsMessage(String diagnosticsMessage) { + maybeInitBuilder(); + if (diagnosticsMessage == null) { + builder.clearDiagnosticsMessage(); + return; + } + builder.setDiagnosticsMessage((diagnosticsMessage)); + } + + @Override public NodeAction getNodeAction() { RegisterNodeManagerResponseProtoOrBuilder p = viaProto ? proto : builder; if(!p.hasNodeAction()) { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto index 704d1da..d7c3d1d 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto @@ -34,6 +34,7 @@ message RegisterNodeManagerResponseProto { optional MasterKeyProto master_key = 1; optional NodeActionProto nodeAction = 2; optional int64 rm_identifier = 3; + optional string diagnostics_message = 4; } message NodeHeartbeatRequestProto { @@ -49,4 +50,5 @@ message NodeHeartbeatResponseProto { repeated ContainerIdProto containers_to_cleanup = 4; repeated ApplicationIdProto applications_to_cleanup = 5; optional int64 nextHeartBeatInterval = 6; + optional string diagnostics_message = 7; } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 284cd94..e2d747c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -294,8 +294,12 @@ protected void registerWithRM() throws YarnRemoteException { } // if the Resourcemanager instructs NM to shutdown. if (NodeAction.SHUTDOWN.equals(regNMResponse.getNodeAction())) { + String message = + "Message from ResourceManager: " + + regNMResponse.getDiagnosticsMessage(); throw new YarnException( - "Recieved SHUTDOWN signal from Resourcemanager ,Registration of NodeManager failed"); + "Recieved SHUTDOWN signal from Resourcemanager ,Registration of NodeManager failed, " + + message); } if (UserGroupInformation.isSecurityEnabled()) { @@ -481,15 +485,19 @@ public void run() { if (response.getNodeAction() == NodeAction.SHUTDOWN) { LOG - .info("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + - " hence shutting down."); + .warn("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + + " hence shutting down."); + LOG.warn("Message from ResourceManager: " + + response.getDiagnosticsMessage()); dispatcher.getEventHandler().handle( new NodeManagerEvent(NodeManagerEventType.SHUTDOWN)); break; } if (response.getNodeAction() == NodeAction.RESYNC) { - LOG.info("Node is out of sync with ResourceManager," + LOG.warn("Node is out of sync with ResourceManager," + " hence rebooting."); + LOG.warn("Message from ResourceManager: " + + response.getDiagnosticsMessage()); // Invalidate the RMIdentifier while resync NodeStatusUpdaterImpl.this.rmIdentifier = ResourceManagerConstants.RM_INVALID_IDENTIFIER; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 10dd155..9d9b0cf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -387,6 +387,7 @@ public void stop() { private class MyResourceTracker2 implements ResourceTracker { public NodeAction heartBeatNodeAction = NodeAction.NORMAL; public NodeAction registerNodeAction = NodeAction.NORMAL; + public String shutDownMessage = ""; @Override public RegisterNodeManagerResponse registerNodeManager( @@ -395,6 +396,7 @@ public RegisterNodeManagerResponse registerNodeManager( RegisterNodeManagerResponse response = recordFactory .newRecordInstance(RegisterNodeManagerResponse.class); response.setNodeAction(registerNodeAction ); + response.setDiagnosticsMessage(shutDownMessage); return response; } @Override @@ -406,6 +408,7 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils. newNodeHeartbeatResponse(heartBeatID, heartBeatNodeAction, null, null, null, 1000L); + nhResponse.setDiagnosticsMessage(shutDownMessage); return nhResponse; } } @@ -732,12 +735,15 @@ protected NodeStatusUpdater createNodeStatusUpdater(Context context, context, dispatcher, healthChecker, metrics); MyResourceTracker2 myResourceTracker2 = new MyResourceTracker2(); myResourceTracker2.registerNodeAction = NodeAction.SHUTDOWN; + myResourceTracker2.shutDownMessage = "RM Shutting Down Node"; nodeStatusUpdater.resourceTracker = myResourceTracker2; return nodeStatusUpdater; } }; verifyNodeStartFailure("org.apache.hadoop.yarn.YarnException: " - + "Recieved SHUTDOWN signal from Resourcemanager ,Registration of NodeManager failed"); + + "Recieved SHUTDOWN signal from Resourcemanager ," + + "Registration of NodeManager failed, " + + "Message from ResourceManager: RM Shutting Down Node"); } @Test (timeout = 15000) diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index c4f0b4c..a7a39b8 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ -174,8 +174,11 @@ public RegisterNodeManagerResponse registerNodeManager( // Check if this node is a 'valid' node if (!this.nodesListManager.isValidNode(host)) { - LOG.info("Disallowed NodeManager from " + host - + ", Sending SHUTDOWN signal to the NodeManager."); + String message = + "Disallowed NodeManager from " + host + + ", Sending SHUTDOWN signal to the NodeManager."; + LOG.info(message); + response.setDiagnosticsMessage(message); response.setNodeAction(NodeAction.SHUTDOWN); return response; } @@ -183,9 +186,12 @@ public RegisterNodeManagerResponse registerNodeManager( // Check if this node has minimum allocations if (capability.getMemory() < minAllocMb || capability.getVirtualCores() < minAllocVcores) { - LOG.info("NodeManager from " + host - + " doesn't satisfy minimum allocations, Sending SHUTDOWN" - + " signal to the NodeManager."); + String message = + "NodeManager from " + host + + " doesn't satisfy minimum allocations, Sending SHUTDOWN" + + " signal to the NodeManager."; + LOG.info(message); + response.setDiagnosticsMessage(message); response.setNodeAction(NodeAction.SHUTDOWN); return response; } @@ -212,10 +218,11 @@ public RegisterNodeManagerResponse registerNodeManager( this.nmLivelinessMonitor.register(nodeId); - LOG.info("NodeManager from node " + host + "(cmPort: " + cmPort - + " httpPort: " + httpPort + ") " + "registered with capability: " - + capability + ", assigned nodeId " + nodeId); - + String message = + "NodeManager from node " + host + "(cmPort: " + cmPort + " httpPort: " + + httpPort + ") " + "registered with capability: " + capability + + ", assigned nodeId " + nodeId; + LOG.info(message); response.setNodeAction(NodeAction.NORMAL); response.setRMIdentifier(ResourceManager.clusterTimeStamp); return response; @@ -241,7 +248,9 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) RMNode rmNode = this.rmContext.getRMNodes().get(nodeId); if (rmNode == null) { /* node does not exist */ - LOG.info("Node not found rebooting " + remoteNodeStatus.getNodeId()); + String message = "Node not found rebooting " + remoteNodeStatus.getNodeId(); + LOG.info(message); + resync.setDiagnosticsMessage(message); return resync; } @@ -250,8 +259,11 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) // 2. Check if it's a valid (i.e. not excluded) node if (!this.nodesListManager.isValidNode(rmNode.getHostName())) { - LOG.info("Disallowed NodeManager nodeId: " + nodeId + " hostname: " - + rmNode.getNodeAddress()); + String message = + "Disallowed NodeManager nodeId: " + nodeId + " hostname: " + + rmNode.getNodeAddress(); + LOG.info(message); + shutDown.setDiagnosticsMessage(message); this.rmContext.getDispatcher().getEventHandler().handle( new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION)); return shutDown; @@ -266,9 +278,12 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) return lastNodeHeartbeatResponse; } else if (remoteNodeStatus.getResponseId() + 1 < lastNodeHeartbeatResponse .getResponseId()) { - LOG.info("Too far behind rm response id:" - + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" - + remoteNodeStatus.getResponseId()); + String message = + "Too far behind rm response id:" + + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" + + remoteNodeStatus.getResponseId(); + LOG.info(message); + resync.setDiagnosticsMessage(message); // TODO: Just sending reboot is not enough. Think more. this.rmContext.getDispatcher().getEventHandler().handle( new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING)); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java index 3ac19b4..5f12b33 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java @@ -268,6 +268,10 @@ public void testNodeRegistrationFailure() throws Exception { // trying to register a invalid node. RegisterNodeManagerResponse response = resourceTrackerService.registerNodeManager(req); Assert.assertEquals(NodeAction.SHUTDOWN,response.getNodeAction()); + Assert + .assertEquals( + "Disallowed NodeManager from host2, Sending SHUTDOWN signal to the NodeManager.", + response.getDiagnosticsMessage()); } @Test @@ -344,6 +348,8 @@ public void testReboot() throws Exception { nodeHeartbeat = nm2.nodeHeartbeat( new HashMap>(), true, -100); Assert.assertTrue(NodeAction.RESYNC.equals(nodeHeartbeat.getNodeAction())); + Assert.assertEquals("Too far behind rm response id:0 nm response id:-100", + nodeHeartbeat.getDiagnosticsMessage()); checkRebootedNMCount(rm, ++initialMetricCount); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java index b316511..fa63e84 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java @@ -133,5 +133,7 @@ public void testRPCResponseId() throws IOException, YarnRemoteException { nodeStatus.setResponseId(0); response = resourceTrackerService.nodeHeartbeat(nodeHeartBeatRequest); Assert.assertTrue(NodeAction.RESYNC.equals(response.getNodeAction())); + Assert.assertEquals("Too far behind rm response id:2 nm response id:0", + response.getDiagnosticsMessage()); } } \ No newline at end of file