Propagate a diagnostics message from the ResourceManager (RM) to the NodeManager (NM).
Adds an optional diagnostics_message string to RegisterNodeManagerResponseProto (field 4) and NodeHeartbeatResponseProto (field 7), with getDiagnosticsMessage()/setDiagnosticsMessage() on both records; the getter returns null when the field is unset and the setter clears the field when given null.
ResourceTrackerService stores the same text it logs for each registration/heartbeat decision; NodeStatusUpdaterImpl appends it to the registration-failure exception and logs it, labelled "Message from ResourceManager: ", on SHUTDOWN and RESYNC.
NOTE(review): nodeHeartbeat() sets the message on the existing `resync` and `shutDown` response objects, whose declarations are not part of this patch; if those objects are shared between calls, concurrent heartbeats could overwrite each other's message - confirm before merging.
NOTE(review): post-image blob ids on the index lines below were not recomputed after review edits.
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java index b2fd70f..16d1a25 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/NodeHeartbeatResponse.java @@ -45,4 +45,7 @@ long getNextHeartBeatInterval(); void setNextHeartBeatInterval(long nextHeartBeatInterval); + + String getDiagnosticsMessage(); + void setDiagnosticsMessage(String diagnosticsMessage); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java index 11b0211..7e180a6 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerResponse.java @@ -33,4 +33,9 @@ long getRMIdentifier(); void setRMIdentifier(long rmIdentifier); + + String getDiagnosticsMessage(); + + void setDiagnosticsMessage(String diagnosticsMessage); + } diff --git 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java index 080a79c..88805e1 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/NodeHeartbeatResponsePBImpl.java @@ -146,6 +146,25 @@ public void setNodeAction(NodeAction nodeAction) { } @Override + public String getDiagnosticsMessage() { + NodeHeartbeatResponseProtoOrBuilder p = viaProto ? proto : builder; + if (!p.hasDiagnosticsMessage()) { + return null; + } + return p.getDiagnosticsMessage(); + } + + @Override + public void setDiagnosticsMessage(String diagnosticsMessage) { + maybeInitBuilder(); + if (diagnosticsMessage == null) { + builder.clearDiagnosticsMessage(); + return; + } + builder.setDiagnosticsMessage(diagnosticsMessage); + } + + @Override public List getContainersToCleanup() { initContainersToCleanup(); return this.containersToCleanup; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java index 43451dc..7e34ca6 100644 --- 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerResponsePBImpl.java @@ -100,7 +100,26 @@ public void setMasterKey(MasterKey masterKey) { this.masterKey = masterKey; rebuild = true; } - + + @Override + public String getDiagnosticsMessage() { + RegisterNodeManagerResponseProtoOrBuilder p = viaProto ? proto : builder; + if (!p.hasDiagnosticsMessage()) { + return null; + } + return p.getDiagnosticsMessage(); + } + + @Override + public void setDiagnosticsMessage(String diagnosticsMessage) { + maybeInitBuilder(); + if (diagnosticsMessage == null) { + builder.clearDiagnosticsMessage(); + return; + } + builder.setDiagnosticsMessage(diagnosticsMessage); + } + @Override public NodeAction getNodeAction() { RegisterNodeManagerResponseProtoOrBuilder p = viaProto ? 
proto : builder; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto index 704d1da..4424e23 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto @@ -34,6 +34,7 @@ message RegisterNodeManagerResponseProto { optional MasterKeyProto master_key = 1; optional NodeActionProto nodeAction = 2; optional int64 rm_identifier = 3; + optional string diagnostics_message = 4; } message NodeHeartbeatRequestProto { @@ -49,4 +50,5 @@ message NodeHeartbeatResponseProto { repeated ContainerIdProto containers_to_cleanup = 4; repeated ApplicationIdProto applications_to_cleanup = 5; optional int64 nextHeartBeatInterval = 6; + optional string diagnostics_message = 7; } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 284cd94..b5c891e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -295,7 +295,8 @@ protected void registerWithRM() throws YarnRemoteException { // if the Resourcemanager instructs NM to shutdown. 
if (NodeAction.SHUTDOWN.equals(regNMResponse.getNodeAction())) { throw new YarnException( - "Recieved SHUTDOWN signal from Resourcemanager ,Registration of NodeManager failed"); + "Recieved SHUTDOWN signal from Resourcemanager ,Registration of NodeManager failed, Message from ResourceManager: " + + regNMResponse.getDiagnosticsMessage()); } if (UserGroupInformation.isSecurityEnabled()) { @@ -481,15 +482,17 @@ public void run() { if (response.getNodeAction() == NodeAction.SHUTDOWN) { LOG - .info("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + + .warn("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + " hence shutting down."); + LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); dispatcher.getEventHandler().handle( new NodeManagerEvent(NodeManagerEventType.SHUTDOWN)); break; } if (response.getNodeAction() == NodeAction.RESYNC) { - LOG.info("Node is out of sync with ResourceManager," + LOG.warn("Node is out of sync with ResourceManager," + " hence rebooting."); + LOG.warn("Message from ResourceManager: " + response.getDiagnosticsMessage()); // Invalidate the RMIdentifier while resync NodeStatusUpdaterImpl.this.rmIdentifier = ResourceManagerConstants.RM_INVALID_IDENTIFIER; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index c4f0b4c..557eb48 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ 
-174,8 +174,10 @@ public RegisterNodeManagerResponse registerNodeManager( // Check if this 
node is a 'valid' node if (!this.nodesListManager.isValidNode(host)) { - LOG.info("Disallowed NodeManager from " + host - + ", Sending SHUTDOWN signal to the NodeManager."); + String message = "Disallowed NodeManager from " + host + + ", Sending SHUTDOWN signal to the NodeManager."; + LOG.info(message); + response.setDiagnosticsMessage(message); response.setNodeAction(NodeAction.SHUTDOWN); return response; } @@ -183,9 +185,11 @@ public RegisterNodeManagerResponse registerNodeManager( // Check if this node has minimum allocations if (capability.getMemory() < minAllocMb || capability.getVirtualCores() < minAllocVcores) { - LOG.info("NodeManager from " + host + String message = "NodeManager from " + host + " doesn't satisfy minimum allocations, Sending SHUTDOWN" - + " signal to the NodeManager."); + + " signal to the NodeManager."; + LOG.info(message); + response.setDiagnosticsMessage(message); response.setNodeAction(NodeAction.SHUTDOWN); return response; } @@ -212,10 +216,11 @@ public RegisterNodeManagerResponse registerNodeManager( this.nmLivelinessMonitor.register(nodeId); - LOG.info("NodeManager from node " + host + "(cmPort: " + cmPort + String message = "NodeManager from node " + host + "(cmPort: " + cmPort + " httpPort: " + httpPort + ") " + "registered with capability: " - + capability + ", assigned nodeId " + nodeId); - + + capability + ", assigned nodeId " + nodeId; + LOG.info(message); + response.setDiagnosticsMessage(message); response.setNodeAction(NodeAction.NORMAL); response.setRMIdentifier(ResourceManager.clusterTimeStamp); return response; @@ -241,7 +246,9 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) RMNode rmNode = this.rmContext.getRMNodes().get(nodeId); if (rmNode == null) { /* node does not exist */ - LOG.info("Node not found rebooting " + remoteNodeStatus.getNodeId()); + String message = "Node not found rebooting " + remoteNodeStatus.getNodeId(); + LOG.info(message); + resync.setDiagnosticsMessage(message); return 
resync; } @@ -250,8 +257,10 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) // 2. Check if it's a valid (i.e. not excluded) node if (!this.nodesListManager.isValidNode(rmNode.getHostName())) { - LOG.info("Disallowed NodeManager nodeId: " + nodeId + " hostname: " - + rmNode.getNodeAddress()); + String message = "Disallowed NodeManager nodeId: " + nodeId + + " hostname: " + rmNode.getNodeAddress(); + LOG.info(message); + shutDown.setDiagnosticsMessage(message); this.rmContext.getDispatcher().getEventHandler().handle( new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION)); return shutDown; @@ -266,9 +275,11 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) return lastNodeHeartbeatResponse; } else if (remoteNodeStatus.getResponseId() + 1 < lastNodeHeartbeatResponse .getResponseId()) { - LOG.info("Too far behind rm response id:" - + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" - + remoteNodeStatus.getResponseId()); + String message = "Too far behind rm response id:" + + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" + + remoteNodeStatus.getResponseId(); + LOG.info(message); + resync.setDiagnosticsMessage(message); // TODO: Just sending reboot is not enough. Think more. 
this.rmContext.getDispatcher().getEventHandler().handle( new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING)); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java index b316511..7e7ac21 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestRMNMRPCResponseId.java @@ -133,5 +133,6 @@ public void testRPCResponseId() throws IOException, YarnRemoteException { nodeStatus.setResponseId(0); response = resourceTrackerService.nodeHeartbeat(nodeHeartBeatRequest); Assert.assertTrue(NodeAction.RESYNC.equals(response.getNodeAction())); + Assert.assertEquals("Too far behind rm response id:2 nm response id:0", response.getDiagnosticsMessage()); } } \ No newline at end of file