diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 54e8888f0d2..175c9915e5e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1927,6 +1927,8 @@ public static boolean isAclEnabled(Configuration conf) { public static final int DEFAULT_NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS = 10000; + public static final String NM_HEALTH_CHECKER_SERVICE = + NM_PREFIX + "health-checker-service"; /** Prefix for all node manager disk health checker configs. */ private static final String NM_DISK_HEALTH_CHECK_PREFIX = "yarn.nodemanager.disk-health-checker."; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto index d7ca2a563ea..c7e22b7f2a4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto @@ -828,6 +828,11 @@ message StringStringMapProto { optional string value = 2; } +message StringIntMapProto { + optional string key = 1; + optional int32 value = 2; +} + message StringBytesMapProto { optional string key = 1; optional bytes value = 2; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java index 455ca24405f..04124ec7c0c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java @@ -81,6 +81,7 @@ import org.apache.hadoop.yarn.proto.YarnProtos.QueueStateProto; import org.apache.hadoop.yarn.proto.YarnProtos.ReservationRequestInterpreterProto; import org.apache.hadoop.yarn.proto.YarnProtos.ResourceProto; +import org.apache.hadoop.yarn.proto.YarnProtos.StringIntMapProto; import org.apache.hadoop.yarn.proto.YarnProtos.StringStringMapProto; import org.apache.hadoop.yarn.proto.YarnProtos.TimedPlacementConstraintProto; import org.apache.hadoop.yarn.proto.YarnProtos.YarnApplicationAttemptStateProto; @@ -587,6 +588,33 @@ public static ResourceTypes convertFromProtoFormat(ResourceTypesProto e) { return ret; } + public static List + convertStringIntMapToProtoList (Map stringIntMap) { + List pList = new ArrayList<>(); + if (stringIntMap != null && !stringIntMap.isEmpty()) { + StringIntMapProto.Builder pBuilder = StringIntMapProto.newBuilder(); + for (Map.Entry entry : stringIntMap.entrySet()) { + pBuilder.setKey(entry.getKey()); + pBuilder.setValue(entry.getValue()); + pList.add(pBuilder.build()); + } + } + return pList; + } + + public static Map + convertProtoListToStringIntMap (List pList) { + Map ret = new HashMap<>(); + if (pList != null) { + for (StringIntMapProto p : pList) { + if (p.hasKey()) { + ret.put(p.getKey(), p.getValue()); + } + } + } + return ret; + } + public static Map convertStringStringMapProtoListToMap( List pList) { Map ret = new HashMap<>(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/NodeHealthDetails.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/NodeHealthDetails.java new file mode 100644 index 00000000000..102b02a4b33 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/NodeHealthDetails.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.api.records; + +import java.util.Map; + +/** + * {@code NodeHealthDetails} is a summary of the overall health score + * of the node. + *

+ * It includes information such as: + *

    + *
  • + * In depth analysis of the health of the node. Even if the node is healthy + * it gives out a score based on node resources. + *
  • + *
  • Holds a map of information about the node resources. + * Example: SSD, HDD, SKU etc.
  • + *
+ * + */ + + public class NodeHealthDetails { + + private Integer overallScore; + private Map nodeResources; + + public void setOverallScore(Integer overallScore) { + this.overallScore = overallScore; + } + + public void setNodeResources(Map nodeResources) { + this.nodeResources = nodeResources; + } + + public Integer getOverallScore() { + return overallScore; + } + + public Map getNodeResources() { + return nodeResources; + } + +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/NodeHealthStatus.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/NodeHealthStatus.java index b21b88071f5..d3f3e702929 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/NodeHealthStatus.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/NodeHealthStatus.java @@ -90,4 +90,20 @@ public static NodeHealthStatus newInstance(boolean isNodeHealthy, @Private @Unstable public abstract void setLastHealthReportTime(long lastHealthReport); + + /** + * Get the detailed description of the node health + */ + @Public + @Stable + public abstract String getNodeHealthDescription(); + + @Private + @Unstable + public abstract void setNodeHealthDescription(String nodeHealthDescription); + + public abstract void setNodeHealthDetails(NodeHealthDetails + nodeHealthDetail); + + public abstract NodeHealthDetails getNodeHealthDetails(); } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/impl/pb/NodeHealthDetailsPBImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/impl/pb/NodeHealthDetailsPBImpl.java new file mode 100644 index 00000000000..c0005f73108 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/impl/pb/NodeHealthDetailsPBImpl.java @@ -0,0 +1,76 @@ +package org.apache.hadoop.yarn.server.api.records.impl.pb; + +import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.yarn.api.records.impl.pb.ProtoUtils; +//import org.apache.hadoop.yarn.proto.YarnServerCommonProtos. +import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.NodeHealthDetailsProto; +import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.NodeHealthDetailsProtoOrBuilder; +import org.apache.hadoop.yarn.server.api.records.NodeHealthDetails; + +import java.util.Map; + +public class NodeHealthDetailsPBImpl extends NodeHealthDetails { + NodeHealthDetailsProto proto = NodeHealthDetailsProto.getDefaultInstance(); + NodeHealthDetailsProto.Builder builder = null; + boolean viaProto = false; + + public NodeHealthDetailsPBImpl() { + builder = NodeHealthDetailsProto.newBuilder(); + } + + public NodeHealthDetailsPBImpl(NodeHealthDetailsProto proto) { + this.proto = proto; + viaProto = true; + } + public synchronized NodeHealthDetailsProto getProto() { + mergeLocalToProto(); + proto = viaProto ? proto : builder.build(); + viaProto = true; + return proto; + } + + private synchronized void mergeLocalToProto() { + if (viaProto) + maybeInitBuilder(); + proto = builder.build(); + + viaProto = true; + } + private synchronized void maybeInitBuilder() { + if (viaProto || builder == null) { + builder = NodeHealthDetailsProto.newBuilder(proto); + } + viaProto = false; + } + @Override + public void setOverallScore(Integer overallScore) { + maybeInitBuilder(); + this.builder.setOverallScore(overallScore); + } + + @Override + public Integer getOverallScore() { + NodeHealthDetailsProtoOrBuilder p = + this.viaProto ? this.proto : this.builder; + if (!p.hasOverallScore()) { + return null; + } + return (p.getOverallScore()); + } + + @Override + public void setNodeResources(Map nodeResources) { + maybeInitBuilder(); + this.builder.addAllNodeResources(ProtoUtils + .convertStringIntMapToProtoList(nodeResources)); + } + + @Override + public Map getNodeResources() { + NodeHealthDetailsProtoOrBuilder p = + this.viaProto ? this.proto : this.builder; + return p.getNodeResourcesCount() > 0 ? + ProtoUtils.convertProtoListToStringIntMap(p.getNodeResourcesList()) : + ImmutableMap.of(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/impl/pb/NodeHealthStatusPBImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/impl/pb/NodeHealthStatusPBImpl.java index 20697834687..60bde365a8b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/impl/pb/NodeHealthStatusPBImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/records/impl/pb/NodeHealthStatusPBImpl.java @@ -17,19 +17,23 @@ */ package org.apache.hadoop.yarn.server.api.records.impl.pb; - +import org.apache.hadoop.yarn.api.records.impl.pb.ProtoUtils; +import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.NodeHealthDetailsProto; import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.NodeHealthStatusProto; import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.NodeHealthStatusProtoOrBuilder; +import org.apache.hadoop.yarn.server.api.records.NodeHealthDetails; import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus; import org.apache.hadoop.thirdparty.protobuf.TextFormat; public class NodeHealthStatusPBImpl extends NodeHealthStatus { + private NodeHealthStatusProto proto = NodeHealthStatusProto + .getDefaultInstance(); private NodeHealthStatusProto.Builder builder; private boolean viaProto = false; - private NodeHealthStatusProto proto = NodeHealthStatusProto - .getDefaultInstance(); + + private NodeHealthDetails nodeHealthDetails; public NodeHealthStatusPBImpl() { this.builder = NodeHealthStatusProto.newBuilder(); @@ -66,7 +70,6 @@ public boolean equals(Object other) { public String toString() { return TextFormat.shortDebugString(getProto()); } - private void mergeLocalToProto() { if (this.viaProto) maybeInitBuilder(); @@ -128,4 +131,56 @@ public void setLastHealthReportTime(long lastHealthReport) { this.builder.setLastHealthReportTime((lastHealthReport)); } + @Override + public String getNodeHealthDescription() { + NodeHealthStatusProtoOrBuilder p = + this.viaProto ? this.proto : this.builder; + if (!p.hasNodeHealthDescription()) { + return null; + } + return (p.getNodeHealthDescription()); + } + + @Override + public void setNodeHealthDescription(String nodeHealthDescription) { + maybeInitBuilder(); + if (nodeHealthDescription == null) { + this.builder.clearHealthReport(); + return; + } + this.builder.setNodeHealthDescription(nodeHealthDescription); + } + + @Override + public void setNodeHealthDetails( + NodeHealthDetails nodeHealthDetails) { + maybeInitBuilder(); + if(nodeHealthDetails == null) { + this.builder.clearNodeHealthDetails(); + } + this.nodeHealthDetails = nodeHealthDetails; + this.builder + .setNodeHealthDetails(NodeHealthDetailsProto.newBuilder().setOverallScore(nodeHealthDetails.getOverallScore()) + .addAllNodeResources(ProtoUtils + .convertStringIntMapToProtoList(nodeHealthDetails.getNodeResources())).build()); + } + + @Override + public NodeHealthDetails getNodeHealthDetails() { + NodeHealthStatusProtoOrBuilder p = + this.viaProto ? this.proto : this.builder; + if(this.nodeHealthDetails != null) { + return this.nodeHealthDetails; + } + if(!p.hasNodeHealthDetails()) { + return null; + } + this.nodeHealthDetails = convertFromProtoFormat(p.getNodeHealthDetails()); + return this.nodeHealthDetails; + } + + private NodeHealthDetailsPBImpl convertFromProtoFormat( + NodeHealthDetailsProto p) { + return new NodeHealthDetailsPBImpl(p); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_protos.proto b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_protos.proto index ea8df4fb800..bbdf51c805e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_protos.proto +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_protos.proto @@ -58,10 +58,17 @@ message MasterKeyProto { optional bytes bytes = 2; } +message NodeHealthDetailsProto { + optional int32 overall_score = 1; + repeated StringIntMapProto node_resources = 2; +} + message NodeHealthStatusProto { optional bool is_node_healthy = 1; optional string health_report = 2; optional int64 last_health_report_time = 3; + optional string node_health_description = 4; + optional NodeHealthDetailsProto node_health_details = 5; } message VersionProto { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index f90423cf6b6..f767bb1c77e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -29,6 +29,7 @@ import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.GenericOptionsParser; @@ -62,6 +63,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthService; import org.apache.hadoop.yarn.server.nodemanager.logaggregation.tracker.NMLogAggregationStatusTracker; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider; @@ -85,6 +87,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.reflect.Constructor; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -126,7 +129,8 @@ public int getExitCode() { protected final NodeManagerMetrics metrics = NodeManagerMetrics.create(); private JvmPauseMonitor pauseMonitor; private ApplicationACLsManager aclsManager; - private NodeHealthCheckerService nodeHealthChecker; + private NodeHealthService nodeHealthChecker; + private NodeHealthCheckerService nhcs; private NodeLabelsProvider nodeLabelsProvider; private NodeAttributesProvider nodeAttributesProvider; private LocalDirsHandlerService dirsHandler; @@ -170,6 +174,11 @@ protected NodeStatusUpdater createNodeStatusUpdater(Context context, return new NodeStatusUpdaterImpl(context, dispatcher, healthChecker, metrics); } + protected NodeStatusUpdater createNodeStatusUpdater(Context context, + Dispatcher dispatcher, NodeHealthService healthService) { + return new NodeStatusUpdaterImpl(context, dispatcher, healthService, + metrics); + } protected NodeAttributesProvider createNodeAttributesProvider( Configuration conf) throws IOException { @@ -359,6 +368,23 @@ protected ContainerExecutor createContainerExecutor(Configuration conf) { DefaultContainerExecutor.class, ContainerExecutor.class), conf); } + protected NodeHealthService addNodeHealthService(Configuration conf) { + + Class clazz = conf.getClassByNameOrNull(conf.get(YarnConfiguration + .NM_HEALTH_CHECKER_SERVICE, NodeHealthCheckerService.class.getName())); + try { + if (clazz == null || !AbstractService.class.isAssignableFrom(clazz)) + throw new RuntimeException(clazz + " does not implement " + AbstractService.class); + + Constructor cons = clazz.getConstructor(NodeManager.class); + NodeHealthService service = (NodeHealthService) cons.newInstance(this); + return service; + } catch (Exception e) { + throw new YarnRuntimeException( + "Could not instantiate NodeHealth Class: " + YarnConfiguration + .NM_HEALTH_CHECKER_SERVICE, e); + } + } @Override protected void serviceInit(Configuration conf) throws Exception { UserGroupInformation.setConfiguration(conf); @@ -410,8 +436,8 @@ protected void serviceInit(Configuration conf) throws Exception { // NodeManager level dispatcher this.dispatcher = createNMDispatcher(); - this.nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); - addService(nodeHealthChecker); + this.nodeHealthChecker = addNodeHealthService(conf); + addIfService(nodeHealthChecker); ((NMContext)context).setContainerExecutor(exec); ((NMContext)context).setDeletionService(del); @@ -931,7 +957,8 @@ public AuxServices getAuxServices() { * @return the node health checker */ public NodeHealthCheckerService getNodeHealthChecker() { - return nodeHealthChecker; + //This is to avoid compilation failure. Will work on this change + return nhcs; } private void initAndStartNodeManager(Configuration conf, boolean hasToReboot) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 0725d423096..13168621cc3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -87,6 +87,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; @@ -147,6 +148,7 @@ private final List logAggregationReportForAppsTempList; private final NodeHealthCheckerService healthChecker; + private final NodeHealthService healthService; private final NodeManagerMetrics metrics; private Runnable statusUpdaterRunnable; @@ -167,6 +169,7 @@ public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) { super(NodeStatusUpdaterImpl.class.getName()); this.healthChecker = healthChecker; + this.healthService = null; this.context = context; this.dispatcher = dispatcher; this.metrics = metrics; @@ -177,6 +180,20 @@ public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher, new ArrayList(); } + public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher, + NodeHealthService healthChecker, NodeManagerMetrics metrics) { + super(NodeStatusUpdaterImpl.class.getName()); + this.healthChecker = null; + this.healthService = healthChecker; + this.context = context; + this.dispatcher = dispatcher; + this.metrics = metrics; + this.recentlyStoppedContainers = new LinkedHashMap(); + this.pendingCompletedContainers = + new HashMap(); + this.logAggregationReportForAppsTempList = + new ArrayList(); + } @Override public void setNodeAttributesProvider(NodeAttributesProvider provider) { this.nodeAttributesProvider = provider; @@ -513,10 +530,12 @@ protected void registerWithRM() protected NodeStatus getNodeStatus(int responseId) throws IOException { NodeHealthStatus nodeHealthStatus = this.context.getNodeHealthStatus(); - nodeHealthStatus.setHealthReport(healthChecker.getHealthReport()); - nodeHealthStatus.setIsNodeHealthy(healthChecker.isHealthy()); - nodeHealthStatus.setLastHealthReportTime(healthChecker + nodeHealthStatus.setHealthReport(healthService.getHealthReport()); + nodeHealthStatus.setIsNodeHealthy(healthService.isHealthy()); + nodeHealthStatus.setLastHealthReportTime(healthService .getLastHealthReportTime()); + nodeHealthStatus.setNodeHealthDescription(healthService.getNodeHealthDescription()); + nodeHealthStatus.setNodeHealthDetails(healthService.getNodeHealthDetails()); LOG.debug("Node's health-status : {}, {}", nodeHealthStatus.getIsNodeHealthy(), nodeHealthStatus.getHealthReport()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java index a89fb86362b..35c462874e6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java @@ -25,12 +25,17 @@ import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.service.Service; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.api.records.NodeHealthDetails; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; +import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; @@ -51,7 +56,7 @@ * @see TimedHealthReporterService */ public class NodeHealthCheckerService extends CompositeService - implements HealthReporter { + implements HealthReporter, NodeHealthService { public static final Logger LOG = LoggerFactory.getLogger(NodeHealthCheckerService.class); @@ -72,6 +77,15 @@ public NodeHealthCheckerService( this.exceptionReporter = new ExceptionReporter(); } + public NodeHealthCheckerService( + NodeManager nodeManager) { + super(NodeHealthCheckerService.class.getName()); + + this.reporters = new ArrayList<>(); + this.dirsHandler = nodeManager.getNMContext().getLocalDirsHandler(); + this.exceptionReporter = new ExceptionReporter(); + } + @Override protected void serviceInit(Configuration conf) throws Exception { reporters.add(exceptionReporter); @@ -147,6 +161,10 @@ public long getLastHealthReportTime() { return max.orElse(0L); } + @Override public String getNodeHealthDescription() { + return "Node Health Description"; + } + /** * @return the disk handler */ @@ -154,6 +172,15 @@ public LocalDirsHandlerService getDiskHandler() { return dirsHandler; } + public NodeHealthDetails getNodeHealthDetails() { + + // This is for example purpose. Will remove it. + NodeHealthDetails n = new NodeHealthDetails(); + n.setOverallScore(50); + n.setNodeResources(Collections.singletonMap("SSD", 1)); + return n; + } + /** * Propagating an exception to {@link ExceptionReporter}. * @param exception the exception to propagate diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthService.java new file mode 100644 index 00000000000..63121e77f1f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthService.java @@ -0,0 +1,13 @@ +package org.apache.hadoop.yarn.server.nodemanager.health; + +import org.apache.hadoop.yarn.server.api.records.NodeHealthDetails; + + +public interface NodeHealthService { + + boolean isHealthy(); + String getHealthReport(); + long getLastHealthReportTime(); + String getNodeHealthDescription(); + NodeHealthDetails getNodeHealthDetails(); +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index 68f44dc6d54..e397f33f2bc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -36,6 +36,7 @@ import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.commons.collections.keyvalue.DefaultMapEntry; +import org.apache.hadoop.yarn.server.api.records.NodeHealthDetails; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -132,6 +133,8 @@ private String healthReport; private long lastHealthReportTime; + private String nodeHealthDescription; + private NodeHealthDetails nodeHealthDetails; private String nodeManagerVersion; private Integer decommissioningTimeout; @@ -502,7 +505,7 @@ public void setHealthReport(String healthReport) { this.writeLock.unlock(); } } - + public void setLastHealthReportTime(long lastHealthReportTime) { this.writeLock.lock(); @@ -512,7 +515,47 @@ public void setLastHealthReportTime(long lastHealthReportTime) { this.writeLock.unlock(); } } - + + public void setNodeHealthDescription(String nodeHealthDescription) { + this.writeLock.lock(); + + try { + this.nodeHealthDescription = nodeHealthDescription; + } finally { + this.writeLock.unlock(); + } + } + + public String getNodeHealthDescription() { + this.writeLock.lock(); + + try { + return this.nodeHealthDescription; + } finally { + this.writeLock.unlock(); + } + } + + public void setNodeHealthDetails(NodeHealthDetails nodeHealthDetail) { + this.writeLock.lock(); + + try { + this.nodeHealthDetails = nodeHealthDetails; + } finally { + this.writeLock.unlock(); + } + } + + public NodeHealthDetails getNodeHealthDetail() { + this.writeLock.lock(); + + try { + return this.nodeHealthDetails; + } finally { + this.writeLock.unlock(); + } + } + @Override public long getLastHealthReportTime() { this.readLock.lock(); @@ -844,6 +887,10 @@ private static NodeHealthStatus updateRMNodeFromStatusEvents( rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport()); rmNode.setLastHealthReportTime(remoteNodeHealthStatus .getLastHealthReportTime()); + rmNode.setNodeHealthDetails(remoteNodeHealthStatus + .getNodeHealthDetails()); + rmNode.setNodeHealthDescription(remoteNodeHealthStatus + .getNodeHealthDescription()); rmNode.setAggregatedContainersUtilization(statusEvent .getAggregatedContainersUtilization()); rmNode.setNodeUtilization(statusEvent.getNodeUtilization());