From 9bdb650400311b97adac550c75fb5b09560e534e Mon Sep 17 00:00:00 2001 From: Adam Antal Date: Tue, 26 Nov 2019 15:55:29 +0100 Subject: [PATCH] YARN-9923. Introduce HealthReporter interface and implement running Docker daemon checker --- .../hadoop/yarn/conf/YarnConfiguration.java | 31 ++- .../src/main/resources/yarn-default.xml | 20 +- .../nodemanager/LocalDirsHandlerService.java | 21 +- .../nodemanager/NodeHealthCheckerService.java | 123 --------- .../yarn/server/nodemanager/NodeManager.java | 28 +- .../nodemanager/NodeStatusUpdaterImpl.java | 4 +- .../nodemanager/health/ExceptionReporter.java | 61 +++++ .../nodemanager/health/HealthReporter.java | 65 +++++ .../health/NodeHealthCheckerService.java | 163 +++++++++++ .../health}/NodeHealthScriptRunner.java | 252 +++++++----------- .../health/TimedHealthReporterService.java | 140 ++++++++++ .../nodemanager/MockNodeStatusUpdater.java | 1 + .../nodemanager/NodeManagerTestBase.java | 1 + .../server/nodemanager/TestEventFlow.java | 5 +- .../nodemanager/TestNodeManagerReboot.java | 1 + .../nodemanager/TestNodeManagerResync.java | 1 + .../nodemanager/TestNodeManagerShutdown.java | 1 + .../nodemanager/TestNodeStatusUpdater.java | 19 ++ .../TestNodeStatusUpdaterForAttributes.java | 1 + .../TestNodeStatusUpdaterForLabels.java | 1 + .../BaseContainerManagerTest.java | 5 +- .../TestContainerManagerRecovery.java | 6 +- .../TestResourcePluginManager.java | 2 +- .../health/TestExceptionReporter.java | 41 +++ .../TestNodeHealthCheckerService.java} | 153 ++++++++--- .../health}/TestNodeHealthScriptRunner.java | 93 ++++--- .../webapp/TestContainerLogsPage.java | 12 +- .../webapp/TestNMContainerWebSocket.java | 13 +- .../nodemanager/webapp/TestNMWebServer.java | 14 +- .../nodemanager/webapp/TestNMWebServices.java | 6 +- .../webapp/TestNMWebServicesApps.java | 6 +- .../webapp/TestNMWebServicesAuxServices.java | 6 +- .../webapp/TestNMWebServicesContainers.java | 8 +- .../nodemanager/webapp/TestNMWebTerminal.java | 12 +- .../hadoop/yarn/server/MiniYARNCluster.java | 4 +- .../src/site/markdown/NodeManager.md | 21 +- 36 files changed, 882 insertions(+), 459 deletions(-) delete mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/ExceptionReporter.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java rename {hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util => hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health}/NodeHealthScriptRunner.java (56%) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestExceptionReporter.java rename hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/{TestNodeHealthService.java => health/TestNodeHealthCheckerService.java} (55%) rename {hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util => hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health}/TestNodeHealthScriptRunner.java (58%) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index d23b6301efa..40729d4e6eb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1966,24 +1966,37 @@ public static boolean isAclEnabled(Configuration conf) { */ public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0; + /** The health checker scripts. */ + public static final String NM_HEALTH_CHECK_SCRIPTS = + NM_PREFIX + "health-checker.scripts"; + public static final String[] DEFAULT_NM_HEALTH_CHECK_SCRIPTS = {"script"}; + /** Frequency of running node health script.*/ public static final String NM_HEALTH_CHECK_INTERVAL_MS = - NM_PREFIX + "health-checker.interval-ms"; + NM_PREFIX + "health-checker.interval-ms"; public static final long DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS = 10 * 60 * 1000; + /** Health check time out period for all scripts.*/ + public static final String NM_HEALTH_CHECK_TIMEOUT_MS = + NM_PREFIX + "health-checker.timeout-ms"; + public static final long DEFAULT_NM_HEALTH_CHECK_TIMEOUT_MS = + 2 * DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS; + /** Health check script time out period.*/ - public static final String NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS = - NM_PREFIX + "health-checker.script.timeout-ms"; - public static final long DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS = - 2 * DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS; + public static final String NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE = + NM_PREFIX + "health-checker.%s.timeout-ms"; /** The health check script to run.*/ - public static final String NM_HEALTH_CHECK_SCRIPT_PATH = - NM_PREFIX + "health-checker.script.path"; + public static final String NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE = + NM_PREFIX + "health-checker.%s.path"; /** The arguments to pass to the health check script.*/ - public static final String NM_HEALTH_CHECK_SCRIPT_OPTS = - NM_PREFIX + "health-checker.script.opts"; + public static final String NM_HEALTH_CHECK_SCRIPT_OPTS_TEMPLATE = + NM_PREFIX + "health-checker.%s.opts"; + + /** Frequency of running node health script. */ + public static final String NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE = + NM_PREFIX + "health-checker.%s.interval-ms"; /** The JVM options used on forking ContainerLocalizer process by container executor. */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index e53801b58a8..bf0b9272f06 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1617,27 +1617,27 @@ - Frequency of running node health script. + yarn.nodemanager.health-checker.interval-ms 600000 - Script time out period. - yarn.nodemanager.health-checker.script.timeout-ms - 1200000 + The nodemanager health check scripts to run. + yarn.nodemanager.health-checker.scripts + script - The health check script to run. - yarn.nodemanager.health-checker.script.path - + Health check script time out period. + yarn.nodemanager.health-checker.timeout-ms + 1200000 - The arguments to pass to the health check script. - yarn.nodemanager.health-checker.script.opts - + Frequency of running node health scripts. + yarn.nodemanager.health-checker.interval-ms + 600000 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index 8d060b01adf..fee88478e01 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -29,9 +29,11 @@ import java.util.TimerTask; import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.DiskValidator; import org.apache.hadoop.util.DiskValidatorFactory; +import org.apache.hadoop.yarn.server.nodemanager.health.HealthReporter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,7 +44,6 @@ import org.apache.hadoop.fs.LocalDirAllocator; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; @@ -54,7 +55,8 @@ * directories of a node. This specifically manages nodemanager-local-dirs and * nodemanager-log-dirs by periodically checking their health. */ -public class LocalDirsHandlerService extends AbstractService { +public class LocalDirsHandlerService extends AbstractService + implements HealthReporter { private static final Logger LOG = LoggerFactory.getLogger(LocalDirsHandlerService.class); @@ -426,6 +428,11 @@ public String getDisksHealthReport(boolean listGoodDirs) { } + @Override + public String getHealthReport() { + return getDisksHealthReport(false); + } + /** * The minimum fraction of number of disks needed to be healthy for a node to * be considered healthy in terms of disks is configured using @@ -457,10 +464,20 @@ public boolean areDisksHealthy() { return true; } + @Override + public boolean isHealthy() { + return areDisksHealthy(); + } + public long getLastDisksCheckTime() { return lastDisksCheckTime; } + @Override + public long getLastHealthReportTime() { + return getLastDisksCheckTime(); + } + public boolean isGoodLocalDir(String path) { return isInGoodDirs(getLocalDirs(), path); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java deleted file mode 100644 index 7e2fc7e022d..00000000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java +++ /dev/null @@ -1,123 +0,0 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.hadoop.yarn.server.nodemanager; - -import com.google.common.base.Joiner; -import com.google.common.base.Strings; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.service.CompositeService; -import org.apache.hadoop.util.NodeHealthScriptRunner; - -import java.util.Arrays; -import java.util.Collections; - -/** - * The class which provides functionality of checking the health of the node and - * reporting back to the service for which the health checker has been asked to - * report. - */ -public class NodeHealthCheckerService extends CompositeService { - - private NodeHealthScriptRunner nodeHealthScriptRunner; - private LocalDirsHandlerService dirsHandler; - private Exception nodeHealthException; - private long nodeHealthExceptionReportTime; - - static final String SEPARATOR = ";"; - - public NodeHealthCheckerService(NodeHealthScriptRunner scriptRunner, - LocalDirsHandlerService dirHandlerService) { - super(NodeHealthCheckerService.class.getName()); - nodeHealthScriptRunner = scriptRunner; - dirsHandler = dirHandlerService; - nodeHealthException = null; - nodeHealthExceptionReportTime = 0; - } - - @Override - protected void serviceInit(Configuration conf) throws Exception { - if (nodeHealthScriptRunner != null) { - addService(nodeHealthScriptRunner); - } - addService(dirsHandler); - super.serviceInit(conf); - } - - /** - * @return the reporting string of health of the node - */ - String getHealthReport() { - String scriptReport = Strings.emptyToNull( - nodeHealthScriptRunner == null ? null : - nodeHealthScriptRunner.getHealthReport()); - String discReport = - Strings.emptyToNull( - dirsHandler.getDisksHealthReport(false)); - String exceptionReport = Strings.emptyToNull( - nodeHealthException == null ? null : - nodeHealthException.getMessage()); - - return Joiner.on(SEPARATOR).skipNulls() - .join(scriptReport, discReport, exceptionReport); - } - - /** - * @return true if the node is healthy - */ - boolean isHealthy() { - boolean scriptHealthy = nodeHealthScriptRunner == null || - nodeHealthScriptRunner.isHealthy(); - return nodeHealthException == null && - scriptHealthy && dirsHandler.areDisksHealthy(); - } - - /** - * @return when the last time the node health status is reported - */ - long getLastHealthReportTime() { - return Collections.max(Arrays.asList( - dirsHandler.getLastDisksCheckTime(), - nodeHealthScriptRunner == null ? 0 : - nodeHealthScriptRunner.getLastReportedTime(), - nodeHealthExceptionReportTime)); - } - - /** - * @return the disk handler - */ - public LocalDirsHandlerService getDiskHandler() { - return dirsHandler; - } - - /** - * @return the node health script runner - */ - NodeHealthScriptRunner getNodeHealthScriptRunner() { - return nodeHealthScriptRunner; - } - - /** - * Report an exception to mark the node as unhealthy. - * @param ex the exception that makes the node unhealthy - */ - void reportException(Exception ex) { - nodeHealthException = ex; - nodeHealthExceptionReportTime = System.currentTimeMillis(); - } -} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 17e65688041..f90423cf6b6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -33,7 +33,7 @@ import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.JvmPauseMonitor; -import org.apache.hadoop.util.NodeHealthScriptRunner; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.ShutdownHookManager; @@ -347,27 +347,6 @@ private void recoverTokens(NMTokenSecretManagerInNM nmTokenSecretManager, } } - public static NodeHealthScriptRunner getNodeHealthScriptRunner(Configuration conf) { - String nodeHealthScript = - conf.get(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH); - if(!NodeHealthScriptRunner.shouldRun(nodeHealthScript)) { - LOG.info("Node Manager health check script is not available " - + "or doesn't have execute permission, so not " - + "starting the node health script runner."); - return null; - } - long nmCheckintervalTime = conf.getLong( - YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, - YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS); - long scriptTimeout = conf.getLong( - YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, - YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS); - String[] scriptArgs = conf.getStrings( - YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS, new String[] {}); - return new NodeHealthScriptRunner(nodeHealthScript, - nmCheckintervalTime, scriptTimeout, scriptArgs); - } - @VisibleForTesting protected ResourcePluginManager createResourcePluginManager() { return new ResourcePluginManager(); @@ -431,12 +410,9 @@ protected void serviceInit(Configuration conf) throws Exception { // NodeManager level dispatcher this.dispatcher = createNMDispatcher(); - nodeHealthChecker = - new NodeHealthCheckerService( - getNodeHealthScriptRunner(conf), dirsHandler); + this.nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); addService(nodeHealthChecker); - ((NMContext)context).setContainerExecutor(exec); ((NMContext)context).setDeletionService(del); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 181094ea6c6..56427aacc2d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -86,6 +86,8 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.nodemanager.health.ExceptionReporter; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; @@ -855,7 +857,7 @@ private boolean handleShutdownOrResyncCommand( @Override public void reportException(Exception ex) { - healthChecker.reportException(ex); + healthChecker.getExceptionReporter().reportException(ex); sendOutofBandHeartBeat(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/ExceptionReporter.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/ExceptionReporter.java new file mode 100644 index 00000000000..828df03fa53 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/ExceptionReporter.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +/** + * Simple {@link HealthReporter} implementation which reports whether a fatal + * exception has happened in the NodeManager. + * + * See the reportException call of + * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl} + */ +public class ExceptionReporter implements HealthReporter { + private Exception nodeHealthException; + private long nodeHealthExceptionReportTime; + + ExceptionReporter() { + this.nodeHealthException = null; + this.nodeHealthExceptionReportTime = 0; + } + + @Override + public synchronized boolean isHealthy() { + return nodeHealthException == null; + } + + @Override + public synchronized String getHealthReport() { + return nodeHealthException == null ? null : + nodeHealthException.getMessage(); + } + + @Override + public synchronized long getLastHealthReportTime() { + return nodeHealthExceptionReportTime; + } + + /** + * Report an exception to mark the node as unhealthy. + * @param ex the exception that makes the node unhealthy + */ + public synchronized void reportException(Exception ex) { + nodeHealthException = ex; + nodeHealthExceptionReportTime = System.currentTimeMillis(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java new file mode 100644 index 00000000000..aef4a83da24 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +/** + * Interface providing information about the health of a service. + * + * Associated pieces of information: + * + * + * Classes implementing this interface are used in + * {@link NodeHealthCheckerService}. + * + * Developers are discouraged to implement new Java-based health scripts, + * they should rather try to implement it as a script and use the + * {@link NodeHealthScriptRunner} implementation. + * + * @see TimedHealthReporterService + * @see org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService + */ +public interface HealthReporter { + + /** + * Gets whether the node is healthy or not. + * + * @return true if node is healthy + */ + boolean isHealthy(); + + /** + * Returns output from health check. If node is healthy then an empty string + * is returned. + * + * @return output from health check + */ + String getHealthReport(); + + /** + * Returns time stamp when node health check was last run. + * + * @return timestamp when node health script was last run + */ + long getLastHealthReportTime(); +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java new file mode 100644 index 00000000000..02e98e7cd5a --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java @@ -0,0 +1,163 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.base.Strings; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.CompositeService; +import org.apache.hadoop.service.Service; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +/** + * This class provides functionality of checking the health of a node and + * reporting back to the service for which the health checker has been asked to + * report. + * + * It is a {@link CompositeService}: every {@link Service} must be registered + * first in serviceInit, and should also implement the {@link HealthReporter} + * interface - otherwise an exception is thrown. + * + * Calling functions of HealthReporter merge its dependent + * services' reports. + * + * @see HealthReporter + * @see LocalDirsHandlerService + * @see TimedHealthReporterService + */ +public class NodeHealthCheckerService extends CompositeService + implements HealthReporter { + + public static final Logger LOG = + LoggerFactory.getLogger(NodeHealthCheckerService.class); + public static final int MAX_SCRIPTS = 4; + + private List reporters; + private LocalDirsHandlerService dirsHandler; + private ExceptionReporter exceptionReporter; + + public static final String SEPARATOR = ";"; + + public NodeHealthCheckerService( + LocalDirsHandlerService dirHandlerService) { + super(NodeHealthCheckerService.class.getName()); + + this.reporters = new ArrayList<>(); + this.dirsHandler = dirHandlerService; + this.exceptionReporter = new ExceptionReporter(); + } + + @Override + protected void serviceInit(Configuration conf) throws Exception { + reporters.add(exceptionReporter); + addHealthReporter(dirsHandler); + String[] configuredScripts = conf.getTrimmedStrings( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPTS, + YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPTS); + if (configuredScripts.length > MAX_SCRIPTS) { + throw new IllegalArgumentException("Due to performance reasons " + + "running more than " + MAX_SCRIPTS + "scripts is not allowed."); + } + for (String configuredScript : configuredScripts) { + addHealthReporter(NodeHealthScriptRunner.newInstance( + configuredScript, conf)); + } + super.serviceInit(conf); + } + + /** + * Adds a {@link Service} implementing the {@link HealthReporter} interface, + * if that service has not been added to this {@link CompositeService} yet. + * + * @param service to add + * @throws Exception if not a {@link HealthReporter} + * implementation is provided to this function + */ + @VisibleForTesting + void addHealthReporter(Service service) throws Exception { + if (service != null) { + if (getServices().stream() + .noneMatch(x -> x.getName().equals(service.getName()))) { + if (!(service instanceof HealthReporter)) { + throw new Exception("Attempted to add service to " + + "NodeHealthCheckerService that does not implement " + + "HealthReporter."); + } + reporters.add((HealthReporter) service); + addService(service); + } else { + LOG.debug("Omitting duplicate service."); + } + } + } + + /** + * Joining the health reports of the dependent services. + * + * @return the report string about the health of the node + */ + @Override + public String getHealthReport() { + ArrayList reports = reporters.stream() + .map(reporter -> Strings.emptyToNull(reporter.getHealthReport())) + .collect(Collectors.toCollection(ArrayList::new)); + return Joiner.on(SEPARATOR).skipNulls().join(reports); + } + + /** + * @return true if the node is healthy + */ + @Override + public boolean isHealthy() { + return reporters.stream().allMatch(HealthReporter::isHealthy); + } + + /** + * @return when the last time the node health status is reported + */ + @Override + public long getLastHealthReportTime() { + Optional max = reporters.stream() + .map(HealthReporter::getLastHealthReportTime).max(Long::compareTo); + return max.orElse(0L); + } + + /** + * @return the disk handler + */ + public LocalDirsHandlerService getDiskHandler() { + return dirsHandler; + } + + /** + * @return the {@link ExceptionReporter} instance + */ + public ExceptionReporter getExceptionReporter() { + return exceptionReporter; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java similarity index 56% rename from hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java index f2a5b242a8d..4256049af65 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hadoop.util; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.IOException; @@ -27,51 +27,87 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.Shell.ExitCodeException; import org.apache.hadoop.util.Shell.ShellCommandExecutor; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * * The class which provides functionality of checking the health of the node * using the configured node health script and reporting back to the service * for which the health checker has been asked to report. */ -public class NodeHealthScriptRunner extends AbstractService { +public final class NodeHealthScriptRunner extends TimedHealthReporterService { private static final Logger LOG = LoggerFactory.getLogger(NodeHealthScriptRunner.class); /** Absolute path to the health script. */ private String nodeHealthScript; - /** Delay after which node health script to be executed */ - private long intervalTime; - /** Time after which the script should be timedout */ + /** Time after which the script should be timed out. */ private long scriptTimeout; - /** Timer used to schedule node health monitoring script execution */ - private Timer nodeHealthScriptScheduler; + /** ShellCommandExecutor used to execute monitoring script. */ + private ShellCommandExecutor shexec = null; - /** ShellCommandExecutor used to execute monitoring script */ - ShellCommandExecutor shexec = null; + /** Pattern used for searching in the output of the node health script. */ + private static final String ERROR_PATTERN = "ERROR"; - /** Pattern used for searching in the output of the node health script */ - static private final String ERROR_PATTERN = "ERROR"; + /** Time out error message. */ + static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = + "Node health script timed out"; - /** Time out error message */ - public static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = "Node health script timed out"; + private NodeHealthScriptRunner(String scriptName, long checkInterval, + long timeout, String[] scriptArgs) { + super(NodeHealthScriptRunner.class.getName(), checkInterval); + this.nodeHealthScript = scriptName; + this.scriptTimeout = timeout; + setTimerTask(new NodeHealthMonitorExecutor(scriptArgs)); + } - private boolean isHealthy; + public static NodeHealthScriptRunner newInstance(String scriptName, + Configuration conf) { + String nodeHealthScriptsConfig = String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE, scriptName); + String nodeHealthScript = conf.get(nodeHealthScriptsConfig); + if (!shouldRun(scriptName, nodeHealthScript)) { + return null; + } - private String healthReport; + // Determine check interval ms + String checkIntervalMsConfig = String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE, + scriptName); + long checkIntervalMs = conf.getLong(checkIntervalMsConfig, 0L); + if (checkIntervalMs == 0L) { + checkIntervalMs = conf.getLong( + YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, + YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS); + } - private long lastReportedTime; + // Determine time out + String scriptTimeoutConfig = String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE, + scriptName); + long scriptTimeout = conf.getLong(scriptTimeoutConfig, 0L); + if (scriptTimeout == 0L) { + scriptTimeout = conf.getLong( + YarnConfiguration.NM_HEALTH_CHECK_TIMEOUT_MS, + YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_TIMEOUT_MS); + } + + // Determine script arguments + String scriptArgsConfig = String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS_TEMPLATE, + scriptName); + String[] scriptArgs = conf.getStrings(scriptArgsConfig, new String[]{}); + + return new NodeHealthScriptRunner(nodeHealthScript, + checkIntervalMs, scriptTimeout, scriptArgs); + } - private TimerTask timer; - private enum HealthCheckerExitStatus { SUCCESS, TIMED_OUT, @@ -84,13 +120,11 @@ /** * Class which is used by the {@link Timer} class to periodically execute the * node health script. - * */ private class NodeHealthMonitorExecutor extends TimerTask { + private String exceptionStackTrace = ""; - String exceptionStackTrace = ""; - - public NodeHealthMonitorExecutor(String[] args) { + NodeHealthMonitorExecutor(String[] args) { ArrayList execScript = new ArrayList(); execScript.add(nodeHealthScript); if (args != null) { @@ -134,27 +168,29 @@ public void run() { /** * Method which is used to parse output from the node health monitor and * send to the report address. - * + * * The timed out script or script which causes IOException output is * ignored. - * + * * The node is marked unhealthy if *
    *
  1. The node health script times out
  2. - *
  3. The node health scripts output has a line which begins with ERROR
  4. + *
  5. The node health scripts output has a line which begins + * with ERROR
  6. *
  7. An exception is thrown while executing the script
  8. *
* If the script throws {@link IOException} or {@link ExitCodeException} the * output is ignored and node is left remaining healthy, as script might * have syntax error. - * + * * @param status */ void reportHealthStatus(HealthCheckerExitStatus status) { - long now = System.currentTimeMillis(); switch (status) { case SUCCESS: - setHealthStatus(true, "", now); + case FAILED_WITH_EXIT_CODE: + // see Javadoc above - we don't report bad health intentionally + setHealthStatus(true, ""); break; case TIMED_OUT: setHealthStatus(false, NODE_HEALTH_SCRIPT_TIMED_OUT_MSG); @@ -162,21 +198,19 @@ void reportHealthStatus(HealthCheckerExitStatus status) { case FAILED_WITH_EXCEPTION: setHealthStatus(false, exceptionStackTrace); break; - case FAILED_WITH_EXIT_CODE: - // see Javadoc above - we don't report bad health intentionally - setHealthStatus(true, "", now); - break; case FAILED: setHealthStatus(false, shexec.getOutput()); break; + default: + LOG.warn("Unknown HealthCheckerExitStatus - ignored."); + break; } } /** * Method to check if the output string has line which begins with ERROR. - * - * @param output - * string + * + * @param output the output of the node health script to process * @return true if output string has error pattern in it. */ private boolean hasErrors(String output) { @@ -190,150 +224,46 @@ private boolean hasErrors(String output) { } } - public NodeHealthScriptRunner(String scriptName, long chkInterval, long timeout, - String[] scriptArgs) { - super(NodeHealthScriptRunner.class.getName()); - this.lastReportedTime = System.currentTimeMillis(); - this.isHealthy = true; - this.healthReport = ""; - this.nodeHealthScript = scriptName; - this.intervalTime = chkInterval; - this.scriptTimeout = timeout; - this.timer = new NodeHealthMonitorExecutor(scriptArgs); - } - - /* - * Method which initializes the values for the script path and interval time. - */ - @Override - protected void serviceInit(Configuration conf) throws Exception { - super.serviceInit(conf); - } - - /** - * Method used to start the Node health monitoring. - * - */ @Override - protected void serviceStart() throws Exception { - nodeHealthScriptScheduler = new Timer("NodeHealthMonitor-Timer", true); - // Start the timer task immediately and - // then periodically at interval time. - nodeHealthScriptScheduler.scheduleAtFixedRate(timer, 0, intervalTime); - super.serviceStart(); - } - - /** - * Method used to terminate the node health monitoring service. - * - */ - @Override - protected void serviceStop() { - if (nodeHealthScriptScheduler != null) { - nodeHealthScriptScheduler.cancel(); - } + public void serviceStop() throws Exception { if (shexec != null) { Process p = shexec.getProcess(); if (p != null) { p.destroy(); } } + super.serviceStop(); } /** - * Gets the if the node is healthy or not - * - * @return true if node is healthy - */ - public boolean isHealthy() { - return isHealthy; - } - - /** - * Sets if the node is healthy or not considering disks' health also. - * - * @param isHealthy - * if or not node is healthy - */ - private synchronized void setHealthy(boolean isHealthy) { - this.isHealthy = isHealthy; - } - - /** - * Returns output from health script. if node is healthy then an empty string - * is returned. - * - * @return output from health script - */ - public String getHealthReport() { - return healthReport; - } - - /** - * Sets the health report from the node health script. Also set the disks' - * health info obtained from DiskHealthCheckerService. + * Method used to determine whether the {@link NodeHealthScriptRunner} + * should be started or not.

+ * Returns true if following conditions are met: * - * @param healthReport - */ - private synchronized void setHealthReport(String healthReport) { - this.healthReport = healthReport; - } - - /** - * Returns time stamp when node health script was last run. - * - * @return timestamp when node health script was last run - */ - public long getLastReportedTime() { - return lastReportedTime; - } - - /** - * Sets the last run time of the node health script. - * - * @param lastReportedTime - */ - private synchronized void setLastReportedTime(long lastReportedTime) { - this.lastReportedTime = lastReportedTime; - } - - /** - * Method used to determine if or not node health monitoring service should be - * started or not. Returns true if following conditions are met: - * *

    *
  1. Path to Node health check script is not empty
  2. *
  3. Node health check script file exists
  4. *
- * + * * @return true if node health monitoring service can be started. */ - public static boolean shouldRun(String healthScript) { + static boolean shouldRun(String script, String healthScript) { if (healthScript == null || healthScript.trim().isEmpty()) { + LOG.info("Missing location for the node health check script \"{}\".", + script); return false; } File f = new File(healthScript); - return f.exists() && FileUtil.canExecute(f); - } - - private synchronized void setHealthStatus(boolean isHealthy, String output) { - LOG.info("health status being set as " + output); - this.setHealthy(isHealthy); - this.setHealthReport(output); - } - - private synchronized void setHealthStatus(boolean isHealthy, String output, - long time) { - LOG.info("health status being set as " + output); - this.setHealthStatus(isHealthy, output); - this.setLastReportedTime(time); - } - - /** - * Used only by tests to access the timer task directly - * @return the timer task - */ - public TimerTask getTimerTask() { - return timer; + if (!f.exists()) { + LOG.warn("File {} for script \"{}\" does not exist.", + healthScript, script); + return false; + } + if (!FileUtil.canExecute(f)) { + LOG.warn("File {} for script \"{}\" can not be executed.", + healthScript, script); + return false; + } + return true; } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java new file mode 100644 index 00000000000..faceddbe707 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java @@ -0,0 +1,140 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.service.AbstractService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Timer; +import java.util.TimerTask; + +/** + * A {@link HealthReporter} skeleton for regularly checking a specific + * {@link TimerTask} and obtaining information about it. + * + * @see NodeHealthScriptRunner + */ +public abstract class TimedHealthReporterService extends AbstractService + implements HealthReporter { + + private static final Logger LOG = + LoggerFactory.getLogger(TimedHealthReporterService.class); + + private boolean isHealthy; + private String healthReport; + private long lastReportedTime; + + private Timer timer; + private TimerTask task; + private long intervalMs; + + TimedHealthReporterService(String name, long intervalMs) { + super(name); + this.isHealthy = true; + this.healthReport = ""; + this.lastReportedTime = System.currentTimeMillis(); + this.intervalMs = intervalMs; + } + + @VisibleForTesting + void setTimerTask(TimerTask timerTask) { + task = timerTask; + } + + @VisibleForTesting + TimerTask getTimerTask() { + return task; + } + + /** + * Method used to start the health monitoring. + */ + @Override + public void serviceStart() throws Exception { + if (task == null) { + throw new Exception("Health reporting task hasn't been set!"); + } + timer = new Timer("HealthReporterService-Timer", true); + timer.scheduleAtFixedRate(task, 0, intervalMs); + super.serviceStart(); + } + + /** + * Method used to terminate the health monitoring service. + */ + @Override + protected void serviceStop() throws Exception { + if (timer != null) { + timer.cancel(); + } + super.serviceStop(); + } + + @Override + public boolean isHealthy() { + return isHealthy; + } + + /** + * Sets if the node is healthy or not. + * + * @param healthy whether the node is healthy + */ + protected synchronized void setHealthy(boolean healthy) { + this.isHealthy = healthy; + } + + @Override + public String getHealthReport() { + return healthReport; + } + + /** + * Sets the health report from the node health check. Also set the disks' + * health info obtained from DiskHealthCheckerService. + * + * @param report report String + */ + private synchronized void setHealthReport(String report) { + this.healthReport = report; + } + + @Override + public long getLastHealthReportTime() { + return lastReportedTime; + } + + /** + * Sets the last run time of the node health check. + * + * @param lastReportedTime last reported time in long + */ + private synchronized void setLastReportedTime(long lastReportedTime) { + this.lastReportedTime = lastReportedTime; + } + + synchronized void setHealthStatus(boolean healthy, String output) { + LOG.info("Health status being set as: \"" + output + "\"."); + this.setHealthy(healthy); + this.setHealthReport(output); + this.setLastReportedTime(System.currentTimeMillis()); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java index 2e80259d210..b8f623a36b5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java @@ -38,6 +38,7 @@ import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java index 13b3ee91bdc..fcb76f9e75f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java @@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.junit.Assert; import org.junit.Before; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java index 54e090a29e2..b1fc2f1aa26 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java @@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; @@ -102,8 +103,8 @@ public int getHttpPort() { DeletionService del = new DeletionService(exec); Dispatcher dispatcher = new AsyncDispatcher(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); NodeManagerMetrics metrics = NodeManagerMetrics.create(); NodeStatusUpdater nodeStatusUpdater = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java index fbd3646940d..260c3c4144c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java @@ -64,6 +64,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.deletion.task.FileDeletionMatcher; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Assert; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java index 25cca876ac6..9eae82a9322 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java @@ -86,6 +86,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java index 25dbc1dd2ea..9a0213d87cf 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java @@ -66,6 +66,7 @@ import org.apache.hadoop.yarn.security.NMTokenIdentifier; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.ConverterUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 1b21b936543..c0831ee022d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.nodemanager; import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -107,6 +108,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; @@ -1994,4 +1996,21 @@ protected NodeStatusUpdater createNodeStatusUpdater(Context context, } }; } + + @Test + public void testExceptionReported() { + nm = new NodeManager(); + YarnConfiguration conf = new YarnConfiguration(); + nm.init(conf); + NodeStatusUpdater nodeStatusUpdater = nm.getNodeStatusUpdater(); + NodeHealthCheckerService nodeHealthChecker = nm.getNodeHealthChecker(); + + assertThat(nodeHealthChecker.isHealthy()).isTrue(); + + String message = "exception message"; + Exception e = new Exception(message); + nodeStatusUpdater.reportException(e); + assertThat(nodeHealthChecker.isHealthy()).isFalse(); + assertThat(nodeHealthChecker.getHealthReport()).isEqualTo(message); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java index 325d60c59be..072f4432c62 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java @@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java index a86ca3e8211..e3dce3b5b51 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java @@ -50,6 +50,7 @@ import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java index 09c836e596f..7a85bfab44e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java @@ -75,7 +75,7 @@ import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalRMInterface; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; @@ -218,8 +218,7 @@ public void setup() throws IOException { delSrvc.init(conf); dirsHandler = new LocalDirsHandlerService(); - nodeHealthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); nodeHealthChecker.init(conf); containerManager = createContainerManager(delSrvc); ((NMContext)context).setContainerManager(containerManager); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index de20abf6682..826cc02219b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -85,8 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -157,8 +156,7 @@ public void setup() throws IOException { delSrvc.init(conf); exec = createContainerExecutor(); dirsHandler = new LocalDirsHandlerService(); - nodeHealthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); nodeHealthChecker.init(conf); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java index 28f917fd842..d324ffbd2d1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java @@ -30,7 +30,6 @@ import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; @@ -44,6 +43,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.*; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.util.resource.TestResourceUtils; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestExceptionReporter.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestExceptionReporter.java new file mode 100644 index 00000000000..b8f5e1a09d8 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestExceptionReporter.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.server.nodemanager.health; + +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Tests for the {@link ExceptionReporter} class. + */ +public class TestExceptionReporter { + @Test + public void testUnhealthy() { + ExceptionReporter reporter = new ExceptionReporter(); + assertThat(reporter.isHealthy()).isTrue(); + assertThat(reporter.getLastHealthReportTime()).isZero(); + + String message = "test"; + Exception exception = new Exception(message); + reporter.reportException(exception); + assertThat(reporter.isHealthy()).isFalse(); + assertThat(reporter.getHealthReport()).isEqualTo(message); + assertThat(reporter.getLastHealthReportTime()).isNotEqualTo(0); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthCheckerService.java similarity index 55% rename from hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthCheckerService.java index 8083a563773..2544d5b4a41 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthCheckerService.java @@ -16,12 +16,15 @@ * limitations under the License. */ -package org.apache.hadoop.yarn.server.nodemanager; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; + +import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,7 +34,6 @@ import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.factories.RecordFactory; @@ -42,58 +44,79 @@ import org.junit.Before; import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.fail; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.spy; -public class TestNodeHealthService { +/** + * Test class for {@link NodeHealthCheckerService}. + */ +public class TestNodeHealthCheckerService { - private static volatile Logger LOG = - LoggerFactory.getLogger(TestNodeHealthService.class); + private static final Logger LOG = + LoggerFactory.getLogger(TestNodeHealthCheckerService.class); - protected static File testRootDir = new File("target", - TestNodeHealthService.class.getName() + "-localDir").getAbsoluteFile(); + private static final File TEST_ROOT_DIR = new File("target", + TestNodeHealthCheckerService.class.getName() + "-localDir") + .getAbsoluteFile(); - final static File nodeHealthConfigFile = new File(testRootDir, + private static final File NODE_HEALTH_CONFIG_FILE = new File(TEST_ROOT_DIR, "modified-mapred-site.xml"); - private File nodeHealthscriptFile = new File(testRootDir, + private File nodeHealthscriptFile = new File(TEST_ROOT_DIR, Shell.appendScriptExtension("failingscript")); @Before public void setup() { - testRootDir.mkdirs(); + TEST_ROOT_DIR.mkdirs(); } @After public void tearDown() throws Exception { - if (testRootDir.exists()) { + if (TEST_ROOT_DIR.exists()) { FileContext.getLocalFSFileContext().delete( - new Path(testRootDir.getAbsolutePath()), true); + new Path(TEST_ROOT_DIR.getAbsolutePath()), true); } } - - private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) - throws IOException { + + private void writeNodeHealthScriptFile() throws IOException { PrintWriter pw = null; try { FileUtil.setWritable(nodeHealthscriptFile, true); FileUtil.setReadable(nodeHealthscriptFile, true); pw = new PrintWriter(new FileOutputStream(nodeHealthscriptFile)); - pw.println(scriptStr); + pw.println(""); pw.flush(); } finally { - pw.close(); + if (pw != null) { + pw.close(); + } } - FileUtil.setExecutable(nodeHealthscriptFile, setExecutable); + FileUtil.setExecutable(nodeHealthscriptFile, true); } - private Configuration getConfForNodeHealthScript() { + private Configuration getConfForNodeHealthScript(String scriptName) { Configuration conf = new Configuration(); - conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH, - nodeHealthscriptFile.getAbsolutePath()); - conf.setLong(YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, 500); - conf.setLong( - YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, 1000); + conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPTS, scriptName); + String timeoutConfig = + String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE, + scriptName); + conf.setLong(timeoutConfig, 1000L); + + String intervalConfig = + String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE, + scriptName); + conf.setLong(intervalConfig, 500L); + + String pathConfig = + String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE, + scriptName); + conf.set(pathConfig, nodeHealthscriptFile.getAbsolutePath()); + return conf; } @@ -109,16 +132,22 @@ public void testNodeHealthService() throws Exception { RecordFactory factory = RecordFactoryProvider.getRecordFactory(null); NodeHealthStatus healthStatus = factory.newRecordInstance(NodeHealthStatus.class); - Configuration conf = getConfForNodeHealthScript(); - conf.writeXml(new FileOutputStream(nodeHealthConfigFile)); - conf.addResource(nodeHealthConfigFile.getName()); - writeNodeHealthScriptFile("", true); + String scriptName = "test"; + Configuration conf = getConfForNodeHealthScript(scriptName); + conf.writeXml(new FileOutputStream(NODE_HEALTH_CONFIG_FILE)); + conf.addResource(NODE_HEALTH_CONFIG_FILE.getName()); + writeNodeHealthScriptFile(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthScriptRunner nodeHealthScriptRunner = - spy(NodeManager.getNodeHealthScriptRunner(conf)); - NodeHealthCheckerService nodeHealthChecker = new NodeHealthCheckerService( - nodeHealthScriptRunner, dirsHandler); + NodeHealthScriptRunner.newInstance(scriptName, conf); + if (nodeHealthScriptRunner == null) { + fail("Should have created NodeHealthScriptRunner instance"); + } + nodeHealthScriptRunner = spy(nodeHealthScriptRunner); + NodeHealthCheckerService nodeHealthChecker = + new NodeHealthCheckerService(dirsHandler); + nodeHealthChecker.addHealthReporter(nodeHealthScriptRunner); nodeHealthChecker.init(conf); doReturn(true).when(nodeHealthScriptRunner).isHealthy(); @@ -133,7 +162,7 @@ public void testNodeHealthService() throws Exception { Assert.assertTrue("Node health status reported unhealthy", healthStatus .getHealthReport().equals(nodeHealthChecker.getHealthReport())); - doReturn(false).when(nodeHealthScriptRunner).isHealthy(); + doReturn(false).when(nodeHealthScriptRunner).isHealthy(); // update health status setHealthStatus(healthStatus, nodeHealthChecker.isHealthy(), nodeHealthChecker.getHealthReport(), @@ -174,4 +203,64 @@ public void testNodeHealthService() throws Exception { .getDisksHealthReport(false)) ))); } + + private abstract class HealthReporterService extends AbstractService + implements HealthReporter { + HealthReporterService() { + super(HealthReporterService.class.getName()); + } + } + + @Test + public void testCustomHealthReporter() throws Exception { + String healthReport = "dummy health report"; + HealthReporterService customHealthReporter = new HealthReporterService() { + private int counter = 0; + + @Override + public boolean isHealthy() { + return counter++ % 2 == 0; + } + + @Override + public String getHealthReport() { + return healthReport; + } + + @Override + public long getLastHealthReportTime() { + return Long.MAX_VALUE; + } + }; + + Configuration conf = new Configuration(); + LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); + NodeHealthCheckerService nodeHealthChecker = + new NodeHealthCheckerService(dirsHandler); + nodeHealthChecker.addHealthReporter(customHealthReporter); + nodeHealthChecker.init(conf); + + assertThat(nodeHealthChecker.isHealthy()).isTrue(); + assertThat(nodeHealthChecker.isHealthy()).isFalse(); + assertThat(nodeHealthChecker.getHealthReport()).isEqualTo(healthReport); + assertThat(nodeHealthChecker.getLastHealthReportTime()) + .isEqualTo(Long.MAX_VALUE); + } + + @Test + public void testExceptionReported() { + Configuration conf = new Configuration(); + LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); + NodeHealthCheckerService nodeHealthChecker = + new NodeHealthCheckerService(dirsHandler); + nodeHealthChecker.init(conf); + ExceptionReporter reporter = nodeHealthChecker.getExceptionReporter(); + assertThat(nodeHealthChecker.isHealthy()).isTrue(); + + String message = "An exception was thrown."; + Exception exception = new Exception(message); + reporter.reportException(exception); + assertThat(nodeHealthChecker.isHealthy()).isFalse(); + assertThat(nodeHealthChecker.getHealthReport()).isEqualTo(message); + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java similarity index 58% rename from hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java index 2748c0b581a..35885ccdadb 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hadoop.util; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.FileOutputStream; @@ -28,14 +28,22 @@ import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.junit.After; -import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Test class for {@link NodeHealthScriptRunner}. + */ public class TestNodeHealthScriptRunner { - protected static File testRootDir = new File("target", + private static File testRootDir = new File("target", TestNodeHealthScriptRunner.class.getName() + "-localDir").getAbsoluteFile(); @@ -55,8 +63,8 @@ public void tearDown() throws Exception { } } - private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) - throws IOException { + private void writeNodeHealthScriptFile(String scriptStr, + boolean setExecutable) throws IOException { PrintWriter pw = null; try { FileUtil.setWritable(nodeHealthscriptFile, true); @@ -70,20 +78,46 @@ private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) FileUtil.setExecutable(nodeHealthscriptFile, setExecutable); } + private NodeHealthScriptRunner createNodeHealthScript() { + String scriptName = "custom"; + + YarnConfiguration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPTS, scriptName); + String timeoutConfig = + String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE, + scriptName); + conf.setLong(timeoutConfig, 1000L); + + String intervalConfig = + String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE, + scriptName); + conf.setLong(intervalConfig, 500L); + + String pathConfig = + String.format( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE, + scriptName); + conf.set(pathConfig, nodeHealthscriptFile.getAbsolutePath()); + + return NodeHealthScriptRunner.newInstance("custom", conf); + } + @Test public void testNodeHealthScriptShouldRun() throws IOException { - Assert.assertFalse("Node health script should start", - NodeHealthScriptRunner.shouldRun( + assertFalse("Node health script should start", + NodeHealthScriptRunner.shouldRun("script", nodeHealthscriptFile.getAbsolutePath())); writeNodeHealthScriptFile("", false); // Node health script should not start if the node health script is not // executable. - Assert.assertFalse("Node health script should start", - NodeHealthScriptRunner.shouldRun( + assertFalse("Node health script should start", + NodeHealthScriptRunner.shouldRun("script", nodeHealthscriptFile.getAbsolutePath())); writeNodeHealthScriptFile("", true); - Assert.assertTrue("Node health script should start", - NodeHealthScriptRunner.shouldRun( + assertTrue("Node health script should start", + NodeHealthScriptRunner.shouldRun("script", nodeHealthscriptFile.getAbsolutePath())); } @@ -92,54 +126,53 @@ public void testNodeHealthScript() throws Exception { String errorScript = "echo ERROR\n echo \"Tracker not healthy\""; String normalScript = "echo \"I am all fine\""; String timeOutScript = - Shell.WINDOWS ? "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\"" - : "sleep 4\necho \"I am fine\""; + Shell.WINDOWS ? + "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\"" + : "sleep 4\necho \"I am fine\""; String exitCodeScript = "exit 127"; Configuration conf = new Configuration(); writeNodeHealthScriptFile(normalScript, true); - NodeHealthScriptRunner nodeHealthScriptRunner = new NodeHealthScriptRunner( - nodeHealthscriptFile.getAbsolutePath(), - 500, 1000, new String[] {}); + NodeHealthScriptRunner nodeHealthScriptRunner = createNodeHealthScript(); nodeHealthScriptRunner.init(conf); TimerTask timerTask = nodeHealthScriptRunner.getTimerTask(); timerTask.run(); // Normal Script runs successfully - Assert.assertTrue("Node health status reported unhealthy", + assertTrue("Node health status reported unhealthy", nodeHealthScriptRunner.isHealthy()); - Assert.assertEquals("", nodeHealthScriptRunner.getHealthReport()); + assertTrue(nodeHealthScriptRunner.getHealthReport().isEmpty()); // Error script. writeNodeHealthScriptFile(errorScript, true); // Run timer timerTask.run(); - Assert.assertFalse("Node health status reported healthy", + assertFalse("Node health status reported healthy", nodeHealthScriptRunner.isHealthy()); - Assert.assertTrue( + assertTrue( nodeHealthScriptRunner.getHealthReport().contains("ERROR")); - + // Healthy script. writeNodeHealthScriptFile(normalScript, true); timerTask.run(); - Assert.assertTrue("Node health status reported unhealthy", + assertTrue("Node health status reported unhealthy", nodeHealthScriptRunner.isHealthy()); - Assert.assertEquals("", nodeHealthScriptRunner.getHealthReport()); + assertTrue(nodeHealthScriptRunner.getHealthReport().isEmpty()); // Timeout script. writeNodeHealthScriptFile(timeOutScript, true); timerTask.run(); - Assert.assertFalse("Node health status reported healthy even after timeout", - nodeHealthScriptRunner.isHealthy()); - Assert.assertEquals( - NodeHealthScriptRunner.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG, - nodeHealthScriptRunner.getHealthReport()); + assertFalse("Node health status reported healthy even after timeout", + nodeHealthScriptRunner.isHealthy()); + assertEquals( + NodeHealthScriptRunner.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG, + nodeHealthScriptRunner.getHealthReport()); // Exit code 127 writeNodeHealthScriptFile(exitCodeScript, true); timerTask.run(); - Assert.assertTrue("Node health status reported unhealthy", + assertTrue("Node health status reported unhealthy", nodeHealthScriptRunner.isHealthy()); - Assert.assertEquals("", nodeHealthScriptRunner.getHealthReport()); + assertEquals("", nodeHealthScriptRunner.getHealthReport()); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java index ece1af4a260..71716da37e7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java @@ -43,7 +43,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.nativeio.NativeIO; import org.apache.hadoop.security.UserGroupInformation; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -57,7 +56,6 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -65,6 +63,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.webapp.ContainerLogsPage.ContainersLogsBlock; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; @@ -79,10 +78,9 @@ public class TestContainerLogsPage { - private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } @Test(timeout=30000) @@ -92,7 +90,7 @@ public void testContainerLogDirs() throws IOException, YarnException { String logdirwithFile = absLogDir.toURI().toString(); Configuration conf = new Configuration(); conf.set(YarnConfiguration.NM_LOG_DIRS, logdirwithFile); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); NMContext nmContext = new NodeManager.NMContext(null, null, dirsHandler, @@ -215,7 +213,7 @@ public void testContainerLogPageAccess() throws IOException { "kerberos"); UserGroupInformation.setConfiguration(conf); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); // Add an application and the corresponding containers diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java index 0d618fde10f..1e636650463 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java @@ -20,14 +20,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.eclipse.jetty.websocket.api.Session; import org.eclipse.jetty.websocket.api.UpgradeRequest; @@ -105,8 +104,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService( - conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); @@ -120,12 +118,9 @@ public boolean isPmemCheckEnabled() { } } - private NodeHealthCheckerService createNodeHealthCheckerService( - Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner( - conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } @Test diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java index 0a71a9179bb..cbfaa177921 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java @@ -28,7 +28,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -42,19 +41,19 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; -import org.apache.hadoop.yarn.util.ConverterUtils; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -79,10 +78,9 @@ public void tearDown() { FileUtil.fullyDelete(testLogDir); } - private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } private int startNMWebAppServer(String webAddr) { @@ -113,7 +111,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); @@ -176,7 +174,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java index ad17ae81322..d2903a9ab15 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java @@ -47,7 +47,6 @@ import org.apache.hadoop.yarn.logaggregation.TestContainerLogsUtils; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; @@ -57,6 +56,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; @@ -141,8 +141,8 @@ protected void configureServlets() { conf.set(YarnConfiguration.YARN_LOG_SERVER_WEBSERVICE_URL, LOGSERVICEWSADDR); dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); aclsManager = new ApplicationACLsManager(conf); nmContext = new NodeManager.NMContext(null, null, dirsHandler, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java index 3533d16849d..ab06c0f9f33 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java @@ -47,13 +47,13 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.AppsInfo; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; @@ -104,8 +104,8 @@ protected void configureServlets() { conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java index 4ee63db8177..7ec8fcd47d3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java @@ -44,11 +44,11 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices; import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecord; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; @@ -124,8 +124,8 @@ public boolean isPmemCheckEnabled() { conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); ApplicationACLsManager aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java index a99ce280381..175a0b02470 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java @@ -28,7 +28,6 @@ import java.io.File; import java.io.IOException; import java.io.StringReader; -import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -48,16 +47,15 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; -import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; import org.apache.hadoop.yarn.webapp.GuiceServletConfig; import org.apache.hadoop.yarn.webapp.JerseyTestBase; @@ -131,8 +129,8 @@ public boolean isPmemCheckEnabled() { conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java index ec7d62c803c..d4180e48251 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java @@ -26,13 +26,12 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.http.JettyUtils; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.junit.After; import org.junit.Before; @@ -54,12 +53,9 @@ private WebServer server; private int port; - private NodeHealthCheckerService createNodeHealthCheckerService( - Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager - .getNodeHealthScriptRunner(conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } private int startNMWebAppServer(String webAddr) { @@ -90,7 +86,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, TESTLOGDIR.getAbsolutePath()); - healthChecker = createNodeHealthCheckerService(conf); + healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java index 68d97ee32ac..01045342fc1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java @@ -70,7 +70,6 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl; @@ -80,8 +79,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl; - - +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md index e4ed57f5cb4..7b3082bf90b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md @@ -42,9 +42,9 @@ The following configuration parameters can be used to modify the disk checks: | `yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage` | Float between 0-100 | The maximum percentage of disk space that may be utilized before a disk is marked as unhealthy by the disk checker service. This check is run for every disk used by the NodeManager. The default value is 90 i.e. 90% of the disk can be used. | | `yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb` | Integer | The minimum amount of free space that must be available on the disk for the disk checker service to mark the disk as healthy. This check is run for every disk used by the NodeManager. The default value is 0 i.e. the entire disk can be used. | -###External Health Script +### External Health Script -Users may specify their own health checker script that will be invoked by the health checker service. Users may specify a timeout as well as options to be passed to the script. If the script times out, results in an exception being thrown or outputs a line which begins with the string ERROR, the node is marked as unhealthy. Please note that: +Users may specify their own health checker scripts that will be invoked by the health checker service. Users may specify a timeout as well as options to be passed to the script. If the script times out, results in an exception being thrown or outputs a line which begins with the string ERROR, the node is marked as unhealthy. Please note that: * Exit code other than 0 is **not** considered to be a failure because it might have been caused by a syntax error. Therefore the node will **not** be marked as unhealthy. @@ -52,15 +52,24 @@ Users may specify their own health checker script that will be invoked by the he * Specifying a health check script is not mandatory. If no script is specified, only the disk checker status will be used to determine the health of the node. -The following configuration parameters can be used to set the health script: +Users can specify up to 4 scripts to run individually with the `yarn.nodemanager.health-checker.script` configuration. Also these options can be configured for all scripts (global configurations): | Configuration Name | Allowed Values | Description | |:---- |:---- |:---- | +|`yarn.nodemanager.health-checker.script`| String | The keywords for the health checker scripts separated by a comma. The default is "script". | | `yarn.nodemanager.health-checker.interval-ms` | Postive integer | The interval, in milliseconds, at which health checker service runs; the default value is 10 minutes. | -| `yarn.nodemanager.health-checker.script.timeout-ms` | Postive integer | The timeout for the health script that's executed; the default value is 20 minutes. | -| `yarn.nodemanager.health-checker.script.path` | String | Absolute path to the health check script to be run. | -| `yarn.nodemanager.health-checker.script.opts` | String | Arguments to be passed to the script when the script is executed. | +| `yarn.nodemanager.health-checker.timeout-ms` | Postive integer | The timeout for the health script that's executed; the default value is 20 minutes. | +The following options can be set for every health checker script. The %s symbol is substituted with each keyword provided in `yarn.nodemanager.health-checker.script`. + +| Configuration Name | Allowed Values | Description | +|:---- |:---- |:---- | +| `yarn.nodemanager.health-checker.%s.path` | String | Absolute path to the health check script to be run. Mandatory argument for each script. | +| `yarn.nodemanager.health-checker.%s.opts` | String | Arguments to be passed to the script when the script is executed. Mandatory argument for each script. | +| `yarn.nodemanager.health-checker.%s.interval-ms` | Postive integer | The interval, in milliseconds, at which health checker service runs. | +| `yarn.nodemanager.health-checker.%s.timeout-ms` | Postive integer | The timeout for the health script that's executed. | + +The interval and timeout options are not required to be specified. In that case the global configurations will be used. NodeManager Restart ------------------- -- 2.21.0