From eafb6887958e0b018d313f8ac5b830cb21f0b013 Mon Sep 17 00:00:00 2001 From: Adam Antal Date: Thu, 7 Nov 2019 19:22:09 +0100 Subject: [PATCH] YARN-9923. Detect missing Docker binary or not running Docker daemon --- .../dev-support/findbugs-exclude.xml | 22 +- .../hadoop/yarn/conf/YarnConfiguration.java | 18 ++ .../src/main/resources/yarn-default.xml | 18 ++ .../nodemanager/LocalDirsHandlerService.java | 21 +- .../nodemanager/NodeHealthCheckerService.java | 123 ---------- .../yarn/server/nodemanager/NodeManager.java | 28 +-- .../nodemanager/NodeStatusUpdaterImpl.java | 1 + .../health/DockerHealthCheckerService.java | 189 +++++++++++++++ .../nodemanager/health/HealthReporter.java | 60 +++++ .../health/NodeHealthCheckerService.java | 160 ++++++++++++ .../health}/NodeHealthScriptRunner.java | 228 +++++------------- .../health/TimedHealthReporterService.java | 144 +++++++++++ .../nodemanager/MockNodeStatusUpdater.java | 2 + .../nodemanager/NodeManagerTestBase.java | 1 + .../server/nodemanager/TestEventFlow.java | 5 +- .../nodemanager/TestNodeManagerReboot.java | 1 + .../nodemanager/TestNodeManagerResync.java | 1 + .../nodemanager/TestNodeManagerShutdown.java | 1 + .../nodemanager/TestNodeStatusUpdater.java | 1 + .../TestNodeStatusUpdaterForAttributes.java | 1 + .../TestNodeStatusUpdaterForLabels.java | 1 + .../BaseContainerManagerTest.java | 5 +- .../TestContainerManagerRecovery.java | 7 +- .../TestResourcePluginManager.java | 2 +- .../TestDockerHealthCheckerService.java | 162 +++++++++++++ .../TestNodeHealthCheckerService.java} | 107 ++++++-- .../health}/TestNodeHealthScriptRunner.java | 21 +- .../webapp/TestContainerLogsPage.java | 6 +- .../webapp/TestNMContainerWebSocket.java | 13 +- .../nodemanager/webapp/TestNMWebServer.java | 13 +- .../nodemanager/webapp/TestNMWebServices.java | 6 +- .../webapp/TestNMWebServicesApps.java | 4 +- .../webapp/TestNMWebServicesAuxServices.java | 4 +- .../webapp/TestNMWebServicesContainers.java | 6 +- 
.../nodemanager/webapp/TestNMWebTerminal.java | 12 +- .../hadoop/yarn/server/MiniYARNCluster.java | 2 +- .../src/site/markdown/NodeManager.md | 11 + 37 files changed, 1007 insertions(+), 400 deletions(-) delete mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java rename {hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util => hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health}/NodeHealthScriptRunner.java (56%) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java rename hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/{TestNodeHealthService.java => health/TestNodeHealthCheckerService.java} (66%) rename {hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util => 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health}/TestNodeHealthScriptRunner.java (90%) diff --git a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index e3149f079c6e765f4343868df211ad223b334ba6..610c66dce3b3f33e2333cf4fa88ac4645ce0b241 100644 --- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -662,11 +662,17 @@ - + + + + + + + @@ -701,4 +707,18 @@ + + + + + + + + + + + + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index d9840ac9999e0396cd627551ee48f1aeb33ba479..0185541ea61933cf3e36f714a17024bcd70c330b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1977,6 +1977,24 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_HEALTH_CHECK_SCRIPT_OPTS = NM_PREFIX + "health-checker.script.opts"; + public static final String NM_DOCKER_HEALTH_CHECKER_PREFIX = + NM_PREFIX + "docker-health-checker."; + + public static final String NM_DOCKER_HEALTH_CHECKER_ENABLE = + NM_DOCKER_HEALTH_CHECKER_PREFIX + "enable"; + public static final boolean DEFAULT_NM_DOCKER_HEALTH_CHECKER_ENABLE = + false; + + public static final String NM_DOCKER_HEALTH_CHECKER_STARTUP = + NM_DOCKER_HEALTH_CHECKER_PREFIX + "startup"; + public static final boolean DEFAULT_NM_DOCKER_HEALTH_CHECKER_STARTUP = + false; + + public static final String NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS = + 
NM_DOCKER_HEALTH_CHECKER_PREFIX + "interval-ms"; + public static final long DEFAULT_NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS = + DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS; + /** The JVM options used on forking ContainerLocalizer process by container executor. */ public static final String NM_CONTAINER_LOCALIZER_JAVA_OPTS_KEY = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 907f290afad2c83ae0a1dd388a5578020d7fb66e..3434ef8f025ba6824d5b503b8d791df23b201fd7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1619,6 +1619,24 @@ + + + yarn.nodemanager.docker-health-checker.enable + false + + + + + yarn.nodemanager.docker-health-checker.startup + false + + + + + yarn.nodemanager.docker-health-checker.interval-ms + 600000 + + Frequency of running disk health checker code. 
yarn.nodemanager.disk-health-checker.interval-ms diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index 8d060b01adf0fdcae56f6d1ed562ddf87ea6c7e7..fee88478e017f63bfd61944833522368a478e228 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -29,9 +29,11 @@ import java.util.TimerTask; import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.DiskValidator; import org.apache.hadoop.util.DiskValidatorFactory; +import org.apache.hadoop.yarn.server.nodemanager.health.HealthReporter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,7 +44,6 @@ import org.apache.hadoop.fs.LocalDirAllocator; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; @@ -54,7 +55,8 @@ * directories of a node. This specifically manages nodemanager-local-dirs and * nodemanager-log-dirs by periodically checking their health. 
*/ -public class LocalDirsHandlerService extends AbstractService { +public class LocalDirsHandlerService extends AbstractService + implements HealthReporter { private static final Logger LOG = LoggerFactory.getLogger(LocalDirsHandlerService.class); @@ -426,6 +428,11 @@ public String getDisksHealthReport(boolean listGoodDirs) { } + @Override + public String getHealthReport() { + return getDisksHealthReport(false); + } + /** * The minimum fraction of number of disks needed to be healthy for a node to * be considered healthy in terms of disks is configured using @@ -457,10 +464,20 @@ public boolean areDisksHealthy() { return true; } + @Override + public boolean isHealthy() { + return areDisksHealthy(); + } + public long getLastDisksCheckTime() { return lastDisksCheckTime; } + @Override + public long getLastHealthReportTime() { + return getLastDisksCheckTime(); + } + public boolean isGoodLocalDir(String path) { return isInGoodDirs(getLocalDirs(), path); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java deleted file mode 100644 index 7e2fc7e022dc82ae732f3b9deb8f515727a49f39..0000000000000000000000000000000000000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java +++ /dev/null @@ -1,123 +0,0 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. 
The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.hadoop.yarn.server.nodemanager; - -import com.google.common.base.Joiner; -import com.google.common.base.Strings; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.service.CompositeService; -import org.apache.hadoop.util.NodeHealthScriptRunner; - -import java.util.Arrays; -import java.util.Collections; - -/** - * The class which provides functionality of checking the health of the node and - * reporting back to the service for which the health checker has been asked to - * report. 
- */ -public class NodeHealthCheckerService extends CompositeService { - - private NodeHealthScriptRunner nodeHealthScriptRunner; - private LocalDirsHandlerService dirsHandler; - private Exception nodeHealthException; - private long nodeHealthExceptionReportTime; - - static final String SEPARATOR = ";"; - - public NodeHealthCheckerService(NodeHealthScriptRunner scriptRunner, - LocalDirsHandlerService dirHandlerService) { - super(NodeHealthCheckerService.class.getName()); - nodeHealthScriptRunner = scriptRunner; - dirsHandler = dirHandlerService; - nodeHealthException = null; - nodeHealthExceptionReportTime = 0; - } - - @Override - protected void serviceInit(Configuration conf) throws Exception { - if (nodeHealthScriptRunner != null) { - addService(nodeHealthScriptRunner); - } - addService(dirsHandler); - super.serviceInit(conf); - } - - /** - * @return the reporting string of health of the node - */ - String getHealthReport() { - String scriptReport = Strings.emptyToNull( - nodeHealthScriptRunner == null ? null : - nodeHealthScriptRunner.getHealthReport()); - String discReport = - Strings.emptyToNull( - dirsHandler.getDisksHealthReport(false)); - String exceptionReport = Strings.emptyToNull( - nodeHealthException == null ? null : - nodeHealthException.getMessage()); - - return Joiner.on(SEPARATOR).skipNulls() - .join(scriptReport, discReport, exceptionReport); - } - - /** - * @return true if the node is healthy - */ - boolean isHealthy() { - boolean scriptHealthy = nodeHealthScriptRunner == null || - nodeHealthScriptRunner.isHealthy(); - return nodeHealthException == null && - scriptHealthy && dirsHandler.areDisksHealthy(); - } - - /** - * @return when the last time the node health status is reported - */ - long getLastHealthReportTime() { - return Collections.max(Arrays.asList( - dirsHandler.getLastDisksCheckTime(), - nodeHealthScriptRunner == null ? 
0 : - nodeHealthScriptRunner.getLastReportedTime(), - nodeHealthExceptionReportTime)); - } - - /** - * @return the disk handler - */ - public LocalDirsHandlerService getDiskHandler() { - return dirsHandler; - } - - /** - * @return the node health script runner - */ - NodeHealthScriptRunner getNodeHealthScriptRunner() { - return nodeHealthScriptRunner; - } - - /** - * Report an exception to mark the node as unhealthy. - * @param ex the exception that makes the node unhealthy - */ - void reportException(Exception ex) { - nodeHealthException = ex; - nodeHealthExceptionReportTime = System.currentTimeMillis(); - } -} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 4bbae340a77f1e2588396197280159a608dba3be..5f48b3acbeea7653506adfbad49bf0cdc9d2d3f4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -33,7 +33,7 @@ import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.JvmPauseMonitor; -import org.apache.hadoop.util.NodeHealthScriptRunner; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.ShutdownHookManager; @@ -347,27 +347,6 @@ private void recoverTokens(NMTokenSecretManagerInNM nmTokenSecretManager, } } - public static NodeHealthScriptRunner getNodeHealthScriptRunner(Configuration conf) { - String 
nodeHealthScript = - conf.get(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH); - if(!NodeHealthScriptRunner.shouldRun(nodeHealthScript)) { - LOG.info("Node Manager health check script is not available " - + "or doesn't have execute permission, so not " - + "starting the node health script runner."); - return null; - } - long nmCheckintervalTime = conf.getLong( - YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, - YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS); - long scriptTimeout = conf.getLong( - YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, - YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS); - String[] scriptArgs = conf.getStrings( - YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS, new String[] {}); - return new NodeHealthScriptRunner(nodeHealthScript, - nmCheckintervalTime, scriptTimeout, scriptArgs); - } - @VisibleForTesting protected ResourcePluginManager createResourcePluginManager() { return new ResourcePluginManager(); @@ -431,12 +410,9 @@ protected void serviceInit(Configuration conf) throws Exception { // NodeManager level dispatcher this.dispatcher = createNMDispatcher(); - nodeHealthChecker = - new NodeHealthCheckerService( - getNodeHealthScriptRunner(conf), dirsHandler); + this.nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); addService(nodeHealthChecker); - ((NMContext)context).setContainerExecutor(exec); ((NMContext)context).setDeletionService(del); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 181094ea6c640621b9fae210d5680d133cc11fcc..5e3693ae9c1b6ca2197ad4b242aa8e426d9c80d6 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -86,6 +86,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java new file mode 100644 index 0000000000000000000000000000000000000000..5bdf8b779fe287b55672c0a082e455f1e1881d7e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java @@ -0,0 +1,189 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.Collection; +import java.util.List; +import java.util.TimerTask; + +/** + * A {@link TimedHealthReporterService} responsible for regularly checking + * the availability of the Docker daemon. 
+ */ +public final class DockerHealthCheckerService + extends TimedHealthReporterService { + + private static final Logger LOG = + LoggerFactory.getLogger(DockerHealthCheckerService.class); + + private boolean startupMode; + + static final String PID_FILE_NAME = "docker.pid"; + static final String NO_PID_FILE = + "Unable to obtain pid file of Docker daemon"; + private static final String HEALTH_CHECK_DISABLED = + "Docker health checker service is disabled."; + + private DockerHealthCheckerService(boolean startupMode, long intervalMs) { + super(DockerHealthCheckerService.class.getName(), intervalMs); + + this.startupMode = startupMode; + setTimerTask(new DockerDaemonMonitorExecutor()); + } + + @VisibleForTesting + boolean getStartupMode() { + return startupMode; + } + + public static DockerHealthCheckerService newInstance(Configuration conf) { + boolean enabled = + conf.getBoolean(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_ENABLE, + YarnConfiguration.DEFAULT_NM_DOCKER_HEALTH_CHECKER_ENABLE); + if (!enabled) { + Collection runtimes = conf.getTrimmedStringCollection( + YarnConfiguration.LINUX_CONTAINER_RUNTIME_ALLOWED_RUNTIMES); + if (runtimes.contains("docker")) { + LOG.info(HEALTH_CHECK_DISABLED); + } else { + LOG.debug(HEALTH_CHECK_DISABLED); + } + return null; + } + LOG.info("Docker health checker service enabled"); + boolean startupMode = conf.getBoolean( + YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_STARTUP, + YarnConfiguration.DEFAULT_NM_DOCKER_HEALTH_CHECKER_STARTUP); + long intervalMs = conf.getLong( + YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS, + YarnConfiguration.DEFAULT_NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS); + return new DockerHealthCheckerService(startupMode, intervalMs); + } + + class DockerDaemonMonitorExecutor extends TimerTask { + List getPossiblePidFileLocations() { + return ImmutableList.of( + new File("/var/run/" + PID_FILE_NAME), + new File("/var/" + PID_FILE_NAME), + new File("/run/" + PID_FILE_NAME)); + } + + File 
getDockerDaemonConf() { + return new File("/etc/docker/daemon.json"); + } + + @Override + public void run() { + if (checkDefaultLocations()) { + return; + } + LOG.debug("Not found any of the default pid files " + + "for the Docker service on the host system."); + + if (checkFromDockerDaemonConf()) { + return; + } + + LOG.info("Docker daemon is most probably not running. " + + "Setting unhealthy status."); + setHealthStatus(false, NO_PID_FILE); + } + + /** + * Check whether docker.pid file exists on the default location. + * If not, try to guess some other OS-specific possible default locations. + */ + private boolean checkDefaultLocations() { + List pidFiles = getPossiblePidFileLocations(); + for (File pidFile : pidFiles) { + if (pidFile.exists() && !pidFile.isDirectory()) { + setHealthStatus(true, ""); + return true; + } + } + return false; + } + + /** + * Check whether the pid file location is configured in the Docker + * daemon's json file. If exists try to check the existence of that file. 
+ * + * @return whether the health status has been successfully + * set for the service + */ + private boolean checkFromDockerDaemonConf() { + File dockerDaemonConf = getDockerDaemonConf(); + if (dockerDaemonConf.exists() && dockerDaemonConf.isFile()) { + try { + String jsonString = FileUtils.readFileToString( + dockerDaemonConf, Charset.defaultCharset()); + JSONObject json = new JSONObject(jsonString); + try { + String pidFileLocation = json.getString("pidfile"); + if (pidFileLocation != null && !pidFileLocation.isEmpty()) { + File configuredPidFile = new File(pidFileLocation); + if (configuredPidFile.exists() && + !configuredPidFile.isDirectory()) { + setHealthStatus(true, ""); + return true; + } else { + // give up trying, the pid file should be there, + // but it isn't + setHealthStatus(false, NO_PID_FILE); + return true; + } + } + } catch (JSONException excp) { + LOG.debug("The Docker daemon configuration file " + + "does not specify pidfile location."); + } + } catch (JSONException | IOException exception) { + LOG.warn("Failed to process Docker daemon configuration file."); + } + } + return false; + } + } + + @Override + public void serviceInit(Configuration conf) throws Exception { + super.serviceInit(conf); + if (startupMode) { + TimerTask task = new DockerDaemonMonitorExecutor(); + task.run(); + if (!isHealthy()) { + throw new Exception("Haven't detected running Docker daemon " + + "during startup!"); + } + } + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java new file mode 100644 index 0000000000000000000000000000000000000000..da56bcf83348d9ad96fda6ebd7e51dec1fe31251 --- /dev/null +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +/** + * Interface providing information about the healthiness of a service. + * + * Associated pieces of information: + *
<ul> + *   <li>whether the service is healthy</li> + *   <li>report of the healthiness</li> + *   <li>latest timestamp of the health check</li> + * </ul>
+ * + * Classes implementing this interface are used in + * {@link NodeHealthCheckerService}. + * + * @see TimedHealthReporterService + * @see org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService + */ +public interface HealthReporter { + + /** + * Gets whether the node is healthy or not. + * + * @return true if node is healthy + */ + boolean isHealthy(); + + /** + * Returns output from health check. If node is healthy then an empty string + * is returned. + * + * @return output from health check + */ + String getHealthReport(); + + /** + * Returns time stamp when node health check was last run. + * + * @return timestamp when node health script was last run + */ + long getLastHealthReportTime(); +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java new file mode 100644 index 0000000000000000000000000000000000000000..cb20a8b1f3aa397ca28f4a06910457710a0a4f4e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java @@ -0,0 +1,160 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.base.Strings; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.CompositeService; +import org.apache.hadoop.service.Service; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +/** + * This class provides functionality of checking the health of a node and + * reporting back to the service for which the health checker has been asked to + * report. + * + * It is a {@link CompositeService}: every {@link Service} must be registered + * first in serviceInit, and should also implement the {@link HealthReporter} + * interface - otherwise an exception is thrown. + * + * The HealthReporter methods of this class aggregate the dependent + * services' reports. 
+ *
+ * @see HealthReporter
+ * @see LocalDirsHandlerService
+ */
+public class NodeHealthCheckerService extends CompositeService
+    implements HealthReporter {
+
+  public static final Logger LOG =
+      LoggerFactory.getLogger(NodeHealthCheckerService.class);
+
+  /** Reporters whose results are combined, in registration order. */
+  private final List<HealthReporter> reporters;
+  /** Local directories handler; registered as a reporter in serviceInit. */
+  private final LocalDirsHandlerService dirsHandler;
+  /** Exception passed to reportException, or null if none was reported. */
+  private Exception nodeHealthException;
+  /** Time (ms) of the last reportException call, 0 if never called. */
+  private long nodeHealthExceptionReportTime;
+
+  /** Delimiter placed between the individual reports. */
+  public static final String SEPARATOR = ";";
+
+  /**
+   * Creates the health checker without registering any reporter yet; the
+   * reporters are added in {@link #serviceInit(Configuration)}.
+   *
+   * @param dirHandlerService the local directories handler of the node
+   */
+  public NodeHealthCheckerService(LocalDirsHandlerService dirHandlerService) {
+    super(NodeHealthCheckerService.class.getName());
+
+    this.reporters = new ArrayList<>();
+    this.dirsHandler = dirHandlerService;
+    this.nodeHealthException = null;
+    this.nodeHealthExceptionReportTime = 0;
+  }
+
+  /**
+   * Registers the disk handler, the health script runner and the Docker
+   * health checker as reporters, then initializes all registered services.
+   *
+   * @param conf the configuration the reporters are created from
+   * @throws Exception if a reporter cannot be registered or initialized
+   */
+  @Override
+  protected void serviceInit(Configuration conf) throws Exception {
+    // addHealthReporter silently ignores null, so a factory that decides not
+    // to create its service may simply return null here.
+    addHealthReporter(dirsHandler);
+    addHealthReporter(NodeHealthScriptRunner.newInstance(conf));
+    addHealthReporter(DockerHealthCheckerService.newInstance(conf));
+
+    super.serviceInit(conf);
+  }
+
+  /**
+   * Adds a {@link Service} implementing the {@link HealthReporter} interface,
+   * if that service has not been added to this {@link CompositeService} yet.
+   * A null service is ignored, and so is a service whose name equals the name
+   * of an already registered one.
+   *
+   * @param service to add
+   * @throws Exception if not a {@link HealthReporter}
+   *         implementation is provided to this function
+   */
+  @VisibleForTesting
+  void addHealthReporter(Service service) throws Exception {
+    if (service == null) {
+      return;
+    }
+    // Services are identified by name: the duplicate check runs before the
+    // type check, exactly as registration has always behaved.
+    boolean alreadyAdded = getServices().stream()
+        .anyMatch(x -> x.getName().equals(service.getName()));
+    if (alreadyAdded) {
+      LOG.debug("Omitting duplicate service.");
+      return;
+    }
+    if (!(service instanceof HealthReporter)) {
+      throw new Exception("Attempted to add service to "
+          + "NodeHealthCheckerService that does not implement "
+          + "HealthReporter.");
+    }
+    reporters.add((HealthReporter) service);
+    addService(service);
+  }
+
+  /**
+   * Joins the health reports of the dependent services and the message of
+   * the reported exception (if any) with {@link #SEPARATOR}. Null and empty
+   * reports are left out.
+   *
+   * @return the report string about the health of the node
+   */
+  @Override
+  public String getHealthReport() {
+    List<String> reports = reporters.stream()
+        .map(reporter -> Strings.emptyToNull(reporter.getHealthReport()))
+        .collect(Collectors.toCollection(ArrayList::new));
+    // Read the field once so that the null check and the getMessage() call
+    // operate on the same exception instance.
+    Exception reportedException = nodeHealthException;
+    reports.add(Strings.emptyToNull(
+        reportedException == null ? null : reportedException.getMessage()));
+    return Joiner.on(SEPARATOR).skipNulls().join(reports);
+  }
+
+  /**
+   * @return true if no exception has been reported and every registered
+   *         reporter considers the node healthy
+   */
+  @Override
+  public boolean isHealthy() {
+    return nodeHealthException == null &&
+        reporters.stream().allMatch(HealthReporter::isHealthy);
+  }
+
+  /**
+   * @return the most recent report time among the registered reporters and
+   *         the last reported exception
+   */
+  @Override
+  public long getLastHealthReportTime() {
+    Optional<Long> max = reporters.stream()
+        .map(HealthReporter::getLastHealthReportTime).max(Long::compareTo);
+    return Long.max(
+        max.orElse(nodeHealthExceptionReportTime),
+        nodeHealthExceptionReportTime);
+  }
+
+  /**
+   * @return the disk handler
+   */
+  public LocalDirsHandlerService getDiskHandler() {
+    return dirsHandler;
+  }
+
+  /**
+   * Report an exception to mark the node as unhealthy.
+ * @param ex the exception that makes the node unhealthy + */ + public void reportException(Exception ex) { + nodeHealthException = ex; + nodeHealthExceptionReportTime = System.currentTimeMillis(); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java similarity index 56% rename from hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java index f2a5b242a8d3736b0ab423ed9257648ddc87585e..a01f24583a061660eea7936c99e26699bb5eb480 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.hadoop.util; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.IOException; @@ -27,51 +27,71 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.Shell.ExitCodeException; import org.apache.hadoop.util.Shell.ShellCommandExecutor; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * * The class which provides functionality of checking the health of the node * using the configured node health script and reporting back to the service * for which the health checker has been asked to report. */ -public class NodeHealthScriptRunner extends AbstractService { +public class NodeHealthScriptRunner extends TimedHealthReporterService { private static final Logger LOG = LoggerFactory.getLogger(NodeHealthScriptRunner.class); /** Absolute path to the health script. */ private String nodeHealthScript; - /** Delay after which node health script to be executed */ - private long intervalTime; - /** Time after which the script should be timedout */ + /** Time after which the script should be timed out. */ private long scriptTimeout; - /** Timer used to schedule node health monitoring script execution */ - private Timer nodeHealthScriptScheduler; + /** ShellCommandExecutor used to execute monitoring script. */ + private ShellCommandExecutor shexec = null; - /** ShellCommandExecutor used to execute monitoring script */ - ShellCommandExecutor shexec = null; + /** Pattern used for searching in the output of the node health script. */ + private static final String ERROR_PATTERN = "ERROR"; - /** Pattern used for searching in the output of the node health script */ - static private final String ERROR_PATTERN = "ERROR"; + /** Time out error message. 
*/
+  static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG =
+      "Node health script timed out";
 
-  /** Time out error message */
-  public static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = "Node health script timed out";
-
-  private boolean isHealthy;
-
-  private String healthReport;
-
-  private long lastReportedTime;
+  /**
+   * Creates a runner for the given script. The script is executed by a
+   * timer task that is handed over to the parent class.
+   *
+   * @param scriptName path to the node health script
+   * @param checkInterval interval between two script runs in milliseconds
+   * @param timeout time after which a script run is timed out
+   * @param scriptArgs arguments passed to the script, may be null
+   */
+  NodeHealthScriptRunner(String scriptName, long checkInterval, long timeout,
+      String[] scriptArgs) {
+    super(NodeHealthScriptRunner.class.getName(), checkInterval);
+    this.nodeHealthScript = scriptName;
+    this.scriptTimeout = timeout;
+    setTimerTask(new NodeHealthMonitorExecutor(scriptArgs));
+  }
+
+  /**
+   * Creates a runner from the configuration.
+   *
+   * @param conf configuration holding the script path, check interval,
+   *        timeout and script options
+   * @return the runner, or null if {@link #shouldRun(String)} rejects the
+   *         configured script path
+   */
+  public static NodeHealthScriptRunner newInstance(Configuration conf) {
+    String nodeHealthScript =
+        conf.get(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH);
+    if (!shouldRun(nodeHealthScript)) {
+      LOG.info("Node Manager health check script is not available "
+          + "or doesn't have execute permission, so not "
+          + "starting the NodeHealthScriptRunner.");
+      // Only print the location when a path was actually configured; an
+      // unset or blank path carries no information worth a warning.
+      if (nodeHealthScript != null && !nodeHealthScript.trim().isEmpty()) {
+        LOG.warn("Location of the node health check script: {}",
+            nodeHealthScript);
+      }
+      return null;
+    }
+    long nmCheckintervalTime = conf.getLong(
+        YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS,
+        YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS);
+    long scriptTimeout = conf.getLong(
+        YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS,
+        YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS);
+    String[] scriptArgs = conf.getStrings(
+        YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS, new String[] {});
+    return new NodeHealthScriptRunner(nodeHealthScript,
+        nmCheckintervalTime, scriptTimeout, scriptArgs);
+  }
+
 
-  private TimerTask timer;
-  
   private enum HealthCheckerExitStatus {
     SUCCESS,
     TIMED_OUT,
@@ -84,13 +104,11 @@
   /**
    * Class which is used by the {@link Timer} class to periodically execute the
    * node health script.
- * */ private class NodeHealthMonitorExecutor extends TimerTask { + private String exceptionStackTrace = ""; - String exceptionStackTrace = ""; - - public NodeHealthMonitorExecutor(String[] args) { + NodeHealthMonitorExecutor(String[] args) { ArrayList execScript = new ArrayList(); execScript.add(nodeHealthScript); if (args != null) { @@ -134,27 +152,29 @@ public void run() { /** * Method which is used to parse output from the node health monitor and * send to the report address. - * + * * The timed out script or script which causes IOException output is * ignored. - * + * * The node is marked unhealthy if *
<ol>
    * <li>The node health script times out</li>
-   * <li>The node health scripts output has a line which begins with ERROR</li>
+   * <li>The node health scripts output has a line which begins
+   * with ERROR</li>
    * <li>An exception is thrown while executing the script</li>
    * </ol>
* If the script throws {@link IOException} or {@link ExitCodeException} the * output is ignored and node is left remaining healthy, as script might * have syntax error. - * + * * @param status */ void reportHealthStatus(HealthCheckerExitStatus status) { - long now = System.currentTimeMillis(); switch (status) { case SUCCESS: - setHealthStatus(true, "", now); + case FAILED_WITH_EXIT_CODE: + // see Javadoc above - we don't report bad health intentionally + setHealthStatus(true, ""); break; case TIMED_OUT: setHealthStatus(false, NODE_HEALTH_SCRIPT_TIMED_OUT_MSG); @@ -162,21 +182,19 @@ void reportHealthStatus(HealthCheckerExitStatus status) { case FAILED_WITH_EXCEPTION: setHealthStatus(false, exceptionStackTrace); break; - case FAILED_WITH_EXIT_CODE: - // see Javadoc above - we don't report bad health intentionally - setHealthStatus(true, "", now); - break; case FAILED: setHealthStatus(false, shexec.getOutput()); break; + default: + LOG.warn("Unknown HealthCheckerExitStatus - ignored."); + break; } } /** * Method to check if the output string has line which begins with ERROR. - * - * @param output - * string + * + * @param output the output of the node health script to process * @return true if output string has error pattern in it. */ private boolean hasErrors(String output) { @@ -190,150 +208,34 @@ private boolean hasErrors(String output) { } } - public NodeHealthScriptRunner(String scriptName, long chkInterval, long timeout, - String[] scriptArgs) { - super(NodeHealthScriptRunner.class.getName()); - this.lastReportedTime = System.currentTimeMillis(); - this.isHealthy = true; - this.healthReport = ""; - this.nodeHealthScript = scriptName; - this.intervalTime = chkInterval; - this.scriptTimeout = timeout; - this.timer = new NodeHealthMonitorExecutor(scriptArgs); - } - - /* - * Method which initializes the values for the script path and interval time. 
- */ - @Override - protected void serviceInit(Configuration conf) throws Exception { - super.serviceInit(conf); - } - - /** - * Method used to start the Node health monitoring. - * - */ - @Override - protected void serviceStart() throws Exception { - nodeHealthScriptScheduler = new Timer("NodeHealthMonitor-Timer", true); - // Start the timer task immediately and - // then periodically at interval time. - nodeHealthScriptScheduler.scheduleAtFixedRate(timer, 0, intervalTime); - super.serviceStart(); - } - - /** - * Method used to terminate the node health monitoring service. - * - */ @Override - protected void serviceStop() { - if (nodeHealthScriptScheduler != null) { - nodeHealthScriptScheduler.cancel(); - } + public void serviceStop() throws Exception { if (shexec != null) { Process p = shexec.getProcess(); if (p != null) { p.destroy(); } } + super.serviceStop(); } /** - * Gets the if the node is healthy or not - * - * @return true if node is healthy - */ - public boolean isHealthy() { - return isHealthy; - } - - /** - * Sets if the node is healthy or not considering disks' health also. - * - * @param isHealthy - * if or not node is healthy - */ - private synchronized void setHealthy(boolean isHealthy) { - this.isHealthy = isHealthy; - } - - /** - * Returns output from health script. if node is healthy then an empty string - * is returned. - * - * @return output from health script - */ - public String getHealthReport() { - return healthReport; - } - - /** - * Sets the health report from the node health script. Also set the disks' - * health info obtained from DiskHealthCheckerService. + * Method used to determine whether the {@link NodeHealthScriptRunner} + * should be started or not.

+ * Returns true if following conditions are met: * - * @param healthReport - */ - private synchronized void setHealthReport(String healthReport) { - this.healthReport = healthReport; - } - - /** - * Returns time stamp when node health script was last run. - * - * @return timestamp when node health script was last run - */ - public long getLastReportedTime() { - return lastReportedTime; - } - - /** - * Sets the last run time of the node health script. - * - * @param lastReportedTime - */ - private synchronized void setLastReportedTime(long lastReportedTime) { - this.lastReportedTime = lastReportedTime; - } - - /** - * Method used to determine if or not node health monitoring service should be - * started or not. Returns true if following conditions are met: - * *

<ol>
   * <li>Path to Node health check script is not empty</li>
   * <li>Node health check script file exists</li>
   * </ol>
- * + * * @return true if node health monitoring service can be started. */ - public static boolean shouldRun(String healthScript) { + static boolean shouldRun(String healthScript) { if (healthScript == null || healthScript.trim().isEmpty()) { return false; } File f = new File(healthScript); return f.exists() && FileUtil.canExecute(f); } - - private synchronized void setHealthStatus(boolean isHealthy, String output) { - LOG.info("health status being set as " + output); - this.setHealthy(isHealthy); - this.setHealthReport(output); - } - - private synchronized void setHealthStatus(boolean isHealthy, String output, - long time) { - LOG.info("health status being set as " + output); - this.setHealthStatus(isHealthy, output); - this.setLastReportedTime(time); - } - - /** - * Used only by tests to access the timer task directly - * @return the timer task - */ - public TimerTask getTimerTask() { - return timer; - } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java new file mode 100644 index 0000000000000000000000000000000000000000..a37ce01d3fb5947765fd968188d3fbbbf97dade5 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java @@ -0,0 +1,144 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.service.AbstractService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Timer; +import java.util.TimerTask; + +/** + * A {@link HealthReporter} skeleton for regularly checking a specific + * {@link TimerTask} and obtaining information about it. + */ +public abstract class TimedHealthReporterService extends AbstractService + implements HealthReporter { + + private static final Logger LOG = + LoggerFactory.getLogger(TimedHealthReporterService.class); + + private boolean isHealthy; + private String healthReport; + private long lastReportedTime; + + private Timer timer; + private TimerTask task; + private long intervalMs; + + TimedHealthReporterService(String name, long intervalMs) { + super(name); + this.isHealthy = true; + this.healthReport = ""; + this.lastReportedTime = System.currentTimeMillis(); + this.intervalMs = intervalMs; + } + + @VisibleForTesting + long getIntervalMs() { + return intervalMs; + } + + @VisibleForTesting + void setTimerTask(TimerTask timerTask) { + task = timerTask; + } + + @VisibleForTesting + TimerTask getTimerTask() { + return task; + } + + /** + * Method used to start the health monitoring. 
+   * The task is run on a daemon timer thread: the first run is scheduled
+   * immediately, the following ones at a fixed rate of the configured
+   * interval.
+   *
+   * @throws Exception if no timer task has been set before starting
+   */
+  @Override
+  public void serviceStart() throws Exception {
+    if (task == null) {
+      throw new Exception("Health reporting task hasn't been set!");
+    }
+    timer = new Timer("HealthReporterService-Timer", true);
+    timer.scheduleAtFixedRate(task, 0, intervalMs);
+    super.serviceStart();
+  }
+
+  /**
+   * Method used to terminate the health monitoring service. Cancels the
+   * timer if the service has been started.
+   */
+  @Override
+  protected void serviceStop() throws Exception {
+    if (timer != null) {
+      timer.cancel();
+    }
+    super.serviceStop();
+  }
+
+  /**
+   * Synchronized on the same monitor as the setters, so that a status
+   * written by the timer thread is visible to the reading thread.
+   *
+   * @return whether the last check found the node healthy
+   */
+  @Override
+  public synchronized boolean isHealthy() {
+    return isHealthy;
+  }
+
+  /**
+   * Sets if the node is healthy or not.
+   *
+   * @param healthy
+   *          whether the node is healthy
+   */
+  protected synchronized void setHealthy(boolean healthy) {
+    this.isHealthy = healthy;
+  }
+
+  /**
+   * @return the report stored by the last health check
+   */
+  @Override
+  public synchronized String getHealthReport() {
+    return healthReport;
+  }
+
+  /**
+   * Sets the health report from the node health check.
+   *
+   * @param report the report to store
+   */
+  private synchronized void setHealthReport(String report) {
+    this.healthReport = report;
+  }
+
+  /**
+   * @return the time in milliseconds when the health status was last set
+   */
+  @Override
+  public synchronized long getLastHealthReportTime() {
+    return lastReportedTime;
+  }
+
+  /**
+   * Sets the last run time of the node health check.
+ * + * @param lastReportedTime + */ + private synchronized void setLastReportedTime(long lastReportedTime) { + this.lastReportedTime = lastReportedTime; + } + + synchronized void setHealthStatus(boolean healthy, String output) { + LOG.info("Health status being set as: \"" + output + "\"."); + this.setHealthy(healthy); + this.setHealthReport(output); + this.setLastReportedTime(System.currentTimeMillis()); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java index 2e80259d210cc33cc8b251d1af2d223e64696feb..81d5fd2382309d7befcae4ff06d8a533dfc76e94 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.nodemanager; import java.io.IOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,6 +39,7 @@ import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java index 13b3ee91bdc4879bbf56abaf7ec3836c197f95f8..fcb76f9e75f27777fbbcef6d20e715ec4d481e48 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java @@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.junit.Assert; import org.junit.Before; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java index 54e090a29e2ac389e548bb0649bd90fe318f9c88..b1fc2f1aa26175d8b7c60a95c0484e1ba13940a8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java @@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; @@ -102,8 +103,8 @@ public int getHttpPort() { DeletionService del = new DeletionService(exec); Dispatcher dispatcher = new AsyncDispatcher(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); NodeManagerMetrics metrics = NodeManagerMetrics.create(); NodeStatusUpdater nodeStatusUpdater = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java index fbd3646940dccfca172991f7c97d6299719506a0..260c3c4144cb297f2ab4b1975a88ebe5e4310c08 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java @@ -64,6 +64,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.deletion.task.FileDeletionMatcher; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Assert; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java index 25cca876ac67d2991fac8f9fd78ba11ab3c46524..9eae82a93226d47e1843f4f046cabd489d759f54 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java @@ -86,6 +86,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java index 25dbc1dd2ea82cad0e44fc5961bfcae367ec50d4..9a0213d87cf7aa6273d655a0133c2a79831767fe 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java @@ -66,6 +66,7 @@ import org.apache.hadoop.yarn.security.NMTokenIdentifier; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.ConverterUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 1b21b93654303e4c34187e3faeb494382e5ce388..70317c73f329e998acde41923878c76738825890 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -107,6 +107,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java index 325d60c59befdfe45d9d995f83b2da9cfcea631e..072f4432c62778821b59dd9e698ae92b9ab337bb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java @@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java index a86ca3e821184cfb3dc7e0af2369e93ebac478a6..e3dce3b5b512b8ddeaf243aa945c2b14865b7c48 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java @@ -50,6 +50,7 @@ import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java index 15c1cac9cb83d9fe59f6b7717f5d662c42cb8baf..b0756c40a31c8bd7c9642fba01e03db04618bcd1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java @@ -75,7 +75,7 @@ import 
org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalRMInterface; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; @@ -218,8 +218,7 @@ public void setup() throws IOException { delSrvc.init(conf); dirsHandler = new LocalDirsHandlerService(); - nodeHealthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); nodeHealthChecker.init(conf); containerManager = createContainerManager(delSrvc); ((NMContext)context).setContainerManager(containerManager); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index e920105abf9aa1cc514ebb47682e642749cd02f0..00ce28b1720203f5c76aac931db182b0990e6d50 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -85,8 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import 
org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -106,7 +105,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler.ContainerScheduler; - import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.TestNodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService; @@ -157,8 +155,7 @@ public void setup() throws IOException { delSrvc.init(conf); exec = createContainerExecutor(); dirsHandler = new LocalDirsHandlerService(); - nodeHealthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); nodeHealthChecker.init(conf); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java index 
28f917fd8422bee4f2c94ba74a412889ea5c8f34..784a73c44d9c546eba56e643a9bf3c8b026d2faa 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java @@ -30,7 +30,7 @@ import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java new file mode 100644 index 0000000000000000000000000000000000000000..541b27c23141492a1db9f5b5010975cad5ad76fd --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java @@ -0,0 +1,162 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.collect.ImmutableList; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.List; +import java.util.TimerTask; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + +/** + * Tests for {@link DockerHealthCheckerService} class. + * + * The {@link #testHealthChecker} function tests whether the pid file + * of the Docker daemon exists in different scenarios. 
+ */ +public class TestDockerHealthCheckerService { + + private static final File TEST_ROOT_DIR = new File("target", + TestDockerHealthCheckerService.class.getName() + + "-localDir").getAbsoluteFile(); + + private static final File PID_FILE = new File(TEST_ROOT_DIR, + DockerHealthCheckerService.PID_FILE_NAME); + private static final File OTHER_PID_FILE = new File(TEST_ROOT_DIR, + DockerHealthCheckerService.PID_FILE_NAME + "2"); + private static final File THIRD_PID_FILE = new File(TEST_ROOT_DIR, + DockerHealthCheckerService.PID_FILE_NAME + "3"); + private static final File JSON_CONF = new File(TEST_ROOT_DIR, "daemon.json"); + + @Before + public void setup() { + TEST_ROOT_DIR.mkdirs(); + } + + @After + public void tearDown() throws Exception { + if (TEST_ROOT_DIR.exists()) { + FileContext.getLocalFSFileContext().delete( + new Path(TEST_ROOT_DIR.getAbsolutePath()), true); + } + } + + private void createPidFile(File pidFile) throws IOException { + FileUtils.writeStringToFile(pidFile, "", Charset.defaultCharset()); + } + + private void createDaemonConfJson(String content) + throws IOException { + FileUtils.cleanDirectory(TEST_ROOT_DIR); + FileUtils.writeStringToFile(JSON_CONF, content, Charset.defaultCharset()); + } + + @Test + public void checkNewInstanceCreation() { + Configuration conf = new Configuration(); + DockerHealthCheckerService service = + DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNull(); + + conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_ENABLE, "true"); + service = DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + assertThat(service.getStartupMode()).isFalse(); + assertThat(service.getIntervalMs()).isEqualTo(600000); + + conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_STARTUP, "true"); + service = DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + assertThat(service.getStartupMode()).isTrue(); + assertThat(service.getIntervalMs()).isEqualTo(600000); + + 
conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS, "16500"); + service = DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + assertThat(service.getStartupMode()).isTrue(); + assertThat(service.getIntervalMs()).isEqualTo(16500); + } + + @Test + public void testHealthChecker() throws Exception { + Configuration conf = new Configuration(); + conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_ENABLE, "true"); + + DockerHealthCheckerService service = + DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + service.serviceInit(conf); + + TimerTask task = service.getTimerTask(); + assertThat(task).isInstanceOf( + DockerHealthCheckerService.DockerDaemonMonitorExecutor.class); + DockerHealthCheckerService.DockerDaemonMonitorExecutor executor = + spy((DockerHealthCheckerService.DockerDaemonMonitorExecutor) task); + + // Initialize spy object to point to the dummy files + List files = ImmutableList.of(PID_FILE, OTHER_PID_FILE); + when(executor.getPossiblePidFileLocations()).thenReturn(files); + when(executor.getDockerDaemonConf()).thenReturn(JSON_CONF); + service.setTimerTask(executor); + + // no pid or json file case + executor.run(); + assertThat(service.isHealthy()).isFalse(); + assertThat(service.getHealthReport()) + .isEqualTo(DockerHealthCheckerService.NO_PID_FILE); + + // existing json file with non-existing configured pid file location + createDaemonConfJson("{\"prop1\":\"value1\",\"pidfile\":\"" + + THIRD_PID_FILE.getPath() + "\"}"); + executor.run(); + assertThat(service.isHealthy()).isFalse(); + + // existing json file with existing configured pid file location + createDaemonConfJson("{\"prop1\":\"value1\",\"pidfile\":\"" + + THIRD_PID_FILE.getPath() + "\"}"); + createPidFile(THIRD_PID_FILE); + executor.run(); + assertThat(service.isHealthy()).isTrue(); + + // existing json file without configured pid file location + createDaemonConfJson("{\"prop1\":\"value1\",\"prop2\":\"value2\"}"); + 
executor.run(); + assertThat(service.isHealthy()).isFalse(); + + // existing pid file in different default locations + FileUtils.cleanDirectory(TEST_ROOT_DIR); + createPidFile(PID_FILE); + executor.run(); + assertThat(service.isHealthy()).isTrue(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthCheckerService.java similarity index 66% rename from hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthCheckerService.java index 8083a56377386f2683c83b0390bd3824f6301435..25e0a08328183f410fa2daed762963069bc48310 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthCheckerService.java @@ -16,12 +16,15 @@ * limitations under the License. 
*/ -package org.apache.hadoop.yarn.server.nodemanager; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; + +import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,7 +34,6 @@ import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.factories.RecordFactory; @@ -42,49 +44,56 @@ import org.junit.Before; import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.fail; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.spy; -public class TestNodeHealthService { +/** + * Test class for {@link NodeHealthCheckerService}. 
+ */ +public class TestNodeHealthCheckerService { - private static volatile Logger LOG = - LoggerFactory.getLogger(TestNodeHealthService.class); + private static final Logger LOG = + LoggerFactory.getLogger(TestNodeHealthCheckerService.class); - protected static File testRootDir = new File("target", - TestNodeHealthService.class.getName() + "-localDir").getAbsoluteFile(); + private static final File TEST_ROOT_DIR = new File("target", + TestNodeHealthCheckerService.class.getName() + "-localDir") + .getAbsoluteFile(); - final static File nodeHealthConfigFile = new File(testRootDir, + private static final File NODE_HEALTH_CONFIG_FILE = new File(TEST_ROOT_DIR, "modified-mapred-site.xml"); - private File nodeHealthscriptFile = new File(testRootDir, + private File nodeHealthscriptFile = new File(TEST_ROOT_DIR, Shell.appendScriptExtension("failingscript")); @Before public void setup() { - testRootDir.mkdirs(); + TEST_ROOT_DIR.mkdirs(); } @After public void tearDown() throws Exception { - if (testRootDir.exists()) { + if (TEST_ROOT_DIR.exists()) { FileContext.getLocalFSFileContext().delete( - new Path(testRootDir.getAbsolutePath()), true); + new Path(TEST_ROOT_DIR.getAbsolutePath()), true); } } - - private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) - throws IOException { + + private void writeNodeHealthScriptFile() throws IOException { PrintWriter pw = null; try { FileUtil.setWritable(nodeHealthscriptFile, true); FileUtil.setReadable(nodeHealthscriptFile, true); pw = new PrintWriter(new FileOutputStream(nodeHealthscriptFile)); - pw.println(scriptStr); + pw.println(""); pw.flush(); } finally { - pw.close(); + if (pw != null) { + pw.close(); + } } - FileUtil.setExecutable(nodeHealthscriptFile, setExecutable); + FileUtil.setExecutable(nodeHealthscriptFile, true); } private Configuration getConfForNodeHealthScript() { @@ -110,15 +119,20 @@ public void testNodeHealthService() throws Exception { NodeHealthStatus healthStatus = 
factory.newRecordInstance(NodeHealthStatus.class); Configuration conf = getConfForNodeHealthScript(); - conf.writeXml(new FileOutputStream(nodeHealthConfigFile)); - conf.addResource(nodeHealthConfigFile.getName()); - writeNodeHealthScriptFile("", true); + conf.writeXml(new FileOutputStream(NODE_HEALTH_CONFIG_FILE)); + conf.addResource(NODE_HEALTH_CONFIG_FILE.getName()); + writeNodeHealthScriptFile(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthScriptRunner nodeHealthScriptRunner = - spy(NodeManager.getNodeHealthScriptRunner(conf)); - NodeHealthCheckerService nodeHealthChecker = new NodeHealthCheckerService( - nodeHealthScriptRunner, dirsHandler); + NodeHealthScriptRunner.newInstance(conf); + if (nodeHealthScriptRunner == null) { + fail("Should have created NodeHealthScriptRunner instance"); + } + nodeHealthScriptRunner = spy(nodeHealthScriptRunner); + NodeHealthCheckerService nodeHealthChecker = + new NodeHealthCheckerService(dirsHandler); + nodeHealthChecker.addHealthReporter(nodeHealthScriptRunner); nodeHealthChecker.init(conf); doReturn(true).when(nodeHealthScriptRunner).isHealthy(); @@ -133,7 +147,7 @@ public void testNodeHealthService() throws Exception { Assert.assertTrue("Node health status reported unhealthy", healthStatus .getHealthReport().equals(nodeHealthChecker.getHealthReport())); - doReturn(false).when(nodeHealthScriptRunner).isHealthy(); + doReturn(false).when(nodeHealthScriptRunner).isHealthy(); // update health status setHealthStatus(healthStatus, nodeHealthChecker.isHealthy(), nodeHealthChecker.getHealthReport(), @@ -174,4 +188,47 @@ public void testNodeHealthService() throws Exception { .getDisksHealthReport(false)) ))); } + + private abstract class HealthReporterService extends AbstractService + implements HealthReporter { + HealthReporterService() { + super(HealthReporterService.class.getName()); + } + } + + @Test + public void testCustomHealthReporter() throws Exception { + String healthReport = "dummy 
health report"; + HealthReporterService customHealthReporter = new HealthReporterService() { + private int counter = 0; + + @Override + public boolean isHealthy() { + return counter++ % 2 == 0; + } + + @Override + public String getHealthReport() { + return healthReport; + } + + @Override + public long getLastHealthReportTime() { + return Long.MAX_VALUE; + } + }; + + Configuration conf = new Configuration(); + LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); + NodeHealthCheckerService nodeHealthChecker = + new NodeHealthCheckerService(dirsHandler); + nodeHealthChecker.addHealthReporter(customHealthReporter); + nodeHealthChecker.init(conf); + + assertThat(nodeHealthChecker.isHealthy()).isTrue(); + assertThat(nodeHealthChecker.isHealthy()).isFalse(); + assertThat(nodeHealthChecker.getHealthReport()).isEqualTo(healthReport); + assertThat(nodeHealthChecker.getLastHealthReportTime()) + .isEqualTo(Long.MAX_VALUE); + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java similarity index 90% rename from hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java index 2748c0b581a8849c3715cc66c8d758f871a8eda4..48f74fd31236154c1920055c8fd5226fbefb1699 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java @@ -16,7 +16,7 @@ * limitations under 
the License. */ -package org.apache.hadoop.util; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.FileOutputStream; @@ -28,14 +28,18 @@ import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.Shell; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +/** + * Test class for {@link NodeHealthScriptRunner}. + */ public class TestNodeHealthScriptRunner { - protected static File testRootDir = new File("target", + private static File testRootDir = new File("target", TestNodeHealthScriptRunner.class.getName() + "-localDir").getAbsoluteFile(); @@ -55,8 +59,8 @@ public void tearDown() throws Exception { } } - private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) - throws IOException { + private void writeNodeHealthScriptFile(String scriptStr, + boolean setExecutable) throws IOException { PrintWriter pw = null; try { FileUtil.setWritable(nodeHealthscriptFile, true); @@ -92,8 +96,9 @@ public void testNodeHealthScript() throws Exception { String errorScript = "echo ERROR\n echo \"Tracker not healthy\""; String normalScript = "echo \"I am all fine\""; String timeOutScript = - Shell.WINDOWS ? "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\"" - : "sleep 4\necho \"I am fine\""; + Shell.WINDOWS ? + "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\"" + : "sleep 4\necho \"I am fine\""; String exitCodeScript = "exit 127"; Configuration conf = new Configuration(); @@ -118,7 +123,7 @@ public void testNodeHealthScript() throws Exception { nodeHealthScriptRunner.isHealthy()); Assert.assertTrue( nodeHealthScriptRunner.getHealthReport().contains("ERROR")); - + // Healthy script. 
writeNodeHealthScriptFile(normalScript, true); timerTask.run(); @@ -130,7 +135,7 @@ public void testNodeHealthScript() throws Exception { writeNodeHealthScriptFile(timeOutScript, true); timerTask.run(); Assert.assertFalse("Node health status reported healthy even after timeout", - nodeHealthScriptRunner.isHealthy()); + nodeHealthScriptRunner.isHealthy()); Assert.assertEquals( NodeHealthScriptRunner.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG, nodeHealthScriptRunner.getHealthReport()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java index ece1af4a260da0861e9f61a855fef30e0ebfa299..a6f7cec5d8a51251d92d8bf1d886a1a3a3fd69c9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java @@ -43,7 +43,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.nativeio.NativeIO; import org.apache.hadoop.security.UserGroupInformation; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -57,7 +56,7 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import 
org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -80,9 +79,8 @@ public class TestContainerLogsPage { private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } @Test(timeout=30000) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java index 0d618fde10f075e89f7b882b81ed1fd5b7238aab..914bda7018b4e15a977d37036fc06c46c7c4ffd1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java @@ -20,11 +20,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import 
org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; @@ -105,8 +104,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService( - conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); @@ -120,12 +118,9 @@ public boolean isPmemCheckEnabled() { } } - private NodeHealthCheckerService createNodeHealthCheckerService( - Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner( - conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } @Test diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java index 0a71a9179bbb0a567c29b76441fbaf64e4652a2e..232cc127044aecd3c31ea02bb0b725241393c627 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java @@ -28,7 +28,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -42,7 +41,7 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -54,7 +53,6 @@ import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; -import org.apache.hadoop.yarn.util.ConverterUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -79,10 +77,9 @@ public void tearDown() { FileUtil.fullyDelete(testLogDir); } - private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } private int startNMWebAppServer(String 
webAddr) { @@ -113,7 +110,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); @@ -176,7 +173,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java index ad17ae81322d11c7d45e1b2f5333b7bd129b4ad6..f591bad00d8e6e7148e68d075f8b7ffff933e315 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java @@ -47,7 +47,7 @@ import org.apache.hadoop.yarn.logaggregation.TestContainerLogsUtils; import org.apache.hadoop.yarn.server.nodemanager.Context; import 
org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; @@ -141,8 +141,8 @@ protected void configureServlets() { conf.set(YarnConfiguration.YARN_LOG_SERVER_WEBSERVICE_URL, LOGSERVICEWSADDR); dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); aclsManager = new ApplicationACLsManager(conf); nmContext = new NodeManager.NMContext(null, null, dirsHandler, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java index 3533d16849d36c7b3d886b9c828c6ece8a49df7f..041036750feab6a3d82b746001588c4125352735 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java @@ -47,13 +47,13 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; 
-import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.AppsInfo; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; @@ -105,7 +105,7 @@ protected void configureServlets() { conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java index 4ee63db81774f2ba9dd63b612d78161722d97780..cd476c25338396de30b62bea6560e5ddc1abca27 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java @@ -44,11 +44,11 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices; import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecord; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; @@ -125,7 +125,7 @@ public boolean isPmemCheckEnabled() { conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); ApplicationACLsManager aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java index a99ce2803813016e50ab6af77fd01e8b3522f2c6..aacdf4c7138cfd839b03eb0616e56331dd3dc011 
100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java @@ -28,7 +28,6 @@ import java.io.File; import java.io.IOException; import java.io.StringReader; -import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -48,16 +47,15 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; -import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; import org.apache.hadoop.yarn.webapp.GuiceServletConfig; import org.apache.hadoop.yarn.webapp.JerseyTestBase; @@ -132,7 +130,7 @@ public boolean isPmemCheckEnabled() { conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - 
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java index ec7d62c803c61e0e3ffa2febcbfb5178277aa5b4..d4180e482518614ef24e9ba6dd82fa20da985919 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java @@ -26,13 +26,12 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.http.JettyUtils; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.junit.After; import org.junit.Before; @@ -54,12 +53,9 @@ private WebServer server; private int port; - private NodeHealthCheckerService createNodeHealthCheckerService( - Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager - .getNodeHealthScriptRunner(conf); + private 
NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } private int startNMWebAppServer(String webAddr) { @@ -90,7 +86,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, TESTLOGDIR.getAbsolutePath()); - healthChecker = createNodeHealthCheckerService(conf); + healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java index 68d97ee32ac533142df7f6c08ecaf855b09b992d..fdcefcbc350ddeca7733b8de1f7de599f3091042 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java @@ -70,7 +70,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import 
org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md index e4ed57f5cb4b30e59e71278ca0d2185baa3dfb71..131f052000575c5c6e366e7063a4ffc48db71334 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md @@ -42,6 +42,17 @@ The following configuration parameters can be used to modify the disk checks: | `yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage` | Float between 0-100 | The maximum percentage of disk space that may be utilized before a disk is marked as unhealthy by the disk checker service. This check is run for every disk used by the NodeManager. The default value is 90 i.e. 90% of the disk can be used. | | `yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb` | Integer | The minimum amount of free space that must be available on the disk for the disk checker service to mark the disk as healthy. This check is run for every disk used by the NodeManager. The default value is 0 i.e. the entire disk can be used. | + +### Docker health checker + +If the Docker on Yarn feature is enabled, you can enable the Docker health checker service. The Docker health checker checks the availability of the Docker daemon in the host, marking the node as unhealthy if the daemon is not responding. + +| Configuration Name | Allowed Values | Description | +|:---- |:---- |:---- | +| `yarn.nodemanager.docker-health-checker.enable` | true, false | Enable or disable the Docker health checker service. Default is false | +| `yarn.nodemanager.docker-health-checker.startup` | true, false | The NodeManager will fail to come up if the Docker daemon is not responding during startup.
Default is false | +| `yarn.nodemanager.docker-health-checker.interval-ms` | Positive integer | The interval, in milliseconds, at which the Docker checker should run; the default value is 10 minutes | + ###External Health Script Users may specify their own health checker script that will be invoked by the health checker service. Users may specify a timeout as well as options to be passed to the script. If the script times out, results in an exception being thrown or outputs a line which begins with the string ERROR, the node is marked as unhealthy. Please note that: -- 2.21.0