From 6c025ac5c89f2d4a6462b25456a7e3c0f7529a36 Mon Sep 17 00:00:00 2001 From: Adam Antal Date: Thu, 7 Nov 2019 19:22:09 +0100 Subject: [PATCH] YARN-9923. Detect missing Docker binary or not running Docker daemon --- .../dev-support/findbugs-exclude.xml | 16 +- .../hadoop/yarn/conf/YarnConfiguration.java | 18 ++ .../src/main/resources/yarn-default.xml | 18 ++ .../nodemanager/LocalDirsHandlerService.java | 21 +- .../nodemanager/NodeHealthCheckerService.java | 123 ----------- .../yarn/server/nodemanager/NodeManager.java | 28 +-- .../nodemanager/NodeStatusUpdaterImpl.java | 1 + .../health/DockerHealthCheckerService.java | 165 +++++++++++++++ .../nodemanager/health/HealthReporter.java | 60 ++++++ .../health/NodeHealthCheckerService.java | 134 ++++++++++++ .../health}/NodeHealthScriptRunner.java | 198 +++++------------- .../health/TimedHealthReporterService.java | 150 +++++++++++++ .../nodemanager/MockNodeStatusUpdater.java | 2 + .../nodemanager/NodeManagerTestBase.java | 1 + .../server/nodemanager/TestEventFlow.java | 5 +- .../nodemanager/TestNodeManagerReboot.java | 1 + .../nodemanager/TestNodeManagerResync.java | 1 + .../nodemanager/TestNodeManagerShutdown.java | 1 + .../nodemanager/TestNodeStatusUpdater.java | 1 + .../TestNodeStatusUpdaterForAttributes.java | 1 + .../TestNodeStatusUpdaterForLabels.java | 1 + .../BaseContainerManagerTest.java | 5 +- .../TestContainerManagerRecovery.java | 7 +- .../TestResourcePluginManager.java | 2 +- .../TestDockerHealthCheckerService.java | 125 +++++++++++ .../health}/TestNodeHealthScriptRunner.java | 20 +- .../{ => health}/TestNodeHealthService.java | 31 +-- .../webapp/TestContainerLogsPage.java | 6 +- .../webapp/TestNMContainerWebSocket.java | 13 +- .../nodemanager/webapp/TestNMWebServer.java | 13 +- .../nodemanager/webapp/TestNMWebServices.java | 6 +- .../webapp/TestNMWebServicesApps.java | 4 +- .../webapp/TestNMWebServicesAuxServices.java | 4 +- .../webapp/TestNMWebServicesContainers.java | 6 +- 
.../nodemanager/webapp/TestNMWebTerminal.java | 12 +- .../hadoop/yarn/server/MiniYARNCluster.java | 2 +- .../src/site/markdown/NodeManager.md | 11 + 37 files changed, 841 insertions(+), 372 deletions(-) delete mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java rename {hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util => hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health}/NodeHealthScriptRunner.java (60%) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java rename {hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util => hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health}/TestNodeHealthScriptRunner.java (90%) rename 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/{ => health}/TestNodeHealthService.java (88%) diff --git a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index e3149f079c6..111d4e32952 100644 --- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -662,7 +662,7 @@ - + @@ -701,4 +701,18 @@ + + + + + + + + + + + + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index d9840ac9999..0185541ea61 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1977,6 +1977,24 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_HEALTH_CHECK_SCRIPT_OPTS = NM_PREFIX + "health-checker.script.opts"; + public static final String NM_DOCKER_HEALTH_CHECKER_PREFIX = + NM_PREFIX + "docker-health-checker."; + + public static final String NM_DOCKER_HEALTH_CHECKER_ENABLE = + NM_DOCKER_HEALTH_CHECKER_PREFIX + "enable"; + public static final boolean DEFAULT_NM_DOCKER_HEALTH_CHECKER_ENABLE = + false; + + public static final String NM_DOCKER_HEALTH_CHECKER_STARTUP = + NM_DOCKER_HEALTH_CHECKER_PREFIX + "startup"; + public static final boolean DEFAULT_NM_DOCKER_HEALTH_CHECKER_STARTUP = + false; + + public static final String NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS = + NM_DOCKER_HEALTH_CHECKER_PREFIX + "interval-ms"; + public static final long DEFAULT_NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS = + 
DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS; + /** The JVM options used on forking ContainerLocalizer process by container executor. */ public static final String NM_CONTAINER_LOCALIZER_JAVA_OPTS_KEY = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 907f290afad..3434ef8f025 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1619,6 +1619,24 @@ + + + yarn.nodemanager.docker-health-checker.enable + false + + + + + yarn.nodemanager.docker-health-checker.startup + false + + + + + yarn.nodemanager.docker-health-checker.interval-ms + 600000 + + Frequency of running disk health checker code. yarn.nodemanager.disk-health-checker.interval-ms diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index 8d060b01adf..fee88478e01 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -29,9 +29,11 @@ import java.util.TimerTask; import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.DiskValidator; import org.apache.hadoop.util.DiskValidatorFactory; +import 
org.apache.hadoop.yarn.server.nodemanager.health.HealthReporter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,7 +44,6 @@ import org.apache.hadoop.fs.LocalDirAllocator; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; @@ -54,7 +55,8 @@ * directories of a node. This specifically manages nodemanager-local-dirs and * nodemanager-log-dirs by periodically checking their health. */ -public class LocalDirsHandlerService extends AbstractService { +public class LocalDirsHandlerService extends AbstractService + implements HealthReporter { private static final Logger LOG = LoggerFactory.getLogger(LocalDirsHandlerService.class); @@ -426,6 +428,11 @@ public String getDisksHealthReport(boolean listGoodDirs) { } + @Override + public String getHealthReport() { + return getDisksHealthReport(false); + } + /** * The minimum fraction of number of disks needed to be healthy for a node to * be considered healthy in terms of disks is configured using @@ -457,10 +464,20 @@ public boolean areDisksHealthy() { return true; } + @Override + public boolean isHealthy() { + return areDisksHealthy(); + } + public long getLastDisksCheckTime() { return lastDisksCheckTime; } + @Override + public long getLastHealthReportTime() { + return getLastDisksCheckTime(); + } + public boolean isGoodLocalDir(String path) { return isInGoodDirs(getLocalDirs(), path); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java deleted file mode 100644 index 
7e2fc7e022d..00000000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeHealthCheckerService.java +++ /dev/null @@ -1,123 +0,0 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.hadoop.yarn.server.nodemanager; - -import com.google.common.base.Joiner; -import com.google.common.base.Strings; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.service.CompositeService; -import org.apache.hadoop.util.NodeHealthScriptRunner; - -import java.util.Arrays; -import java.util.Collections; - -/** - * The class which provides functionality of checking the health of the node and - * reporting back to the service for which the health checker has been asked to - * report. 
- */ -public class NodeHealthCheckerService extends CompositeService { - - private NodeHealthScriptRunner nodeHealthScriptRunner; - private LocalDirsHandlerService dirsHandler; - private Exception nodeHealthException; - private long nodeHealthExceptionReportTime; - - static final String SEPARATOR = ";"; - - public NodeHealthCheckerService(NodeHealthScriptRunner scriptRunner, - LocalDirsHandlerService dirHandlerService) { - super(NodeHealthCheckerService.class.getName()); - nodeHealthScriptRunner = scriptRunner; - dirsHandler = dirHandlerService; - nodeHealthException = null; - nodeHealthExceptionReportTime = 0; - } - - @Override - protected void serviceInit(Configuration conf) throws Exception { - if (nodeHealthScriptRunner != null) { - addService(nodeHealthScriptRunner); - } - addService(dirsHandler); - super.serviceInit(conf); - } - - /** - * @return the reporting string of health of the node - */ - String getHealthReport() { - String scriptReport = Strings.emptyToNull( - nodeHealthScriptRunner == null ? null : - nodeHealthScriptRunner.getHealthReport()); - String discReport = - Strings.emptyToNull( - dirsHandler.getDisksHealthReport(false)); - String exceptionReport = Strings.emptyToNull( - nodeHealthException == null ? null : - nodeHealthException.getMessage()); - - return Joiner.on(SEPARATOR).skipNulls() - .join(scriptReport, discReport, exceptionReport); - } - - /** - * @return true if the node is healthy - */ - boolean isHealthy() { - boolean scriptHealthy = nodeHealthScriptRunner == null || - nodeHealthScriptRunner.isHealthy(); - return nodeHealthException == null && - scriptHealthy && dirsHandler.areDisksHealthy(); - } - - /** - * @return when the last time the node health status is reported - */ - long getLastHealthReportTime() { - return Collections.max(Arrays.asList( - dirsHandler.getLastDisksCheckTime(), - nodeHealthScriptRunner == null ? 
0 : - nodeHealthScriptRunner.getLastReportedTime(), - nodeHealthExceptionReportTime)); - } - - /** - * @return the disk handler - */ - public LocalDirsHandlerService getDiskHandler() { - return dirsHandler; - } - - /** - * @return the node health script runner - */ - NodeHealthScriptRunner getNodeHealthScriptRunner() { - return nodeHealthScriptRunner; - } - - /** - * Report an exception to mark the node as unhealthy. - * @param ex the exception that makes the node unhealthy - */ - void reportException(Exception ex) { - nodeHealthException = ex; - nodeHealthExceptionReportTime = System.currentTimeMillis(); - } -} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 4bbae340a77..5f48b3acbee 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -33,7 +33,7 @@ import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.JvmPauseMonitor; -import org.apache.hadoop.util.NodeHealthScriptRunner; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.ShutdownHookManager; @@ -347,27 +347,6 @@ private void recoverTokens(NMTokenSecretManagerInNM nmTokenSecretManager, } } - public static NodeHealthScriptRunner getNodeHealthScriptRunner(Configuration conf) { - String nodeHealthScript = - 
conf.get(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH); - if(!NodeHealthScriptRunner.shouldRun(nodeHealthScript)) { - LOG.info("Node Manager health check script is not available " - + "or doesn't have execute permission, so not " - + "starting the node health script runner."); - return null; - } - long nmCheckintervalTime = conf.getLong( - YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, - YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS); - long scriptTimeout = conf.getLong( - YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, - YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS); - String[] scriptArgs = conf.getStrings( - YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS, new String[] {}); - return new NodeHealthScriptRunner(nodeHealthScript, - nmCheckintervalTime, scriptTimeout, scriptArgs); - } - @VisibleForTesting protected ResourcePluginManager createResourcePluginManager() { return new ResourcePluginManager(); @@ -431,12 +410,9 @@ protected void serviceInit(Configuration conf) throws Exception { // NodeManager level dispatcher this.dispatcher = createNMDispatcher(); - nodeHealthChecker = - new NodeHealthCheckerService( - getNodeHealthScriptRunner(conf), dirsHandler); + this.nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); addService(nodeHealthChecker); - ((NMContext)context).setContainerExecutor(exec); ((NMContext)context).setDeletionService(del); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 181094ea6c6..5e3693ae9c1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -86,6 +86,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java new file mode 100644 index 00000000000..d290352bf9d --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/DockerHealthCheckerService.java @@ -0,0 +1,165 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.Collection; +import java.util.List; +import java.util.Scanner; +import java.util.TimerTask; + +/** + * A {@link TimedHealthReporterService} responsible for regularly checking + * the availability of the Docker daemon. 
+ */
+public class DockerHealthCheckerService extends TimedHealthReporterService {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(DockerHealthCheckerService.class);
+
+  private boolean startupMode;
+
+  static final String NO_PID_FILE =
+      "Unable to obtain pid file of Docker daemon";
+  private static final String HEALTH_CHECK_DISABLED =
+      "Docker health checker service is disabled.";
+
+  private DockerHealthCheckerService(boolean startupMode, long intervalMs) {
+    super(DockerHealthCheckerService.class.getName(), intervalMs);
+
+    this.startupMode = startupMode;
+    setTimerTask(new DockerDaemonMonitorExecutor());
+  }
+
+  @VisibleForTesting
+  boolean getStartupMode() {
+    return startupMode;
+  }
+
+  public static DockerHealthCheckerService newInstance(Configuration conf) {
+    boolean enabled =
+        conf.getBoolean(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_ENABLE,
+            YarnConfiguration.DEFAULT_NM_DOCKER_HEALTH_CHECKER_ENABLE);
+    if (!enabled) {
+      Collection<String> runtimes = conf.getTrimmedStringCollection(
+          YarnConfiguration.LINUX_CONTAINER_RUNTIME_ALLOWED_RUNTIMES);
+      if (runtimes.contains("docker")) {
+        LOG.info(HEALTH_CHECK_DISABLED);
+      } else {
+        LOG.debug(HEALTH_CHECK_DISABLED);
+      }
+      return null;
+    }
+    LOG.info("Docker health checker service enabled");
+    boolean startupMode = conf.getBoolean(
+        YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_STARTUP,
+        YarnConfiguration.DEFAULT_NM_DOCKER_HEALTH_CHECKER_STARTUP);
+    long intervalMs = conf.getLong(
+        YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS,
+        YarnConfiguration.DEFAULT_NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS);
+    return new DockerHealthCheckerService(startupMode, intervalMs);
+  }
+
+  class DockerDaemonMonitorExecutor extends TimerTask {
+    List<File> getPossiblePidFileLocations() {
+      return ImmutableList.of(
+          new File("/var/run/docker.pid"),
+          new File("/var/docker.pid"),
+          new File("/run/docker.pid"));
+    }
+
+    File getDockerDaemonConf() {
+      return new File("/etc/docker/daemon.json");
+    }
+
+    @Override
+    public void run() {
+      long now = System.currentTimeMillis();
+
+      // 1. Check whether docker.pid file exists on the default location
+      // or on some other OS-specific default location for docker.pid
+      for (File pidFile : getPossiblePidFileLocations()) {
+        if (pidFile.exists() && !pidFile.isDirectory()) {
+          setHealthStatus(true, "", now);
+          return;
+        }
+      }
+
+      // 2. Let's check whether the pid file location is configured
+      // in the Docker daemon's json file
+      File dockerDaemonConf = getDockerDaemonConf();
+      if (dockerDaemonConf.exists() && dockerDaemonConf.isFile()) {
+        try {
+          String jsonString = FileUtils.readFileToString(dockerDaemonConf,
+              Charset.defaultCharset());
+          try (Scanner scanner = new Scanner(jsonString)) {
+            while (scanner.hasNextLine()) {
+              String line = scanner.nextLine();
+              if (line.contains("pidfile")) {
+                String[] parts = line.split(":");
+                if (parts.length >= 2) {
+                  String loc = parts[1].replace("\"", "").trim();
+                  // drop the comma separating this entry from the next key
+                  if (loc.endsWith(",")) {
+                    loc = loc.substring(0, loc.length() - 1).trim();
+                  }
+                  File configuredPidFile = new File(loc);
+                  if (configuredPidFile.exists() &&
+                      !configuredPidFile.isDirectory()) {
+                    setHealthStatus(true, "", now);
+                    // healthy: must not fall through to the verdict below
+                    return;
+                  }
+                }
+              }
+            }
+          }
+        } catch (IOException ignore) {
+          // daemon.json is unreadable: handled like a missing pid file
+        }
+      }
+
+      // 3. Conclude that the Docker daemon is not running
+      setHealthStatus(false, NO_PID_FILE);
+    }
+  }
+
+  @Override
+  public void serviceInit(Configuration conf) throws Exception {
+    super.serviceInit(conf);
+    if (startupMode) {
+      TimerTask task = new DockerDaemonMonitorExecutor();
+      task.run();
+      if (!isHealthy()) {
+        throw new Exception("Haven't detected running Docker daemon " +
+            "during startup!");
+      }
+    }
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java
new file mode 100644
index 00000000000..da56bcf8334
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/HealthReporter.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.health;
+
+/**
+ * Interface providing information about the healthiness of a service.
+ *
+ * Associated pieces of information:
+ * <ul>
+ *   <li>whether the service is healthy</li>
+ *   <li>report of the healthiness</li>
+ *   <li>latest timestamp of the health check</li>
+ * </ul>
+ *
+ * Classes implementing this interface are used in
+ * {@link NodeHealthCheckerService}.
+ *
+ * @see TimedHealthReporterService
+ * @see org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService
+ */
+public interface HealthReporter {
+
+  /**
+   * Gets whether this reporter considers the node healthy or not.
+   *
+   * @return true if node is healthy
+   */
+  boolean isHealthy();
+
+  /**
+   * Returns output from health check. If node is healthy then an empty string
+   * is returned.
+   *
+   * @return output from health check
+   */
+  String getHealthReport();
+
+  /**
+   * Returns the timestamp of the latest run of this reporter's health check.
+   *
+   * @return timestamp when the health check was last run
+   */
+  long getLastHealthReportTime();
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java
new file mode 100644
index 00000000000..92bcd4ddca9
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthCheckerService.java
@@ -0,0 +1,134 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.base.Strings; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.CompositeService; +import org.apache.hadoop.service.Service; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +/** + * The class which provides functionality of checking the health of the node and + * reporting back to the service for which the health checker has been asked to + * report. 
+ */
+public class NodeHealthCheckerService extends CompositeService {
+
+  private List<HealthReporter> reporters;
+  private LocalDirsHandlerService dirsHandler;
+  private Exception nodeHealthException;
+  private long nodeHealthExceptionReportTime;
+
+  public static final String SEPARATOR = ";";
+
+  public NodeHealthCheckerService(LocalDirsHandlerService dirHandlerService) {
+    super(NodeHealthCheckerService.class.getName());
+
+    this.reporters = new ArrayList<>();
+    this.dirsHandler = dirHandlerService;
+  }
+
+  @Override
+  protected void serviceInit(Configuration conf) throws Exception {
+    addHealthReporter(dirsHandler);
+    // newInstance returns null for a checker that is not enabled or not
+    // runnable; addHealthReporter silently skips null services
+    addHealthReporter(NodeHealthScriptRunner.newInstance(conf));
+    addHealthReporter(DockerHealthCheckerService.newInstance(conf));
+
+    super.serviceInit(conf);
+  }
+
+  /**
+   * Adds a {@link Service} implementing the {@link HealthReporter} interface,
+   * if that service has not been added to this {@link CompositeService} yet.
+   *
+   * @param service to add; null and already registered names are ignored
+   * @throws Exception if the provided service does not implement
+   *         the {@link HealthReporter} interface
+   */
+  @VisibleForTesting
+  void addHealthReporter(Service service) throws Exception {
+    if (service != null && getServices().stream()
+        .noneMatch(x -> x.getName().equals(service.getName()))) {
+      if (!(service instanceof HealthReporter)) {
+        throw new Exception("Attempted to add service to " +
+            "NodeHealthCheckerService that does not implement HealthReporter.");
+      }
+      reporters.add((HealthReporter) service);
+      addService(service);
+    }
+  }
+
+  /**
+   * @return the reports of all reporters, joined by {@link #SEPARATOR}
+   */
+  public String getHealthReport() {
+    List<String> reports = reporters.stream()
+        .map(reporter -> Strings.emptyToNull(reporter.getHealthReport()))
+        .collect(Collectors.toCollection(ArrayList::new));
+    reports.add(Strings.emptyToNull(
+        nodeHealthException == null ? null :
+        nodeHealthException.getMessage()));
+    return Joiner.on(SEPARATOR).skipNulls().join(reports);
+  }
+
+  /**
+   * @return true if no exception was reported and all reporters are healthy
+   */
+  public boolean isHealthy() {
+    return nodeHealthException == null &&
+        reporters.stream().allMatch(HealthReporter::isHealthy);
+  }
+
+  /**
+   * @return the latest report time of the reporters and reported exception
+   */
+  public long getLastHealthReportTime() {
+    Optional<Long> max = reporters.stream()
+        .map(HealthReporter::getLastHealthReportTime).max(Long::compareTo);
+    return Long.max(
+        max.orElse(nodeHealthExceptionReportTime),
+        nodeHealthExceptionReportTime);
+  }
+
+  /**
+   * @return the disk handler
+   */
+  public LocalDirsHandlerService getDiskHandler() {
+    return dirsHandler;
+  }
+
+  /**
+   * Report an exception to mark the node as unhealthy.
+   * @param ex the exception that makes the node unhealthy
+   */
+  public void reportException(Exception ex) {
+    nodeHealthException = ex;
+    nodeHealthExceptionReportTime = System.currentTimeMillis();
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java
similarity index 60%
rename from hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java
rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java
index f2a5b242a8d..8bab8bfc522 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/NodeHealthScriptRunner.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java
@@ -16,7 +16,7 @@
  * limitations under the License.
*/ -package org.apache.hadoop.util; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.IOException; @@ -27,51 +27,59 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.Shell.ExitCodeException; import org.apache.hadoop.util.Shell.ShellCommandExecutor; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * * The class which provides functionality of checking the health of the node * using the configured node health script and reporting back to the service * for which the health checker has been asked to report. */ -public class NodeHealthScriptRunner extends AbstractService { +public class NodeHealthScriptRunner extends TimedHealthReporterService { private static final Logger LOG = LoggerFactory.getLogger(NodeHealthScriptRunner.class); /** Absolute path to the health script. */ private String nodeHealthScript; - /** Delay after which node health script to be executed */ - private long intervalTime; - /** Time after which the script should be timedout */ + /** Time after which the script should be timedout. */ private long scriptTimeout; - /** Timer used to schedule node health monitoring script execution */ - private Timer nodeHealthScriptScheduler; - - /** ShellCommandExecutor used to execute monitoring script */ - ShellCommandExecutor shexec = null; + /** ShellCommandExecutor used to execute monitoring script. 
*/ + private ShellCommandExecutor shexec = null; /** Pattern used for searching in the output of the node health script */ - static private final String ERROR_PATTERN = "ERROR"; + private static final String ERROR_PATTERN = "ERROR"; /** Time out error message */ - public static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = "Node health script timed out"; - - private boolean isHealthy; - - private String healthReport; - - private long lastReportedTime; + static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = + "Node health script timed out"; + + public static NodeHealthScriptRunner newInstance(Configuration conf) { + String nodeHealthScript = + conf.get(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH); + if(!shouldRun(nodeHealthScript)) { + LOG.info("Node Manager health check script is not available " + + "or doesn't have execute permission, so not " + + "starting the node health script runner."); + return null; + } + long nmCheckintervalTime = conf.getLong( + YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, + YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS); + long scriptTimeout = conf.getLong( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, + YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS); + String[] scriptArgs = conf.getStrings( + YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS, new String[] {}); + return new NodeHealthScriptRunner(nodeHealthScript, + nmCheckintervalTime, scriptTimeout, scriptArgs); + } - private TimerTask timer; - private enum HealthCheckerExitStatus { SUCCESS, TIMED_OUT, @@ -84,13 +92,11 @@ /** * Class which is used by the {@link Timer} class to periodically execute the * node health script. 
- * */ private class NodeHealthMonitorExecutor extends TimerTask { + private String exceptionStackTrace = ""; - String exceptionStackTrace = ""; - - public NodeHealthMonitorExecutor(String[] args) { + NodeHealthMonitorExecutor(String[] args) { ArrayList execScript = new ArrayList(); execScript.add(nodeHealthScript); if (args != null) { @@ -134,20 +140,21 @@ public void run() { /** * Method which is used to parse output from the node health monitor and * send to the report address. - * + * * The timed out script or script which causes IOException output is * ignored. - * + * * The node is marked unhealthy if *
<ol>
    * <li>The node health script times out</li>
-   * <li>The node health scripts output has a line which begins with ERROR</li>
+   * <li>The node health scripts output has a line which begins
+   * with ERROR</li>
    * <li>An exception is thrown while executing the script</li>
    * </ol>
* If the script throws {@link IOException} or {@link ExitCodeException} the * output is ignored and node is left remaining healthy, as script might * have syntax error. - * + * * @param status */ void reportHealthStatus(HealthCheckerExitStatus status) { @@ -169,12 +176,15 @@ void reportHealthStatus(HealthCheckerExitStatus status) { case FAILED: setHealthStatus(false, shexec.getOutput()); break; + default: + LOG.info("Unknown HealthCheckerExitStatus - ignored."); + break; } } /** * Method to check if the output string has line which begins with ERROR. - * + * * @param output * string * @return true if output string has error pattern in it. @@ -190,150 +200,42 @@ private boolean hasErrors(String output) { } } - public NodeHealthScriptRunner(String scriptName, long chkInterval, long timeout, + NodeHealthScriptRunner(String scriptName, long chkInterval, long timeout, String[] scriptArgs) { - super(NodeHealthScriptRunner.class.getName()); - this.lastReportedTime = System.currentTimeMillis(); - this.isHealthy = true; - this.healthReport = ""; + super(NodeHealthScriptRunner.class.getName(), chkInterval); + this.nodeHealthScript = scriptName; - this.intervalTime = chkInterval; this.scriptTimeout = timeout; - this.timer = new NodeHealthMonitorExecutor(scriptArgs); + setTimerTask(new NodeHealthMonitorExecutor(scriptArgs)); } - /* - * Method which initializes the values for the script path and interval time. - */ @Override - protected void serviceInit(Configuration conf) throws Exception { - super.serviceInit(conf); - } - - /** - * Method used to start the Node health monitoring. - * - */ - @Override - protected void serviceStart() throws Exception { - nodeHealthScriptScheduler = new Timer("NodeHealthMonitor-Timer", true); - // Start the timer task immediately and - // then periodically at interval time. - nodeHealthScriptScheduler.scheduleAtFixedRate(timer, 0, intervalTime); - super.serviceStart(); - } - - /** - * Method used to terminate the node health monitoring service. 
- * - */ - @Override - protected void serviceStop() { - if (nodeHealthScriptScheduler != null) { - nodeHealthScriptScheduler.cancel(); - } + public void serviceStop() throws Exception { if (shexec != null) { Process p = shexec.getProcess(); if (p != null) { p.destroy(); } } - } - - /** - * Gets the if the node is healthy or not - * - * @return true if node is healthy - */ - public boolean isHealthy() { - return isHealthy; - } - - /** - * Sets if the node is healthy or not considering disks' health also. - * - * @param isHealthy - * if or not node is healthy - */ - private synchronized void setHealthy(boolean isHealthy) { - this.isHealthy = isHealthy; - } - - /** - * Returns output from health script. if node is healthy then an empty string - * is returned. - * - * @return output from health script - */ - public String getHealthReport() { - return healthReport; - } - - /** - * Sets the health report from the node health script. Also set the disks' - * health info obtained from DiskHealthCheckerService. - * - * @param healthReport - */ - private synchronized void setHealthReport(String healthReport) { - this.healthReport = healthReport; - } - - /** - * Returns time stamp when node health script was last run. - * - * @return timestamp when node health script was last run - */ - public long getLastReportedTime() { - return lastReportedTime; - } - - /** - * Sets the last run time of the node health script. - * - * @param lastReportedTime - */ - private synchronized void setLastReportedTime(long lastReportedTime) { - this.lastReportedTime = lastReportedTime; + super.serviceStop(); } /** * Method used to determine if or not node health monitoring service should be * started or not. Returns true if following conditions are met: - * + * *
<ol>
    * <li>Path to Node health check script is not empty</li>
    * <li>Node health check script file exists</li>
    * </ol>
- * + * * @return true if node health monitoring service can be started. */ - public static boolean shouldRun(String healthScript) { + static boolean shouldRun(String healthScript) { if (healthScript == null || healthScript.trim().isEmpty()) { return false; } File f = new File(healthScript); return f.exists() && FileUtil.canExecute(f); } - - private synchronized void setHealthStatus(boolean isHealthy, String output) { - LOG.info("health status being set as " + output); - this.setHealthy(isHealthy); - this.setHealthReport(output); - } - - private synchronized void setHealthStatus(boolean isHealthy, String output, - long time) { - LOG.info("health status being set as " + output); - this.setHealthStatus(isHealthy, output); - this.setLastReportedTime(time); - } - - /** - * Used only by tests to access the timer task directly - * @return the timer task - */ - public TimerTask getTimerTask() { - return timer; - } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java new file mode 100644 index 00000000000..6829ed597b9 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.service.AbstractService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Timer; +import java.util.TimerTask; + +/** + * A {@link HealthReporter} skeleton for regularly checking a specific + * {@link TimerTask} and obtaining information about it. + */ +public abstract class TimedHealthReporterService extends AbstractService + implements HealthReporter { + + private static final Logger LOG = + LoggerFactory.getLogger(TimedHealthReporterService.class); + + private boolean isHealthy; + private String healthReport; + private long lastReportedTime; + + private Timer timer; + private TimerTask task; + private long intervalMs; + + TimedHealthReporterService(String name, long intervalMs) { + super(name); + this.isHealthy = true; + this.healthReport = ""; + this.lastReportedTime = System.currentTimeMillis(); + this.intervalMs = intervalMs; + } + + @VisibleForTesting + long getIntervalMs() { + return intervalMs; + } + + @VisibleForTesting + void setTimerTask(TimerTask task) { + this.task = task; + } + + @VisibleForTesting + TimerTask getTimerTask() { + return task; + } + + /** + * Method used to start the health monitoring. 
+ */ + @Override + public void serviceStart() throws Exception { + if (task == null) { + throw new Exception("Health reporting task hasn't been set!"); + } + timer = new Timer("HealthReporterService-Timer", true); + timer.scheduleAtFixedRate(task, 0, intervalMs); + super.serviceStart(); + } + + /** + * Method used to terminate the health monitoring service. + */ + @Override + protected void serviceStop() throws Exception { + if (timer != null) { + timer.cancel(); + } + super.serviceStop(); + } + + @Override + public boolean isHealthy() { + return isHealthy; + } + + /** + * Sets if the node is healthy or not. + * + * @param healthy + * whether the node is healthy + */ + protected synchronized void setHealthy(boolean healthy) { + this.isHealthy = healthy; + } + + @Override + public String getHealthReport() { + return healthReport; + } + + /** + * Stores the given health report text, replacing the previously stored + * report. + * + * @param report the report text to store + */ + private synchronized void setHealthReport(String report) { + this.healthReport = report; + } + + @Override + public long getLastHealthReportTime() { + return lastReportedTime; + } + + /** + * Sets the last run time of the node health check. 
+ * + * @param lastReportedTime + */ + synchronized void setLastReportedTime(long lastReportedTime) { + this.lastReportedTime = lastReportedTime; + } + + synchronized void setHealthStatus(boolean healthy, String output) { + LOG.info("health status being set as " + output); + this.setHealthy(healthy); + this.setHealthReport(output); + } + + synchronized void setHealthStatus(boolean healthy, String output, + long time) { + LOG.info("health status being set as " + output); + this.setHealthStatus(healthy, output); + this.setLastReportedTime(time); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java index 2e80259d210..81d5fd23823 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/MockNodeStatusUpdater.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.nodemanager; import java.io.IOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,6 +39,7 @@ import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java index 13b3ee91bdc..fcb76f9e75f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java @@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.junit.Assert; import org.junit.Before; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java index 54e090a29e2..b1fc2f1aa26 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java @@ -45,6 +45,7 @@ import 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; @@ -102,8 +103,8 @@ public int getHttpPort() { DeletionService del = new DeletionService(exec); Dispatcher dispatcher = new AsyncDispatcher(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); NodeManagerMetrics metrics = NodeManagerMetrics.create(); NodeStatusUpdater nodeStatusUpdater = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java index fbd3646940d..260c3c4144c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java @@ -64,6 +64,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.deletion.task.FileDeletionMatcher; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Assert; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java index 25cca876ac6..9eae82a9322 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java @@ -86,6 +86,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java index 25dbc1dd2ea..9a0213d87cf 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java @@ -66,6 +66,7 @@ import org.apache.hadoop.yarn.security.NMTokenIdentifier; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.ConverterUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 1b21b936543..70317c73f32 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -107,6 +107,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java index 325d60c59be..072f4432c62 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForAttributes.java @@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java index a86ca3e8211..e3dce3b5b51 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdaterForLabels.java @@ -50,6 +50,7 @@ import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.junit.After; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java index 15c1cac9cb8..b0756c40a31 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java @@ -75,7 +75,7 @@ import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import 
org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalRMInterface; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; @@ -218,8 +218,7 @@ public void setup() throws IOException { delSrvc.init(conf); dirsHandler = new LocalDirsHandlerService(); - nodeHealthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); nodeHealthChecker.init(conf); containerManager = createContainerManager(delSrvc); ((NMContext)context).setContainerManager(containerManager); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index e920105abf9..00ce28b1720 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -85,8 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import 
org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeManager; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -106,7 +105,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler.ContainerScheduler; - import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.TestNodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService; @@ -157,8 +155,7 @@ public void setup() throws IOException { delSrvc.init(conf); exec = createContainerExecutor(); dirsHandler = new LocalDirsHandlerService(); - nodeHealthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); nodeHealthChecker.init(conf); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java index 28f917fd842..784a73c44d9 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java @@ -30,7 +30,7 @@ import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java new file mode 100644 index 00000000000..c66b14862cb --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestDockerHealthCheckerService.java @@ -0,0 +1,125 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.health; + +import com.google.common.collect.ImmutableList; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.util.List; +import java.util.TimerTask; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.spy; + +public class TestDockerHealthCheckerService { + + private static final File TEST_ROOT_DIR = new File("target", + TestDockerHealthCheckerService.class.getName() + + "-localDir").getAbsoluteFile(); + + private static final File PID_FILE = new File(TEST_ROOT_DIR, "docker.pid"); + private static final File JSON_CONF = new File(TEST_ROOT_DIR, "daemon.json"); + + @Before + public void setup() { + assertThat(TEST_ROOT_DIR.mkdirs()).isTrue(); + } + + @After + public void tearDown() throws Exception { + if (TEST_ROOT_DIR.exists()) { + FileContext.getLocalFSFileContext().delete( + new Path(TEST_ROOT_DIR.getAbsolutePath()), true); + } + } + + /*private void setupFolderWithFile(File file, String content) { + }*/ + + @Test + public void checkNewInstanceCreation() { + Configuration conf = new Configuration(); + 
DockerHealthCheckerService service = + DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNull(); + + conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_ENABLE, "true"); + service = DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + assertThat(service.getStartupMode()).isFalse(); + assertThat(service.getIntervalMs()).isEqualTo(600000); + + conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_STARTUP, "true"); + service = DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + assertThat(service.getStartupMode()).isTrue(); + assertThat(service.getIntervalMs()).isEqualTo(600000); + + conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_INTERVAL_MS, "16500"); + service = DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + assertThat(service.getStartupMode()).isTrue(); + assertThat(service.getIntervalMs()).isEqualTo(16500); + } + + @Test + public void testHealthChecker() throws Exception { + Configuration conf = new Configuration(); + conf.set(YarnConfiguration.NM_DOCKER_HEALTH_CHECKER_ENABLE, "true"); + + DockerHealthCheckerService service = + DockerHealthCheckerService.newInstance(conf); + assertThat(service).isNotNull(); + service.serviceInit(conf); + + TimerTask task = service.getTimerTask(); + assertThat(task).isInstanceOf( + DockerHealthCheckerService.DockerDaemonMonitorExecutor.class); + DockerHealthCheckerService.DockerDaemonMonitorExecutor executor = + spy((DockerHealthCheckerService.DockerDaemonMonitorExecutor) task); + + // stub the pid file lookup on the spy and install the spy in the service + List<File> files = ImmutableList.of(new File("/path/to/file")); + doReturn(files).when(executor).getPossiblePidFileLocations(); + service.setTimerTask(executor); + + // no pid or json file + executor.run(); + assertThat(service.isHealthy()).isFalse(); + assertThat(service.getHealthReport()) + .isEqualTo(DockerHealthCheckerService.NO_PID_FILE); + + // existing json file without pid file location configured + // do something + + // existing
json file with existing pid file location configured + + // existing json file without existing pid file location configured + + // existing pid file + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java similarity index 90% rename from hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java index 2748c0b581a..cfb6770fbc4 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestNodeHealthScriptRunner.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthScriptRunner.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hadoop.util; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.FileOutputStream; @@ -28,14 +28,18 @@ import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.Shell; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +/** + * Test class for {@link NodeHealthScriptRunner}.
+ */ public class TestNodeHealthScriptRunner { - protected static File testRootDir = new File("target", + private static File testRootDir = new File("target", TestNodeHealthScriptRunner.class.getName() + "-localDir").getAbsoluteFile(); @@ -55,8 +59,8 @@ public void tearDown() throws Exception { } } - private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) - throws IOException { + private void writeNodeHealthScriptFile(String scriptStr, + boolean setExecutable) throws IOException { PrintWriter pw = null; try { FileUtil.setWritable(nodeHealthscriptFile, true); @@ -92,8 +96,8 @@ public void testNodeHealthScript() throws Exception { String errorScript = "echo ERROR\n echo \"Tracker not healthy\""; String normalScript = "echo \"I am all fine\""; String timeOutScript = - Shell.WINDOWS ? "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\"" - : "sleep 4\necho \"I am fine\""; + Shell.WINDOWS ? "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\"" + : "sleep 4\necho \"I am fine\""; String exitCodeScript = "exit 127"; Configuration conf = new Configuration(); @@ -118,7 +122,7 @@ public void testNodeHealthScript() throws Exception { nodeHealthScriptRunner.isHealthy()); Assert.assertTrue( nodeHealthScriptRunner.getHealthReport().contains("ERROR")); - + // Healthy script. 
writeNodeHealthScriptFile(normalScript, true); timerTask.run(); @@ -130,7 +134,7 @@ public void testNodeHealthScript() throws Exception { writeNodeHealthScriptFile(timeOutScript, true); timerTask.run(); Assert.assertFalse("Node health status reported healthy even after timeout", - nodeHealthScriptRunner.isHealthy()); + nodeHealthScriptRunner.isHealthy()); Assert.assertEquals( NodeHealthScriptRunner.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG, nodeHealthScriptRunner.getHealthReport()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthService.java similarity index 88% rename from hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthService.java index 8083a563773..5c54d9108c6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/health/TestNodeHealthService.java @@ -16,12 +16,14 @@ * limitations under the License. 
*/ -package org.apache.hadoop.yarn.server.nodemanager; +package org.apache.hadoop.yarn.server.nodemanager.health; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; + +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,7 +33,6 @@ import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.factories.RecordFactory; @@ -42,18 +43,19 @@ import org.junit.Before; import org.junit.Test; +import static org.junit.Assert.fail; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.spy; public class TestNodeHealthService { - private static volatile Logger LOG = + private static final Logger LOG = LoggerFactory.getLogger(TestNodeHealthService.class); - protected static File testRootDir = new File("target", + private static File testRootDir = new File("target", TestNodeHealthService.class.getName() + "-localDir").getAbsoluteFile(); - final static File nodeHealthConfigFile = new File(testRootDir, + private final static File nodeHealthConfigFile = new File(testRootDir, "modified-mapred-site.xml"); private File nodeHealthscriptFile = new File(testRootDir, @@ -71,9 +73,9 @@ public void tearDown() throws Exception { new Path(testRootDir.getAbsolutePath()), true); } } - - private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) - throws IOException { + + private void writeNodeHealthScriptFile(String scriptStr, + boolean setExecutable) throws IOException { PrintWriter pw = null; try { FileUtil.setWritable(nodeHealthscriptFile, true); @@ -116,9 +118,14 @@ public void testNodeHealthService() throws Exception { LocalDirsHandlerService dirsHandler = new 
LocalDirsHandlerService(); NodeHealthScriptRunner nodeHealthScriptRunner = - spy(NodeManager.getNodeHealthScriptRunner(conf)); - NodeHealthCheckerService nodeHealthChecker = new NodeHealthCheckerService( - nodeHealthScriptRunner, dirsHandler); + NodeHealthScriptRunner.newInstance(conf); + if (nodeHealthScriptRunner == null) { + fail("Should have created NodeHealthScriptRunner instance"); + } + nodeHealthScriptRunner = spy(nodeHealthScriptRunner); + NodeHealthCheckerService nodeHealthChecker = + new NodeHealthCheckerService(dirsHandler); + nodeHealthChecker.addHealthReporter(nodeHealthScriptRunner); nodeHealthChecker.init(conf); doReturn(true).when(nodeHealthScriptRunner).isHealthy(); @@ -133,7 +140,7 @@ public void testNodeHealthService() throws Exception { Assert.assertTrue("Node health status reported unhealthy", healthStatus .getHealthReport().equals(nodeHealthChecker.getHealthReport())); - doReturn(false).when(nodeHealthScriptRunner).isHealthy(); + doReturn(false).when(nodeHealthScriptRunner).isHealthy(); // update health status setHealthStatus(healthStatus, nodeHealthChecker.isHealthy(), nodeHealthChecker.getHealthReport(), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java index ece1af4a260..a6f7cec5d8a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java @@ -43,7 +43,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.nativeio.NativeIO; 
import org.apache.hadoop.security.UserGroupInformation; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -57,7 +56,7 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -80,9 +79,8 @@ public class TestContainerLogsPage { private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } @Test(timeout=30000) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java index 0d618fde10f..914bda7018b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java @@ -20,11 +20,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; @@ -105,8 +104,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService( - conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); @@ -120,12 +118,9 @@ public boolean isPmemCheckEnabled() { } } - private NodeHealthCheckerService createNodeHealthCheckerService( - Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner( - conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } @Test diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java index 0a71a9179bb..232cc127044 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java @@ -28,7 +28,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -42,7 +41,7 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -54,7 +53,6 @@ import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; -import org.apache.hadoop.yarn.util.ConverterUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ 
-79,10 +77,9 @@ public void tearDown() { FileUtil.fullyDelete(testLogDir); } - private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } private int startNMWebAppServer(String webAddr) { @@ -113,7 +110,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); @@ -176,7 +173,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); - NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); + NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java index ad17ae81322..f591bad00d8 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java @@ -47,7 +47,7 @@ import org.apache.hadoop.yarn.logaggregation.TestContainerLogsUtils; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; @@ -141,8 +141,8 @@ protected void configureServlets() { conf.set(YarnConfiguration.YARN_LOG_SERVER_WEBSERVICE_URL, LOGSERVICEWSADDR); dirsHandler = new LocalDirsHandlerService(); - NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + NodeHealthCheckerService healthChecker = + new NodeHealthCheckerService(dirsHandler); healthChecker.init(conf); aclsManager = new ApplicationACLsManager(conf); nmContext = new NodeManager.NMContext(null, null, dirsHandler, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java index 3533d16849d..041036750fe 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java @@ -47,13 +47,13 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.AppsInfo; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; @@ -105,7 +105,7 @@ protected void configureServlets() { conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); aclsManager = new ApplicationACLsManager(conf); diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java index 4ee63db8177..cd476c25338 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java @@ -44,11 +44,11 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices; import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecord; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; @@ -125,7 +125,7 @@ public boolean isPmemCheckEnabled() { conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + dirsHandler); 
healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); ApplicationACLsManager aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java index a99ce280381..aacdf4c7138 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java @@ -28,7 +28,6 @@ import java.io.File; import java.io.IOException; import java.io.StringReader; -import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -48,16 +47,15 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import 
org.apache.hadoop.yarn.server.utils.BuilderUtils; -import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; import org.apache.hadoop.yarn.webapp.GuiceServletConfig; import org.apache.hadoop.yarn.webapp.JerseyTestBase; @@ -132,7 +130,7 @@ public boolean isPmemCheckEnabled() { conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( - NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); + dirsHandler); healthChecker.init(conf); dirsHandler = healthChecker.getDiskHandler(); aclsManager = new ApplicationACLsManager(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java index ec7d62c803c..d4180e48251 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebTerminal.java @@ -26,13 +26,12 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.http.JettyUtils; -import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import 
org.apache.hadoop.yarn.server.nodemanager.ResourceView; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.junit.After; import org.junit.Before; @@ -54,12 +53,9 @@ private WebServer server; private int port; - private NodeHealthCheckerService createNodeHealthCheckerService( - Configuration conf) { - NodeHealthScriptRunner scriptRunner = NodeManager - .getNodeHealthScriptRunner(conf); + private NodeHealthCheckerService createNodeHealthCheckerService() { LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); - return new NodeHealthCheckerService(scriptRunner, dirsHandler); + return new NodeHealthCheckerService(dirsHandler); } private int startNMWebAppServer(String webAddr) { @@ -90,7 +86,7 @@ public boolean isPmemCheckEnabled() { }; conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, TESTLOGDIR.getAbsolutePath()); - healthChecker = createNodeHealthCheckerService(conf); + healthChecker = createNodeHealthCheckerService(); healthChecker.init(conf); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java index 68d97ee32ac..fdcefcbc350 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java @@ -70,7 +70,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import 
org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; -import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md index e4ed57f5cb4..131f0520005 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md @@ -42,6 +42,17 @@ The following configuration parameters can be used to modify the disk checks: | `yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage` | Float between 0-100 | The maximum percentage of disk space that may be utilized before a disk is marked as unhealthy by the disk checker service. This check is run for every disk used by the NodeManager. The default value is 90 i.e. 90% of the disk can be used. | | `yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb` | Integer | The minimum amount of free space that must be available on the disk for the disk checker service to mark the disk as healthy. This check is run for every disk used by the NodeManager. The default value is 0 i.e. the entire disk can be used. | + +### Docker health checker + +If the Docker on Yarn feature is enabled, you can enable the Docker health checker service. The Docker health checker checks the availability of the Docker daemon in the host, marking the node as unhealthy if the daemon is not responding. 
+ +| Configuration Name | Allowed Values | Description | +|:---- |:---- |:---- | +| `yarn.nodemanager.docker-health-checker.enable` | true, false | Enable or disable the Docker health checker service. Default is false | +| `yarn.nodemanager.docker-health-checker.startup` | true, false | The NodeManager will fail to come up if the Docker daemon is not responding during startup. Default is false | +| `yarn.nodemanager.docker-health-checker.interval-ms` | Positive integer | The interval, in milliseconds, at which the Docker checker should run; the default value is 10 minutes | + ###External Health Script Users may specify their own health checker script that will be invoked by the health checker service. Users may specify a timeout as well as options to be passed to the script. If the script times out, results in an exception being thrown or outputs a line which begins with the string ERROR, the node is marked as unhealthy. Please note that: -- 2.21.0