diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index d525e4d2f4e..59c5aad1313 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1992,7 +1992,10 @@ public static boolean isAclEnabled(Configuration conf) { * A configurable value to pass to the Docker Stop command. This value * defines the number of seconds between the docker stop command sending * a SIGTERM and a SIGKILL. + * + * @deprecated use {@link YarnConfiguration#NM_SLEEP_DELAY_BEFORE_SIGKILL_MS} */ + @Deprecated public static final String NM_DOCKER_STOP_GRACE_PERIOD = DOCKER_CONTAINER_RUNTIME_PREFIX + "stop.grace-period"; @@ -2000,6 +2003,7 @@ public static boolean isAclEnabled(Configuration conf) { * The default value for the grace period between the SIGTERM and the * SIGKILL in the Docker Stop command. */ + @Deprecated public static final int DEFAULT_NM_DOCKER_STOP_GRACE_PERIOD = 10; /** The default list of read-only mounts to be bind-mounted into all diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java index 0ae3d0fa755..9591e4602f6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java @@ -58,7 +58,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerClient; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerInspectCommand; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerStopCommand; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntime; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; @@ -245,7 +244,6 @@ private int userRemappingGidThreshold; private Set capabilities; private boolean delayedRemovalAllowed; - private int dockerStopGracePeriod; private Set defaultROMounts = new HashSet<>(); private Set defaultRWMounts = new HashSet<>(); private Set defaultTmpfsMounts = new HashSet<>(); @@ -356,10 +354,6 @@ public void initialize(Configuration conf, Context nmContext) YarnConfiguration.NM_DOCKER_ALLOW_DELAYED_REMOVAL, YarnConfiguration.DEFAULT_NM_DOCKER_ALLOW_DELAYED_REMOVAL); - dockerStopGracePeriod = conf.getInt( - YarnConfiguration.NM_DOCKER_STOP_GRACE_PERIOD, - YarnConfiguration.DEFAULT_NM_DOCKER_STOP_GRACE_PERIOD); - defaultROMounts.addAll(Arrays.asList( conf.getTrimmedStrings( YarnConfiguration.NM_DOCKER_DEFAULT_RO_MOUNTS))); @@ -1079,7 +1073,7 @@ public void signalContainer(ContainerRuntimeContext ctx) if (ContainerExecutor.Signal.NULL.equals(signal)) { executeLivelinessCheck(ctx); } else if (ContainerExecutor.Signal.TERM.equals(signal)) { - String containerId = ctx.getContainer().getContainerId().toString(); + ContainerId containerId = ctx.getContainer().getContainerId(); handleContainerStop(containerId, env); } else { handleContainerKill(ctx, env, signal); @@ -1132,14 +1126,7 @@ public void reapContainer(ContainerRuntimeContext ctx) DockerInspectCommand inspectCommand = new DockerInspectCommand(containerIdStr).getIpAndHost(); try { - String commandFile = dockerClient.writeCommandToTempFile(inspectCommand, - containerId, nmContext); - PrivilegedOperation privOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.RUN_DOCKER_CMD); - privOp.appendArgs(commandFile); - String output = privilegedOperationExecutor - .executePrivilegedOperation(null, privOp, null, - null, true, false); + String output = executeDockerInspect(containerId, inspectCommand); LOG.info("Docker inspect output for " + containerId + ": " + output); // strip off quotes if any output = output.replaceAll("['\"]", ""); @@ -1261,25 +1248,76 @@ private void executeLivelinessCheck(ContainerRuntimeContext ctx) } } - private void handleContainerStop(String containerId, Map env) + /** + * Handles a docker container stop by first finding the {@code STOPSIGNAL} + * using docker inspect and then executing + * {@code docker kill --signal=}. + * It doesn't rely on the docker stop because that sends a {@code SIGKILL} + * to the root process in the container after the {@code STOPSIGNAL}.The grace + * period which the docker stop uses has granularity in seconds. However, NM + * is designed to explicitly send a {@code SIGKILL} to the containers after a + * grace period which has a granularity of millis. It doesn't want the docker + * stop to send {@code SIGKILL} but docker stop has no option to disallow + * that. + * + * @param containerId container id + * @param env env + * @throws ContainerExecutionException + */ + private void handleContainerStop(ContainerId containerId, + Map env) throws ContainerExecutionException { + DockerCommandExecutor.DockerContainerStatus containerStatus = - DockerCommandExecutor.getContainerStatus(containerId, - privilegedOperationExecutor, nmContext); + DockerCommandExecutor.DockerContainerStatus.UNKNOWN; + String stopSignal = ContainerExecutor.Signal.TERM.toString(); + char delimiter = ','; + DockerInspectCommand inspectCommand = + new DockerInspectCommand(containerId.toString()).get(new String[] { + DockerInspectCommand.STATUS_TEMPLATE, + DockerInspectCommand.STOPSIGNAL_TEMPLATE}, delimiter); + try { + String output = executeDockerInspect(containerId, inspectCommand); + + if (!output.isEmpty()) { + String[] statusAndSignal = StringUtils.split(output, delimiter); + containerStatus = DockerCommandExecutor.parseContainerStatus( + statusAndSignal[0]); + if (statusAndSignal.length > 1) { + stopSignal = statusAndSignal[1]; + } + } + } catch (ContainerExecutionException | PrivilegedOperationException e) { + LOG.debug("{} inspect failed, skipping stop", containerId, e); + return; + } + if (DockerCommandExecutor.isStoppable(containerStatus)) { - DockerStopCommand dockerStopCommand = new DockerStopCommand( - containerId).setGracePeriod(dockerStopGracePeriod); - DockerCommandExecutor.executeDockerCommand(dockerStopCommand, containerId, - env, privilegedOperationExecutor, false, nmContext); + + DockerKillCommand dockerStopCommand = new DockerKillCommand( + containerId.toString()).setSignal(stopSignal); + DockerCommandExecutor.executeDockerCommand(dockerStopCommand, + containerId.toString(), env, privilegedOperationExecutor, false, + nmContext); } else { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Container status is " + containerStatus.getName() - + ", skipping stop - " + containerId); - } + LOG.debug("{} status is {}, skipping stop", containerId, containerStatus); } } + private String executeDockerInspect(ContainerId containerId, + DockerInspectCommand inspectCommand) throws ContainerExecutionException, + PrivilegedOperationException { + String commandFile = dockerClient.writeCommandToTempFile(inspectCommand, + containerId, nmContext); + PrivilegedOperation privOp = new PrivilegedOperation( + PrivilegedOperation.OperationType.RUN_DOCKER_CMD); + privOp.appendArgs(commandFile); + String output = privilegedOperationExecutor.executePrivilegedOperation(null, + privOp, null, null, true, false); + LOG.info("{} : docker inspect output {} ", containerId, output); + return output; + } + private void handleContainerKill(ContainerRuntimeContext ctx, Map env, ContainerExecutor.Signal signal) throws ContainerExecutionException { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerCommandExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerCommandExecutor.java index 7b6497cd184..f449f730732 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerCommandExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerCommandExecutor.java @@ -113,39 +113,11 @@ public static DockerContainerStatus getContainerStatus(String containerId, PrivilegedOperationExecutor privilegedOperationExecutor, Context nmContext) { try { - DockerContainerStatus dockerContainerStatus; String currentContainerStatus = executeStatusCommand(containerId, privilegedOperationExecutor, nmContext); - if (currentContainerStatus == null) { - dockerContainerStatus = DockerContainerStatus.UNKNOWN; - } else if (currentContainerStatus - .equals(DockerContainerStatus.CREATED.getName())) { - dockerContainerStatus = DockerContainerStatus.CREATED; - } else if (currentContainerStatus - .equals(DockerContainerStatus.RUNNING.getName())) { - dockerContainerStatus = DockerContainerStatus.RUNNING; - } else if (currentContainerStatus - .equals(DockerContainerStatus.STOPPED.getName())) { - dockerContainerStatus = DockerContainerStatus.STOPPED; - } else if (currentContainerStatus - .equals(DockerContainerStatus.RESTARTING.getName())) { - dockerContainerStatus = DockerContainerStatus.RESTARTING; - } else if (currentContainerStatus - .equals(DockerContainerStatus.REMOVING.getName())) { - dockerContainerStatus = DockerContainerStatus.REMOVING; - } else if (currentContainerStatus - .equals(DockerContainerStatus.DEAD.getName())) { - dockerContainerStatus = DockerContainerStatus.DEAD; - } else if (currentContainerStatus - .equals(DockerContainerStatus.EXITED.getName())) { - dockerContainerStatus = DockerContainerStatus.EXITED; - } else if (currentContainerStatus - .equals(DockerContainerStatus.NONEXISTENT.getName())) { - dockerContainerStatus = DockerContainerStatus.NONEXISTENT; - } else { - dockerContainerStatus = DockerContainerStatus.UNKNOWN; - } + DockerContainerStatus dockerContainerStatus = parseContainerStatus( + currentContainerStatus); if (LOG.isDebugEnabled()) { LOG.debug("Container Status: " + dockerContainerStatus.getName() + " ContainerId: " + containerId); @@ -161,6 +133,47 @@ public static DockerContainerStatus getContainerStatus(String containerId, } } + /** + * Parses the container status string. + * + * @param containerStatusStr container status. + * @return a {@link DockerContainerStatus} representing the status. + */ + public static DockerContainerStatus parseContainerStatus( + String containerStatusStr) { + DockerContainerStatus dockerContainerStatus; + if (containerStatusStr == null) { + dockerContainerStatus = DockerContainerStatus.UNKNOWN; + } else if (containerStatusStr + .equals(DockerContainerStatus.CREATED.getName())) { + dockerContainerStatus = DockerContainerStatus.CREATED; + } else if (containerStatusStr + .equals(DockerContainerStatus.RUNNING.getName())) { + dockerContainerStatus = DockerContainerStatus.RUNNING; + } else if (containerStatusStr + .equals(DockerContainerStatus.STOPPED.getName())) { + dockerContainerStatus = DockerContainerStatus.STOPPED; + } else if (containerStatusStr + .equals(DockerContainerStatus.RESTARTING.getName())) { + dockerContainerStatus = DockerContainerStatus.RESTARTING; + } else if (containerStatusStr + .equals(DockerContainerStatus.REMOVING.getName())) { + dockerContainerStatus = DockerContainerStatus.REMOVING; + } else if (containerStatusStr + .equals(DockerContainerStatus.DEAD.getName())) { + dockerContainerStatus = DockerContainerStatus.DEAD; + } else if (containerStatusStr + .equals(DockerContainerStatus.EXITED.getName())) { + dockerContainerStatus = DockerContainerStatus.EXITED; + } else if (containerStatusStr + .equals(DockerContainerStatus.NONEXISTENT.getName())) { + dockerContainerStatus = DockerContainerStatus.NONEXISTENT; + } else { + dockerContainerStatus = DockerContainerStatus.UNKNOWN; + } + return dockerContainerStatus; + } + /** * Execute the docker inspect command to retrieve the docker container's * status. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerInspectCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerInspectCommand.java index e9461610e0d..f457e12a376 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerInspectCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerInspectCommand.java @@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker; +import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; @@ -39,8 +40,8 @@ public DockerInspectCommand(String containerName) { } public DockerInspectCommand getContainerStatus() { - super.addCommandArguments("format", "{{.State.Status}}"); - this.commandArguments = "--format={{.State.Status}}"; + super.addCommandArguments("format", STATUS_TEMPLATE); + this.commandArguments = String.format("--format=%s", STATUS_TEMPLATE); return this; } @@ -54,6 +55,14 @@ public DockerInspectCommand getIpAndHost() { + "{{.IPAddress}},{{end}}{{.Config.Hostname}}"; return this; } + + public DockerInspectCommand get(String[] templates, char delimiter) { + String format = StringUtils.join(delimiter, templates); + super.addCommandArguments("format", format); + this.commandArguments = String.format("--format=%s", format); + return this; + } + @Override public PrivilegedOperation preparePrivilegedOperation( DockerCommand dockerCommand, String containerName, Map dockerCommands = getDockerCommandsForDockerStop( ContainerExecutor.Signal.TERM); - Assert.assertEquals(4, dockerCommands.size()); - Assert.assertEquals("[docker-command-execution]", dockerCommands.get(0)); - Assert.assertEquals(" docker-command=stop", dockerCommands.get(1)); - Assert.assertEquals( - " name=container_e11_1518975676334_14532816_01_000001", - dockerCommands.get(2)); - Assert.assertEquals(" time=10", dockerCommands.get(3)); + verifyStopCommand(dockerCommands, ContainerExecutor.Signal.TERM.toString()); + } + + @Test + @SuppressWarnings("unchecked") + public void testDockerStopWithQuitSignalWhenRunning() + throws ContainerExecutionException, PrivilegedOperationException, + IOException { + when(mockExecutor + .executePrivilegedOperation(anyList(), any(PrivilegedOperation.class), + any(File.class), anyMap(), anyBoolean(), anyBoolean())).thenReturn( + DockerCommandExecutor.DockerContainerStatus.RUNNING.getName() + + ",SIGQUIT"); + + List dockerCommands = getDockerCommandsForDockerStop( + ContainerExecutor.Signal.TERM); + verifyStopCommand(dockerCommands, "SIGQUIT"); } @Test @@ -1714,13 +1719,7 @@ public void testDockerStopOnTermSignalWhenRunningPrivileged() DockerCommandExecutor.DockerContainerStatus.RUNNING.getName()); List dockerCommands = getDockerCommandsForDockerStop( ContainerExecutor.Signal.TERM); - Assert.assertEquals(4, dockerCommands.size()); - Assert.assertEquals("[docker-command-execution]", dockerCommands.get(0)); - Assert.assertEquals(" docker-command=stop", dockerCommands.get(1)); - Assert.assertEquals( - " name=container_e11_1518975676334_14532816_01_000001", - dockerCommands.get(2)); - Assert.assertEquals(" time=10", dockerCommands.get(3)); + verifyStopCommand(dockerCommands, ContainerExecutor.Signal.TERM.toString()); } @Test @@ -2351,4 +2350,14 @@ public void testDockerContainerRelaunch() " name=container_e11_1518975676334_14532816_01_000001", dockerCommands.get(counter)); } + + private static void verifyStopCommand(List dockerCommands, + String signal) { + Assert.assertEquals(4, dockerCommands.size()); + Assert.assertEquals("[docker-command-execution]", dockerCommands.get(0)); + Assert.assertEquals(" docker-command=kill", dockerCommands.get(1)); + Assert.assertEquals(" name=container_e11_1518975676334_14532816_01_000001", + dockerCommands.get(2)); + Assert.assertEquals(" signal=" + signal, dockerCommands.get(3)); + } }