diff --git a/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg b/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg index 023654b7dba..7a84d7648f7 100644 --- a/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg +++ b/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg @@ -14,3 +14,4 @@ feature.tc.enabled=0 # docker.allowed.ro-mounts=## comma seperated volumes that can be mounted as read-only # docker.allowed.rw-mounts=## comma seperate volumes that can be mounted as read-write, add the yarn local and log dirs to this list to run Hadoop jobs # docker.privileged-containers.enabled=0 +# docker.allowed.volume-drivers=## comma seperated list of allowed volume-drivers diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 37934691424..fd4abdfeb51 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1404,6 +1404,17 @@ public static boolean isAclEnabled(Configuration conf) { @Private public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = ""; + /** + * This setting controls end point of nvidia-docker-plugin + */ + @Private + public static final String NVIDIA_DOCKER_PLUGIN_ENDPOINT = + NM_GPU_RESOURCE_PREFIX + "nvidia-docker-plugin-endpoint"; + + @Private + public static final String DEFAULT_NVIDIA_DOCKER_PLUGIN_ENDPOINT = + "http://localhost:3476/v1.0/docker/cli"; + /** NM Webapp address.**/ public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 865d18df009..5559144b029 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -20,7 +20,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java index 20133062c4e..c70df602df8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java @@ -21,6 +21,9 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime; import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -172,6 +175,7 @@ "YARN_CONTAINER_RUNTIME_DOCKER_LOCAL_RESOURCE_MOUNTS"; private Configuration conf; + private Context nmContext; private DockerClient dockerClient; private PrivilegedOperationExecutor privilegedOperationExecutor; private Set allowedNetworks = new HashSet<>(); @@ -242,6 +246,7 @@ public DockerLinuxContainerRuntime(PrivilegedOperationExecutor public void initialize(Configuration conf) throws ContainerExecutionException { this.conf = conf; + this.nmContext = nmContext; dockerClient = new DockerClient(conf); allowedNetworks.clear(); allowedNetworks.addAll(Arrays.asList( @@ -623,6 +628,19 @@ public void launchContainer(ContainerRuntimeContext ctx) runCommand.groupAdd(groups); } + // use plugins to update docker run command. + if (nmContext != null + && nmContext.getResourcePluginManager().getNameToPlugins() != null) { + for (ResourcePlugin plugin : nmContext.getResourcePluginManager() + .getNameToPlugins().values()) { + DockerCommandPlugin dockerCommandPlugin = + plugin.getDockerCommandPluginInstance(); + if (dockerCommandPlugin != null) { + dockerCommandPlugin.updateDockerRunCommand(runCommand, container); + } + } + } + String commandFile = dockerClient.writeCommandToTempFile(runCommand, containerIdStr); PrivilegedOperation launchOp = buildLaunchOp(ctx, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java index c7bf827f545..8734ba6b8de 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java @@ -76,6 +76,11 @@ public DockerRunCommand addReadOnlyMountLocation(String sourcePath, String return this; } + public DockerRunCommand setVolumeDriver(String volumeDriver) { + super.addCommandArguments("volume-driver", volumeDriver); + return this; + } + public DockerRunCommand setCGroupParent(String parentPath) { super.addCommandArguments("cgroup-parent", parentPath); return this; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/DockerCommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/DockerCommandPlugin.java new file mode 100644 index 00000000000..7662a4cc9cf --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/DockerCommandPlugin.java @@ -0,0 +1,22 @@ +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; + +/** + * Interface to make different resource plugins (e.g. GPU) can update docker run + * command without adding logic to Docker runtime. + */ +public interface DockerCommandPlugin { + /** + * Update docker run command + * @param dockerRunCommand docker run command + * @param container NM container + * @throws ContainerExecutionException if any issue occurs + */ + void updateDockerRunCommand(DockerRunCommand dockerRunCommand, + Container container) throws ContainerExecutionException; + + // Add support to other docker command when required. +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java index 46b910c4d16..c8949bf9998 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java @@ -24,6 +24,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DockerLinuxContainerRuntime; /** * {@link ResourcePlugin} is an interface for node manager to easier support @@ -80,4 +81,14 @@ ResourceHandler createResourceHandler(Context nmContext, * @throws YarnException */ void cleanup() throws YarnException; + + /** + * Plugin need to get {@link DockerCommandPlugin}. This will be invoked by + * {@link DockerLinuxContainerRuntime} when execute docker commands such as + * run/stop/pull, etc. + * + * @return DockerCommandPlugin instance. return null if plugin doesn't + * have requirement to update docker command. + */ + DockerCommandPlugin getDockerCommandPluginInstance(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPlugin.java new file mode 100644 index 00000000000..cb0051b16d6 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPlugin.java @@ -0,0 +1,204 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; + +import java.io.Serializable; +import java.io.StringWriter; +import java.net.URL; +import java.net.URLConnection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class GpuDockerCommandPlugin implements DockerCommandPlugin { + final static Log LOG = LogFactory + .getLog(GpuDockerCommandPlugin.class); + + private Configuration conf; + private Map> additionalCommands = null; + + // Known option + private String DEVICE_OPTION = "--device"; + private String VOLUME_DRIVER_OPTION = "--volume-driver"; + private String MOUNT_RO_OPTION = "--volume"; + + public GpuDockerCommandPlugin(Configuration conf) { + this.conf = conf; + } + + // Get value from key=value + // Throw exception if '=' not found + private String getValue(String input) throws IllegalArgumentException { + int index = input.indexOf('='); + if (index < 0) { + throw new IllegalArgumentException( + "Failed to locate '=' from input=" + input); + } + return input.substring(index + 1); + } + + private void addToCommand(String key, String value) { + if (additionalCommands == null) { + additionalCommands = new HashMap<>(); + } + if (!additionalCommands.containsKey(key)) { + additionalCommands.put(key, new HashSet<>()); + } + additionalCommands.get(key).add(value); + } + + private void init() throws ContainerExecutionException { + String endpoint = conf.get(YarnConfiguration.NVIDIA_DOCKER_PLUGIN_ENDPOINT, + YarnConfiguration.DEFAULT_NVIDIA_DOCKER_PLUGIN_ENDPOINT); + if (null == endpoint || endpoint.isEmpty()) { + LOG.info(YarnConfiguration.NVIDIA_DOCKER_PLUGIN_ENDPOINT + + " set to empty, skip init .."); + return; + } else { + String cliOptions; + try { + // Talk to plugin server and get options + URL url = new URL(endpoint); + URLConnection uc = url.openConnection(); + uc.setRequestProperty("X-Requested-With", "Curl"); + + StringWriter writer = new StringWriter(); + IOUtils.copy(uc.getInputStream(), writer, "utf-8"); + cliOptions = writer.toString(); + + LOG.info("Additional docker CLI options from plugin to run GPU " + + "containers:" + cliOptions); + + // Parse cli options + // Examples like: + // --device=/dev/nvidiactl --device=/dev/nvidia-uvm --device=/dev/nvidia0 + // --volume-driver=nvidia-docker + // --volume=nvidia_driver_352.68:/usr/local/nvidia:ro + + for (String str : cliOptions.split(" ")) { + str = str.trim(); + if (str.startsWith(DEVICE_OPTION)) { + addToCommand(DEVICE_OPTION, getValue(str)); + } else if (str.startsWith(VOLUME_DRIVER_OPTION)) { + addToCommand(VOLUME_DRIVER_OPTION, getValue(str)); + } else if (str.startsWith(MOUNT_RO_OPTION)) { + String mount = getValue(str); + if (!mount.endsWith(":ro")) { + throw new IllegalArgumentException( + "Should not have mount other than ro, command=" + str); + } + addToCommand(MOUNT_RO_OPTION, + mount.substring(0, mount.lastIndexOf(':'))); + } else { + throw new IllegalArgumentException("Unsupported option:" + str); + } + } + + // Verify options + for (Map.Entry> option : additionalCommands + .entrySet()) { + String key = option.getKey(); + Set values = option.getValue(); + if (key.equals(DEVICE_OPTION)) { + continue; + } else if (key.equals(MOUNT_RO_OPTION)) { + for (String value : values) { + int idx = value.indexOf(':'); + value.substring(0, idx); + value.substring(idx + 1); + continue; + } + } else if (key.equals(VOLUME_DRIVER_OPTION)) { + if (values.size() > 1) { + throw new ContainerExecutionException( + "Only support at most one " + key); + } + } else { + throw new ContainerExecutionException("Unsupported option:" + key); + } + } + } catch (Exception e) { + LOG.warn("Exception of " + this.getClass().getSimpleName() + " init:", + e); + throw new ContainerExecutionException(e); + } + } + } + + @Override + public synchronized void updateDockerRunCommand( + DockerRunCommand dockerRunCommand, Container container) + throws ContainerExecutionException { + ResourceMappings resourceMappings = container.getResourceMappings(); + + List assignedResources = null; + if (resourceMappings != null) { + assignedResources = + resourceMappings.getAssignedResources(ResourceInformation.GPU_URI); + } + + if (assignedResources == null || assignedResources.isEmpty()) { + // When no GPU resource assigned, don't need to update docker command. + return; + } + + if (additionalCommands == null) { + init(); + } + + // Write to dockerRunCommand + for (Map.Entry> option : additionalCommands + .entrySet()) { + String key = option.getKey(); + Set values = option.getValue(); + if (key.equals(DEVICE_OPTION)) { + for (String value : values) { + dockerRunCommand.addDevice(value, value); + } + } else if (key.equals(MOUNT_RO_OPTION)) { + for (String value : values) { + int idx = value.indexOf(':'); + String source = value.substring(0, idx); + String target = value.substring(idx + 1); + dockerRunCommand.addReadOnlyMountLocation(source, target, true); + } + } else if (key.equals(VOLUME_DRIVER_OPTION)) { + for (String value : values) { + dockerRunCommand.setVolumeDriver(value); + } + } else { + throw new ContainerExecutionException("Unsupported option:" + key); + } + } + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index 9576ce7fec2..1fb35db67b7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -24,17 +24,20 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; public class GpuResourcePlugin implements ResourcePlugin { private ResourceHandler gpuResourceHandler = null; private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null; + private GpuDockerCommandPlugin dockerCommandPlugin = null; @Override public synchronized void initialize(Context context) throws YarnException { resourceDiscoverHandler = new GpuNodeResourceUpdateHandler(); GpuDiscoverer.getInstance().initialize(context.getConf()); + dockerCommandPlugin = new GpuDockerCommandPlugin(context.getConf()); } @Override @@ -58,4 +61,8 @@ public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() { public void cleanup() throws YarnException { // Do nothing. } + + public DockerCommandPlugin getDockerCommandPluginInstance() { + return dockerCommandPlugin; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c index 860320d907d..b76e45f22ac 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c @@ -206,6 +206,8 @@ const char *get_docker_error_message(const int error_code) { return "Mount access error"; case INVALID_DOCKER_DEVICE: return "Invalid docker device"; + case INVALID_DOCKER_VOLUME_DRIVER: + return "Invalid docker volume-driver"; default: return "Unknown error"; } @@ -532,6 +534,24 @@ static int set_hostname(const struct configuration *command_config, char *out, c return add_param_to_command(command_config, "hostname", "--hostname=", 1, out, outlen); } +static int set_volume_driver(const struct configuration *command_config, + const struct configuration *conf, + char *out, const size_t outlen) { + int ret = 0; + ret = add_param_to_command_if_allowed(command_config, conf, "volume-driver", + "docker.allowed.volume-drivers", + "--volume-driver=", + 0, 0, out, outlen); + if (ret != 0) { + fprintf(ERRORFILE, + "Could not find requested volume-driver in allowed volume-drivers\n"); + ret = INVALID_DOCKER_VOLUME_DRIVER; + memset(out, 0, outlen); + } + + return ret; +} + static int set_group_add(const struct configuration *command_config, char *out, const size_t outlen) { int i = 0, ret = 0; char **group_add = get_configuration_values_delimiter("group-add", DOCKER_COMMAND_FILE_SECTION, command_config, ","); @@ -929,6 +949,11 @@ int get_docker_run_command(const char *command_file, const struct configuration return ret; } + ret = set_volume_driver(&command_config, conf, out, outlen); + if (ret != 0) { + return ret; + } + ret = add_ro_mounts(&command_config, conf, out, outlen); if (ret != 0) { return ret; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h index 37ec88077c5..3f3b0fe07c6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h @@ -49,7 +49,8 @@ enum docker_error_codes { INVALID_DOCKER_RW_MOUNT, MOUNT_ACCESS_ERROR, INVALID_DOCKER_DEVICE, - INVALID_DOCKER_STOP_COMMAND + INVALID_DOCKER_STOP_COMMAND, + INVALID_DOCKER_VOLUME_DRIVER }; /** diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc index c627ca84e4f..cb6e42f641f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc @@ -1119,4 +1119,58 @@ namespace ContainerExecutor { } } + TEST_F(TestDockerUtil, test_set_volume_driver) { + struct configuration container_cfg; + const int buff_len = 1024; + char buff[buff_len]; + int ret = 0; + std::string container_executor_cfg_contents = "[docker]\n docker.allowed.volume-drivers=driver-1,driver-2"; + std::vector > file_cmd_vec; + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run\n volume-driver=driver-1", "--volume-driver='driver-1' ")); + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run\n volume-driver=driver-2", "--volume-driver='driver-2' ")); + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run", "")); + write_container_executor_cfg(container_executor_cfg_contents); + ret = read_config(container_executor_cfg_file.c_str(), &container_cfg); + + std::vector >::const_iterator itr; + if (ret != 0) { + FAIL(); + } + for (itr = file_cmd_vec.begin(); itr != file_cmd_vec.end(); ++itr) { + struct configuration cmd_cfg; + memset(buff, 0, buff_len); + write_command_file(itr->first); + ret = read_config(docker_command_file.c_str(), &cmd_cfg); + if (ret != 0) { + FAIL(); + } + ret = set_volume_driver(&cmd_cfg, &container_cfg, buff, buff_len); + ASSERT_EQ(0, ret); + ASSERT_STREQ(itr->second.c_str(), buff); + } + struct configuration cmd_cfg_1; + write_command_file("[docker-command-execution]\n docker-command=run\n volume-driver=driver-3"); + ret = read_config(docker_command_file.c_str(), &cmd_cfg_1); + if (ret != 0) { + FAIL(); + } + strcpy(buff, "test string"); + ret = set_volume_driver(&cmd_cfg_1, &container_cfg, buff, buff_len); + ASSERT_EQ(INVALID_DOCKER_VOLUME_DRIVER, ret); + ASSERT_EQ(0, strlen(buff)); + + container_executor_cfg_contents = "[docker]\n"; + write_container_executor_cfg(container_executor_cfg_contents); + ret = read_config(container_executor_cfg_file.c_str(), &container_cfg); + if (ret != 0) { + FAIL(); + } + strcpy(buff, "test string"); + ret = set_volume_driver(&cmd_cfg_1, &container_cfg, buff, buff_len); + ASSERT_EQ(INVALID_DOCKER_VOLUME_DRIVER, ret); + ASSERT_EQ(0, strlen(buff)); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestDockerContainerRuntime.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestDockerContainerRuntime.java index fbfee545f5d..d96e5b1d892 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestDockerContainerRuntime.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestDockerContainerRuntime.java @@ -29,6 +29,7 @@ import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; +import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; @@ -36,6 +37,9 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerModule; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeContext; @@ -1148,4 +1152,77 @@ public void testDockerHostnamePattern() throws Exception { } } } + + @Test + public void testDockerCommandPlugin() throws Exception { + DockerLinuxContainerRuntime runtime = + new DockerLinuxContainerRuntime(mockExecutor, mockCGroupsHandler); + + Context nmContext = mock(Context.class); + ResourcePluginManager rpm = mock(ResourcePluginManager.class); + Map pluginsMap = new HashMap<>(); + ResourcePlugin plugin1 = mock(ResourcePlugin.class); + + // Create the docker command plugin logic, which will set volume driver + DockerCommandPlugin dockerCommandPlugin = new DockerCommandPlugin() { + @Override + public void updateDockerRunCommand(DockerRunCommand dockerRunCommand, + Container container) throws ContainerExecutionException { + dockerRunCommand.setVolumeDriver("driver-1"); + } + }; + + when(plugin1.getDockerCommandPluginInstance()).thenReturn( + dockerCommandPlugin); + ResourcePlugin plugin2 = mock(ResourcePlugin.class); + pluginsMap.put("plugin1", plugin1); + pluginsMap.put("plugin2", plugin2); + + when(rpm.getNameToPlugins()).thenReturn(pluginsMap); + + when(nmContext.getResourcePluginManager()).thenReturn(rpm); + + runtime.initialize(conf); + + runtime.launchContainer(builder.build()); + PrivilegedOperation op = capturePrivilegedOperationAndVerifyArgs(); + List args = op.getArguments(); + String dockerCommandFile = args.get(11); + + List dockerCommands = Files.readAllLines(Paths.get + (dockerCommandFile), Charset.forName("UTF-8")); + + int expected = 14; + int counter = 0; + Assert.assertEquals(expected, dockerCommands.size()); + Assert.assertEquals("[docker-command-execution]", + dockerCommands.get(counter++)); + Assert.assertEquals(" cap-add=SYS_CHROOT,NET_BIND_SERVICE", + dockerCommands.get(counter++)); + Assert.assertEquals(" cap-drop=ALL", dockerCommands.get(counter++)); + Assert.assertEquals(" detach=true", dockerCommands.get(counter++)); + Assert.assertEquals(" docker-command=run", dockerCommands.get(counter++)); + Assert.assertEquals(" hostname=ctr-id", dockerCommands.get(counter++)); + Assert + .assertEquals(" image=busybox:latest", dockerCommands.get(counter++)); + Assert.assertEquals( + " launch-command=bash,/test_container_work_dir/launch_container.sh", + dockerCommands.get(counter++)); + Assert.assertEquals(" name=container_id", dockerCommands.get(counter++)); + Assert.assertEquals(" net=host", dockerCommands.get(counter++)); + Assert.assertEquals( + " rw-mounts=/test_container_local_dir:/test_container_local_dir," + + "/test_filecache_dir:/test_filecache_dir," + + "/test_container_work_dir:/test_container_work_dir," + + "/test_container_log_dir:/test_container_log_dir," + + "/test_user_local_dir:/test_user_local_dir", + dockerCommands.get(counter++)); + Assert.assertEquals(" user=run_as_user", dockerCommands.get(counter++)); + + // Verify volume-driver is set to expected value. + Assert.assertEquals(" volume-driver=driver-1", + dockerCommands.get(counter++)); + Assert.assertEquals(" workdir=/test_container_work_dir", + dockerCommands.get(counter++)); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestJavaSandboxLinuxContainerRuntime.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestJavaSandboxLinuxContainerRuntime.java index bdd435eb2ae..24c8f1327dc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestJavaSandboxLinuxContainerRuntime.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/TestJavaSandboxLinuxContainerRuntime.java @@ -55,7 +55,6 @@ import java.util.regex.Pattern; import static org.apache.hadoop.yarn.api.ApplicationConstants.Environment.JAVA_HOME; -import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.JavaSandboxLinuxContainerRuntime.NMContainerPolicyUtils.LOG; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.JavaSandboxLinuxContainerRuntime.NMContainerPolicyUtils.MULTI_COMMAND_REGEX; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.JavaSandboxLinuxContainerRuntime.NMContainerPolicyUtils.CLEAN_CMD_REGEX; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.JavaSandboxLinuxContainerRuntime.NMContainerPolicyUtils.CONTAINS_JAVA_CMD; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDockerCommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDockerCommandPlugin.java new file mode 100644 index 00000000000..18133ed4245 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDockerCommandPlugin.java @@ -0,0 +1,172 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import com.sun.net.httpserver.HttpExchange; +import com.sun.net.httpserver.HttpHandler; +import com.sun.net.httpserver.HttpServer; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestGpuDockerCommandPlugin { + private Map> copyCommandLine( + Map> map) { + Map> ret = new HashMap<>(); + for (Map.Entry> entry : map.entrySet()) { + ret.put(entry.getKey(), new ArrayList<>(entry.getValue())); + } + return ret; + } + + private boolean commandlinesEquals(Map> cli1, + Map> cli2) { + if (!Sets.symmetricDifference(cli1.keySet(), cli2.keySet()).isEmpty()) { + return false; + } + + for (String key : cli1.keySet()) { + List value1 = cli1.get(key); + List value2 = cli2.get(key); + if (!value1.equals(value2)) { + return false; + } + } + + return true; + } + + static class MyHandler implements HttpHandler { + String response = "This is the response"; + + @Override + public void handle(HttpExchange t) throws IOException { + t.sendResponseHeaders(200, response.length()); + OutputStream os = t.getResponseBody(); + os.write(response.getBytes()); + os.close(); + } + } + + @Test + public void testPlugin() throws Exception { + Configuration conf = new Configuration(); + + DockerRunCommand runCommand = new DockerRunCommand("container_1", "user", + "fakeimage"); + + Map> originalCommandline = copyCommandLine( + runCommand.getDockerCommandWithArguments()); + + GpuDockerCommandPlugin commandPlugin = new GpuDockerCommandPlugin(conf); + + Container nmContainer = mock(Container.class); + + // getResourceMapping is null, so commandline won't be updated + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // no GPU resource assigned, so commandline won't be updated + ResourceMappings resourceMappings = new ResourceMappings(); + when(nmContainer.getResourceMappings()).thenReturn(resourceMappings); + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // Assign GPU resource, init will be invoked + ResourceMappings.AssignedResources assigned = + new ResourceMappings.AssignedResources(); + assigned.updateAssignedResources(ImmutableList.of("gpu1", "gpu2")); + resourceMappings.addAssignedResources(ResourceInformation.GPU_URI, assigned); + + // Since there's no HTTP server running, so we will see exception + boolean caughtException = false; + try { + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + } catch (ContainerExecutionException e) { + caughtException = true; + } + Assert.assertTrue(caughtException); + + // Start HTTP server + MyHandler handler = new MyHandler(); + HttpServer server = HttpServer.create(new InetSocketAddress(60111), 0); + server.createContext("/test", handler); + server.start(); + + String hostName = server.getAddress().getHostName(); + int port = server.getAddress().getPort(); + String httpUrl = "http://" + hostName + ":" + port + "/test"; + + conf.set(YarnConfiguration.NVIDIA_DOCKER_PLUGIN_ENDPOINT, httpUrl); + + commandPlugin = new GpuDockerCommandPlugin(conf); + + // Start use invalid options + handler.response = "INVALID_RESPONSE"; + try { + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + } catch (ContainerExecutionException e) { + caughtException = true; + } + Assert.assertTrue(caughtException); + + // Start use invalid options + handler.response = "INVALID_RESPONSE"; + try { + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + } catch (ContainerExecutionException e) { + caughtException = true; + } + Assert.assertTrue(caughtException); + + handler.response = "--device=/dev/nvidiactl --device=/dev/nvidia-uvm " + + "--device=/dev/nvidia0 --volume-driver=nvidia-docker " + + "--volume=nvidia_driver_352.68:/usr/local/nvidia:ro"; + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Map> newCommandLine = + runCommand.getDockerCommandWithArguments(); + + // Command line will be updated + Assert.assertFalse(commandlinesEquals(originalCommandline, newCommandLine)); + Assert.assertTrue(newCommandLine.containsKey("devices")); + Assert.assertTrue(newCommandLine.containsKey("volume-driver")); + Assert.assertTrue(newCommandLine.containsKey("ro-mounts")); + } +}