diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 2f25284..e1ac097 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1665,6 +1665,9 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NVIDIA_DOCKER_V1 = "nvidia-docker-v1"; @Private + public static final String NVIDIA_DOCKER_V2 = "nvidia-docker-v2"; + + @Private public static final String DEFAULT_NM_GPU_DOCKER_PLUGIN_IMPL = NVIDIA_DOCKER_V1; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java index aac8224..a3902cd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java @@ -159,6 +159,11 @@ public DockerRunCommand disableDetach() { return this; } + public DockerRunCommand addRuntime(String runtime) { + super.addCommandArguments("runtime", runtime); + return this; + } + public DockerRunCommand groupAdd(String[] groups) { super.addCommandArguments("group-add", String.join(",", groups)); return this; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java index db4589a..051afd6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java @@ -34,6 +34,10 @@ public static DockerCommandPlugin createGpuDockerCommandPlugin( if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V1)) { return new NvidiaDockerV1CommandPlugin(conf); } + // nvidia-docker2 + if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V2)) { + return new NvidiaDockerV2CommandPlugin(); + } throw new YarnException( "Unkown implementation name for Gpu docker plugin, impl=" + impl); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java index e69de29..ad23c63 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerVolumeCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; + +import java.io.Serializable; +import java.util.*; + +/** + * Implementation to use nvidia-docker v2 as GPU docker command plugin. + */ +public class NvidiaDockerV2CommandPlugin implements DockerCommandPlugin { + final static Log LOG = LogFactory.getLog(NvidiaDockerV2CommandPlugin.class); + + private String NVIDIA_RUNTIME = "nvidia"; + private String NVIDIA_VISIBLE_DEVICES = "NVIDIA_VISIBLE_DEVICES"; + + public NvidiaDockerV2CommandPlugin() {} + + private Set getAssignedGpus(Container container) { + ResourceMappings resourceMappings = container.getResourceMappings(); + + // Copy of assigned Resources + Set assignedResources = null; + if (resourceMappings != null) { + assignedResources = new HashSet<>(); + for (Serializable s : resourceMappings.getAssignedResources( + ResourceInformation.GPU_URI)) { + assignedResources.add((GpuDevice) s); + } + } + if (assignedResources == null || assignedResources.isEmpty()) { + // When no GPU resource assigned, don't need to update docker command. + return Collections.emptySet(); + } + return assignedResources; + } + + @VisibleForTesting + protected boolean requestsGpu(Container container) { + return GpuResourceAllocator.getRequestedGpus(container.getResource()) > 0; + } + + @Override + public synchronized void updateDockerRunCommand( + DockerRunCommand dockerRunCommand, Container container) + throws ContainerExecutionException { + if (!requestsGpu(container)) { + return; + } + Set assignedResources = getAssignedGpus(container); + if (assignedResources == null || assignedResources.isEmpty()) { + return; + } + Map environment = new HashMap<>(); + String gpuIndexList = ""; + for (GpuDevice gpuDevice : assignedResources) { + gpuIndexList = gpuIndexList + gpuDevice.getIndex() + ","; + LOG.info("nvidia docker2 assigned gpu index: " + gpuDevice.getIndex()); + } + dockerRunCommand.addRuntime(NVIDIA_RUNTIME); + environment.put(NVIDIA_VISIBLE_DEVICES, gpuIndexList.substring(0, gpuIndexList.length() - 1)); + dockerRunCommand.addEnv(environment); + } + + @Override + public DockerVolumeCommand getCreateDockerVolumeCommand(Container container) + throws ContainerExecutionException { + // No Volume needed for nvidia-docker2. + return null; + } + + @Override + public DockerVolumeCommand getCleanupDockerVolumesCommand(Container container) + throws ContainerExecutionException { + // No cleanup needed. + return null; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c index c0aa80b..6800505 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c @@ -893,6 +893,10 @@ static int set_hostname(const struct configuration *command_config, args *args) return add_param_to_command(command_config, "hostname", "--hostname=", 1, args); } +static int set_runtime(const struct configuration *command_config, args *args) { + return add_param_to_command(command_config, "runtime", "--runtime=", 1, args); +} + static int set_group_add(const struct configuration *command_config, args *args) { int i = 0, ret = 0; char **group_add = get_configuration_values_delimiter("group-add", DOCKER_COMMAND_FILE_SECTION, command_config, ","); @@ -1580,6 +1584,11 @@ int get_docker_run_command(const char *command_file, const struct configuration goto free_and_exit; } + ret = set_runtime(&command_config, args); + if (ret != 0) { + goto free_and_exit; + } + ret = set_hostname(&command_config, args); if (ret != 0) { goto free_and_exit; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc index 2817e0c..71bc2d1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc @@ -432,6 +432,16 @@ namespace ContainerExecutor { run_docker_run_helper_function(file_cmd_vec, set_hostname); } + TEST_F(TestDockerUtil, test_set_runtime) { + std::vector > file_cmd_vec; + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run\n runtime=nvidia", "--runtime=nvidia")); + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run", "")); + + run_docker_run_helper_function(file_cmd_vec, set_runtime); + } + TEST_F(TestDockerUtil, test_set_group_add) { std::vector > file_cmd_vec; file_cmd_vec.push_back(std::make_pair( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java index 8dc37d4..23483d3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java @@ -59,6 +59,7 @@ public void testCommandArguments() { dockerRunCommand.setOverrideCommandWithArgs(commands); dockerRunCommand.removeContainerOnExit(); dockerRunCommand.addTmpfsMount("/run"); + dockerRunCommand.addRuntime("nvidia"); assertEquals("run", StringUtils.join(",", dockerRunCommand.getDockerCommandWithArguments() @@ -79,7 +80,9 @@ public void testCommandArguments() { .get("launch-command"))); assertEquals("/run", StringUtils.join(",", dockerRunCommand.getDockerCommandWithArguments().get("tmpfs"))); - assertEquals(8, dockerRunCommand.getDockerCommandWithArguments().size()); + assertEquals("nvidia", StringUtils.join(",", + dockerRunCommand.getDockerCommandWithArguments().get("runtime"))); + assertEquals(9, dockerRunCommand.getDockerCommandWithArguments().size()); } @Test diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java index e69de29..7183e52 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java @@ -0,0 +1,127 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestNvidiaDockerV2CommandPlugin { + private Map> copyCommandLine( + Map> map) { + Map> ret = new HashMap<>(); + for (Map.Entry> entry : map.entrySet()) { + ret.put(entry.getKey(), new ArrayList<>(entry.getValue())); + } + return ret; + } + + private boolean commandlinesEquals(Map> cli1, + Map> cli2) { + if (!Sets.symmetricDifference(cli1.keySet(), cli2.keySet()).isEmpty()) { + return false; + } + + for (String key : cli1.keySet()) { + List value1 = cli1.get(key); + List value2 = cli2.get(key); + if (!value1.equals(value2)) { + return false; + } + } + + return true; + } + + static class MyNvidiaDockerV2CommandPlugin + extends NvidiaDockerV2CommandPlugin { + private boolean requestsGpu = false; + + public MyNvidiaDockerV2CommandPlugin() { + } + + public void setRequestsGpu(boolean r) { + requestsGpu = r; + } + + @Override + protected boolean requestsGpu(Container container) { + return requestsGpu; + } + } + + @Test + public void testPlugin() throws Exception { + DockerRunCommand runCommand = new DockerRunCommand("container_1", "user", + "fakeimage"); + + Map> originalCommandline = copyCommandLine( + runCommand.getDockerCommandWithArguments()); + + MyNvidiaDockerV2CommandPlugin + commandPlugin = new MyNvidiaDockerV2CommandPlugin(); + + Container nmContainer = mock(Container.class); + + // getResourceMapping is null, so commandline won't be updated + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // no GPU resource assigned, so commandline won't be updated + ResourceMappings resourceMappings = new ResourceMappings(); + when(nmContainer.getResourceMappings()).thenReturn(resourceMappings); + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // Assign GPU resource + ResourceMappings.AssignedResources assigned = + new ResourceMappings.AssignedResources(); + assigned.updateAssignedResources( + ImmutableList.of(new GpuDevice(0, 0), new GpuDevice(1, 1))); + resourceMappings.addAssignedResources(ResourceInformation.GPU_URI, + assigned); + + commandPlugin.setRequestsGpu(true); + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Map> newCommandLine = + runCommand.getDockerCommandWithArguments(); + + // Command line will be updated + Assert.assertFalse(commandlinesEquals(originalCommandline, newCommandLine)); + // NVIDIA_VISIBLE_DEVICES will be set + Assert.assertTrue(runCommand.getEnv().get("NVIDIA_VISIBLE_DEVICES").equals("0,1")); + // runtime should exist + Assert.assertTrue(newCommandLine.containsKey("runtime")); + } +}