commit 0a97dcaab8097f37156e09413a7e88c6522357d4
Author: Wangda Tan
Date:   Tue Jun 27 16:06:09 2017 -0700

    added gtest for cgroups, added mock func for gpu module, pending:
    1. put all configurations to section (July 14 001)

    Change-Id: Ie2356b23c01f41a83e7ef56f083736caf7180bd5
    (cherry picked from commit c965fc63e2c9c76da84c00fd3bbba44648bba1e1)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index fa4d2e3d321..7ac7da5856e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1251,6 +1251,31 @@ public static boolean isAclEnabled(Configuration conf) {
   public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT =
       NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit";
 
+  /**
+   * Prefix for GPU configurations. Work in progress: this configuration
+   * parameter may be changed/removed in the future.
+   */
+  @Private
+  public static final String NM_GPU_RESOURCE_PREFIX = NM_PREFIX +
+      "resource.gpu.";
+  /**
+   * This setting controls whether resource handling for GPU operations is
+   * enabled.
+   */
+  @Private
+  public static final String NM_GPU_RESOURCE_ENABLED =
+      NM_GPU_RESOURCE_PREFIX + "enabled";
+
+  @Private
+  public static final String NM_GPU_ALLOWED_DEVICES =
+      NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices";
+
+  /**
+   * GPU as a resource is disabled by default.
+   **/
+  @Private
+  public static final boolean DEFAULT_NM_GPU_RESOURCE_ENABLED = false;
+
   /** NM Webapp address.**/
   public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address";
   public static final int DEFAULT_NM_WEBAPP_PORT = 8042;

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml
index 2f790ec4bbe..ec77386dcea 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml
@@ -29,7 +29,7 @@
     <yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
-    <container-executor.conf.dir>../etc/hadoop</container-executor.conf.dir>
+    <container-executor.conf.dir>/etc/hadoop/conf</container-executor.conf.dir>

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/CMakeLists.txt b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/CMakeLists.txt
index 09f60de9897..a0b9982137f 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/CMakeLists.txt
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/CMakeLists.txt
@@ -100,6 +100,11 @@ add_library(container
     main/native/container-executor/impl/configuration.c
     main/native/container-executor/impl/container-executor.c
     main/native/container-executor/impl/get_executable.c
+    main/native/container-executor/impl/utils/string-utils.c
+    main/native/container-executor/impl/utils/command-line-parser.c
+    main/native/container-executor/impl/modules/cgroups/cgroups-operations.c
+    main/native/container-executor/impl/modules/common/module-configs.c
+    main/native/container-executor/impl/modules/gpu/gpu-module.c
 )
 add_executable(container-executor
@@ -112,12 +117,14 @@ target_link_libraries(container-executor
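
The three constants above expand to yarn.nodemanager.resource.gpu.enabled and yarn.nodemanager.resource.gpu.allowed-gpu-devices (NM_PREFIX is "yarn.nodemanager."). A minimal sketch of how a test or an NM setup might exercise the new keys, assuming nothing beyond the constants added in this hunk:

    Configuration conf = new YarnConfiguration();
    // Defaults to false (DEFAULT_NM_GPU_RESOURCE_ENABLED).
    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
    // Comma-separated GPU minor numbers the NM is allowed to hand out.
    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,3");
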
output_directory(container-executor target/usr/local/bin) +# Test cases add_executable(test-container-executor main/native/container-executor/test/test-container-executor.c ) target_link_libraries(test-container-executor container ${EXTRA_LIBS} ) + output_directory(test-container-executor target/usr/local/bin) # unit tests for container executor @@ -125,6 +132,8 @@ add_executable(cetest main/native/container-executor/impl/util.c main/native/container-executor/test/test_configuration.cc main/native/container-executor/test/test_main.cc + main/native/container-executor/test/modules/cgroups/test-cgroups-module.cc + main/native/container-executor/test/modules/gpu/test-gpu-module.cc main/native/container-executor/test/test_util.cc) -target_link_libraries(cetest gtest) +target_link_libraries(cetest gtest container) output_directory(cetest test) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 1cff53f2116..11d3d919189 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -63,6 +63,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.GpuResourceAllocator; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; @@ -70,6 +71,7 @@ import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.apache.hadoop.yarn.server.nodemanager.scheduler.allocators.LocalResourceAllocators; import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; @@ -406,6 +408,13 @@ protected void serviceInit(Configuration conf) throws Exception { addService(nodeStatusUpdater); ((NMContext) context).setNodeStatusUpdater(nodeStatusUpdater); + // Initialize local allocators + if (conf.getBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, + YarnConfiguration.DEFAULT_NM_GPU_RESOURCE_ENABLED)) { + LocalResourceAllocators.setGpuResourceAllocator( + new GpuResourceAllocator(context)); + } + super.serviceInit(conf); // TODO add local dirs to del } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java index bd3f06d1fcb..1370a02abcd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java @@ -94,4 +94,6 @@ void sendKillEvent(int exitStatus, String description); boolean isRecovering(); + + ResourceMappings getResourceMappings(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 055e12c57d6..534da1ee900 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -181,6 +181,7 @@ private ReInitializationContext createContextForRollback() { private boolean recoveredAsKilled = false; private Context context; private ResourceSet resourceSet; + private ResourceMappings resourceMappings; public ContainerImpl(Configuration conf, Dispatcher dispatcher, ContainerLaunchContext launchContext, Credentials creds, @@ -229,6 +230,7 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, stateMachine = stateMachineFactory.make(this); this.context = context; this.resourceSet = new ResourceSet(); + this.resourceMappings = new ResourceMappings(); } private static ContainerRetryContext configureRetryContext( @@ -276,6 +278,7 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, this.remainingRetryAttempts = rcs.getRemainingRetryAttempts(); this.workDir = rcs.getWorkDir(); this.logDir = rcs.getLogDir(); + this.resourceMappings = rcs.getResourceMappings(); } private static final ContainerDiagnosticsUpdateTransition UPDATE_DIAGNOSTICS_TRANSITION = @@ -1764,4 +1767,9 @@ public boolean isRecovering() { getContainerState() == ContainerState.NEW); return isRecovering; } + + @Override + public ResourceMappings getResourceMappings() { + return resourceMappings; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java new file mode 100644 index 00000000000..d227ae385cd --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.container;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Stores resources assigned to a single container, keyed by resource type.
+ *
+ * Each assigned resource is a list of Strings.
+ *
+ * For example, a container may be assigned:
+ * "numa": ["numa0"]
+ * "gpu": ["0", "1", "2", "3"]
+ * "fpga": ["1", "3"]
+ *
+ * This is used to recover containers across NM restarts.
+ */
+public class ResourceMappings {
+  /**
+   * Records resources assigned to a container for a given resource type.
+   */
+  public static class AssignedResources implements Serializable {
+    private List<String> list = Collections.emptyList();
+
+    public List<String> getAssignedResources() {
+      return Collections.unmodifiableList(list);
+    }
+
+    public void updateAssignedResources(List<String> list) {
+      this.list = new ArrayList<>(list);
+    }
+
+    @SuppressWarnings("unchecked")
+    public static AssignedResources fromBytes(byte[] bytes)
+        throws IOException {
+      ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
+      ObjectInputStream ois = new ObjectInputStream(bis);
+      List<String> stringArray;
+      try {
+        stringArray = (List<String>) ois.readObject();
+      } catch (ClassNotFoundException e) {
+        throw new IOException(e);
+      }
+      AssignedResources ar = new AssignedResources();
+      ar.updateAssignedResources(stringArray);
+      return ar;
+    }
+
+    public byte[] toBytes() throws IOException {
+      ByteArrayOutputStream bos = new ByteArrayOutputStream();
+      ObjectOutputStream oos = new ObjectOutputStream(bos);
+      oos.writeObject(list);
+      byte[] bytes = bos.toByteArray();
+      return bytes;
+    }
+  }
+
+  private Map<String, AssignedResources> assignedResourcesMap = new HashMap<>();
+
+  /**
+   * Get assigned resources for a given resource type.
+ * @return map of resource mapping + */ + public List getAssignedResources(String resourceType) { + AssignedResources ar = assignedResourcesMap.get(resourceType); + if (null == ar) { + return Collections.emptyList(); + } + return ar.getAssignedResources(); + } + + public void addAssignedResources(String resourceType, + AssignedResources assigned) { + assignedResourcesMap.put(resourceType, assigned); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java index 8402a16339d..7e3a53a778b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java @@ -51,6 +51,7 @@ TC_READ_STATS("--tc-read-stats"), ADD_PID_TO_CGROUP(""), //no CLI switch supported yet. RUN_DOCKER_CMD("--run-docker"), + GPU("gpu"), LIST_AS_USER(""); //no CLI switch supported yet. private final String option; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/GpuResourceAllocator.java new file mode 100644 index 00000000000..9a34f08bf47 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/GpuResourceAllocator.java @@ -0,0 +1,179 @@ +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +/** + * Allocate GPU resources according to requirements + */ +public class GpuResourceAllocator { + final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class); + private final static String RESOURCE_TYPE = "gpu"; + + private Set allowedGpuDevices = new TreeSet<>(); + private Map usedDevices = new TreeMap<>(); + private Context nmContext; + + public GpuResourceAllocator(Context ctx) { + this.nmContext = ctx; + } + + /** + * Contains allowed and denied devices with minor number. 
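
ResourceMappings and its AssignedResources inner class round-trip assignments through plain Java serialization. A short sketch of the intended flow, assuming only the API above (values made up; java.util.Arrays assumed imported):

    // Persist two GPU minor numbers, then recover them from the serialized form.
    ResourceMappings.AssignedResources ar = new ResourceMappings.AssignedResources();
    ar.updateAssignedResources(Arrays.asList("0", "1"));
    byte[] serialized = ar.toBytes();  // what the NM state store will persist
    ResourceMappings.AssignedResources restored =
        ResourceMappings.AssignedResources.fromBytes(serialized);

    ResourceMappings mappings = new ResourceMappings();
    mappings.addAssignedResources("gpu", restored);
    List<String> gpus = mappings.getAssignedResources("gpu");  // ["0", "1"]
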
+ * Denied devices will be useful for cgroups devices module to do blacklisting + */ + static class GpuAllocation { + private Set allowed = Collections.emptySet(); + private Set denied = Collections.emptySet(); + + GpuAllocation(Set allowed, Set denied) { + if (allowed != null) { + this.allowed = ImmutableSet.copyOf(allowed); + } + if (denied != null) { + this.denied = ImmutableSet.copyOf(denied); + } + } + + public Set getAllowedGPUs() { + return allowed; + } + + public Set getDeniedGPUs() { + return denied; + } + } + + /** + * Add GPU to allowed list + * @param minorNumber minor number of the GPU device. + */ + public synchronized void addGpu(int minorNumber) { + allowedGpuDevices.add(minorNumber); + } + + private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices, + ContainerId containerId) { + return "Failed to find enough GPUs, requestor=" + containerId + + ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus=" + + getAvailableGpus(); + } + + @VisibleForTesting + public synchronized int getAvailableGpus() { + return allowedGpuDevices.size() - usedDevices.size(); + } + + public synchronized void recoverAssignedGpus(ContainerId containerId) + throws ResourceHandlerException { + Container c = nmContext.getContainers().get(containerId); + + for (String dev : c.getResourceMappings().getAssignedResources( + RESOURCE_TYPE)){ + int devId = Integer.valueOf(dev); + + // Make sure it is in allowed GPU device. + if (!allowedGpuDevices.contains(devId)) { + throw new ResourceHandlerException("Try to recover device id = " + devId + + " however it is not in allowed device list:" + StringUtils + .join(",", allowedGpuDevices)); + } + + // Make sure it is not occupied by anybody else + if (usedDevices.containsKey(devId)) { + throw new ResourceHandlerException("Try to recover device id = " + devId + + " however it is already assigned to container=" + usedDevices + .get(devId) + ", please double check what happened."); + } + + usedDevices.put(devId, containerId); + } + } + + /** + * Assign GPU to requestor + * @param numRequestedGpuDevices How many GPU to request + * @param containerId who is requesting the resources + * @return List of denied Gpus with minor numbers + * @throws ResourceHandlerException When failed to + */ + public synchronized GpuAllocation assignGpus(int numRequestedGpuDevices, + ContainerId containerId) throws ResourceHandlerException { + // Assign Gpus to container if requested some. 
+ if (numRequestedGpuDevices > 0) { + if (numRequestedGpuDevices > getAvailableGpus()) { + throw new ResourceHandlerException( + getResourceHandlerExceptionMessage(numRequestedGpuDevices, + containerId)); + } + + Set assignedGpus = new HashSet<>(); + + for (int deviceNum : allowedGpuDevices) { + if (!usedDevices.containsKey(deviceNum)) { + usedDevices.put(deviceNum, containerId); + assignedGpus.add(deviceNum); + if (assignedGpus.size() == numRequestedGpuDevices) { + break; + } + } + } + + // Record in state store if we allocated anything + if (!assignedGpus.isEmpty()) { + List allocatedDevices = new ArrayList<>(); + for (int gpu : assignedGpus) { + allocatedDevices.add(String.valueOf(gpu)); + } + try { + nmContext.getNMStateStore().storeAssignedResources(containerId, + RESOURCE_TYPE, allocatedDevices); + } catch (IOException e) { + throw new ResourceHandlerException(e); + } + } + + return new GpuAllocation(assignedGpus, + Sets.difference(allowedGpuDevices, assignedGpus)); + } + return new GpuAllocation(null, allowedGpuDevices); + } + + /** + * Clean up all Gpus assigned to containerId + * @param containerId containerId + */ + public synchronized void cleanupAssignGpus(ContainerId containerId) { + Iterator> iter = + usedDevices.entrySet().iterator(); + while (iter.hasNext()) { + if (iter.next().getValue().equals(containerId)) { + iter.remove(); + } + } + } + + @VisibleForTesting + public synchronized Map getDeviceAllocationMapping() { + return new HashMap<>(usedDevices); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/GpuResourceHandlerImpl.java new file mode 100644 index 00000000000..be270b73822 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/GpuResourceHandlerImpl.java @@ -0,0 +1,175 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
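
To make the allocator's contract concrete, a hedged sketch with four allowed devices (nmContext is assumed to be a live NM Context, since assignGpus also records the assignment in its state store; GpuAllocation is package-private, so this would live in the same package):

    GpuResourceAllocator allocator = new GpuResourceAllocator(nmContext);
    for (int minor : new int[] {0, 1, 2, 3}) {
      allocator.addGpu(minor);
    }
    // Ask for 2 GPUs: the 2 leftover devices come back as "denied" so the
    // cgroups devices controller can blacklist them for this container.
    GpuResourceAllocator.GpuAllocation alloc =
        allocator.assignGpus(2, containerId);  // containerId assumed available
    // alloc.getAllowedGPUs() -> e.g. [0, 1]
    // alloc.getDeniedGPUs()  -> [2, 3]
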
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.scheduler.allocators.LocalResourceAllocators; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class GpuResourceHandlerImpl implements ResourceHandler { + final static Log LOG = LogFactory + .getLog(GpuResourceHandlerImpl.class); + + private final String REQUEST_GPU_NUM_ENV_KEY = "REQUESTED_GPU_NUM"; + + // This will be used by container-executor to add necessary clis + public static final String ALLOCATED_GPU_MINOR_NUMS_ENV_KEY = + "YARN_ALLOCATED_GPUS"; + public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus"; + public static final String CONTAINER_ID_CLI_OPTION = "--container_id"; + + private GpuResourceAllocator gpuAllocator; + private CGroupsHandler cGroupsHandler; + private PrivilegedOperationExecutor privilegedOperationExecutor; + + GpuResourceHandlerImpl(CGroupsHandler cGroupsHandler, + PrivilegedOperationExecutor privilegedOperationExecutor, + GpuResourceAllocator gpuResourceAllocator) { + this.cGroupsHandler = cGroupsHandler; + this.privilegedOperationExecutor = privilegedOperationExecutor; + gpuAllocator = gpuResourceAllocator; + } + + @Override + public List bootstrap(Configuration configuration) + throws ResourceHandlerException { + String allowedDevicesStr = configuration.get( + YarnConfiguration.NM_GPU_ALLOWED_DEVICES); + + if (null != allowedDevicesStr) { + for (String s : allowedDevicesStr.split(",")) { + if (s.trim().length() > 0) { + Integer minorNum = Integer.valueOf(s.trim()); + gpuAllocator.addGpu(minorNum); + } + } + } + LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr); + + // And initialize cgroups + this.cGroupsHandler.initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + + return null; + } + + private int getRequestedGpu(Container container) { + // TODO, use YARN-3926 after it merged + ContainerLaunchContext clc = container.getLaunchContext(); + Map envs = clc.getEnvironment(); + if (null != envs.get(REQUEST_GPU_NUM_ENV_KEY)) { + return Integer.valueOf(envs.get(REQUEST_GPU_NUM_ENV_KEY)); + } + return 0; + } + + @Override + public synchronized List preStart(Container container) + throws ResourceHandlerException { + String containerIdStr = container.getContainerId().toString(); + + int requestedGpu = getRequestedGpu(container); + + // Assign Gpus to container if requested some. 
+ GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus( + requestedGpu, container.getContainerId()); + + // Create device cgroups for the container + cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + try { + // Execute c-e to setup GPU isolation before launch the container + PrivilegedOperation privilegedOperation = new PrivilegedOperation( + PrivilegedOperation.OperationType.GPU, Arrays + .asList(CONTAINER_ID_CLI_OPTION, containerIdStr)); + if (!allocation.getDeniedGPUs().isEmpty()) { + privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION, + StringUtils.join(",", allocation.getDeniedGPUs()))); + } + + privilegedOperationExecutor.executePrivilegedOperation( + privilegedOperation, true); + } catch (PrivilegedOperationException e) { + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + LOG.warn("Could not update cgroup for container", e); + throw new ResourceHandlerException(e); + } + + // Set ALLOCATED_GPU_MINOR_NUMS_ENV_KEY to environment so later runtime + // can use it. + Map envs = container.getLaunchContext().getEnvironment(); + if (null == allocation.getAllowedGPUs() || allocation.getAllowedGPUs().isEmpty()) { + envs.put(ALLOCATED_GPU_MINOR_NUMS_ENV_KEY, ""); + } else { + envs.put(ALLOCATED_GPU_MINOR_NUMS_ENV_KEY, + StringUtils.join(",", allocation.getAllowedGPUs())); + } + + List ret = new ArrayList<>(); + ret.add(new PrivilegedOperation( + PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, + PrivilegedOperation.CGROUP_ARG_PREFIX + + cGroupsHandler.getPathForCGroupTasks( + CGroupsHandler.CGroupController.DEVICES, containerIdStr))); + + return ret; + } + + @VisibleForTesting + public GpuResourceAllocator getGpuAllocator() { + return gpuAllocator; + } + + @Override + public List reacquireContainer(ContainerId containerId) + throws ResourceHandlerException { + gpuAllocator.recoverAssignedGpus(containerId); + return null; + } + + @Override + public synchronized List postComplete( + ContainerId containerId) throws ResourceHandlerException { + gpuAllocator.cleanupAssignGpus(containerId); + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerId.toString()); + return null; + } + + @Override + public List teardown() throws ResourceHandlerException { + return null; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java index 7fc04bdb41e..ff5ac69c450 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java @@ -28,6 +28,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.scheduler.allocators.LocalResourceAllocators; import 
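
Based on the GPU operation type ("gpu") and the CLI option constants above, and given that main.c (further down) routes argv[1] == "gpu" to handle_gpu_request, the privileged call issued in preStart reaches container-executor as roughly the following invocation (container id illustrative):

    container-executor gpu --container_id container_x_y --excluded_gpus 2,3
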
org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler; import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler; @@ -87,6 +88,19 @@ public static CGroupsHandler getCGroupsHandler() { return cGroupsHandler; } + private static GpuResourceHandlerImpl getGpuResourceHandler( + Configuration conf) throws ResourceHandlerException { + boolean cgroupsGpuEnabled = conf.getBoolean( + YarnConfiguration.NM_GPU_RESOURCE_ENABLED, + YarnConfiguration.DEFAULT_NM_GPU_RESOURCE_ENABLED); + if (cgroupsGpuEnabled) { + return new GpuResourceHandlerImpl(getInitializedCGroupsHandler(conf), + PrivilegedOperationExecutor.getInstance(conf), + LocalResourceAllocators.getGpuResourceAllocator()); + } + return null; + } + private static CGroupsCpuResourceHandlerImpl getCGroupsCpuResourceHandler( Configuration conf) throws ResourceHandlerException { boolean cgroupsCpuEnabled = @@ -205,6 +219,7 @@ private static void initializeConfiguredResourceHandlerChain( addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf)); addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf)); addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf)); + addHandlerIfNotNull(handlerList, getGpuResourceHandler(conf)); resourceHandlerChain = new ResourceHandlerChain(handlerList); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java index 6b66b7b1358..b6590f822c8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java @@ -38,6 +38,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.GpuResourceHandlerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerModule; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerClient; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerInspectCommand; @@ -454,6 +455,18 @@ public void launchContainer(ContainerRuntimeContext ctx) .setCapabilities(capabilities) .addMountLocation(CGROUPS_ROOT_DIRECTORY, CGROUPS_ROOT_DIRECTORY + ":ro", false); + + // Handle GPU-related configs + String allocatedGpus = environment.get( + GpuResourceHandlerImpl.ALLOCATED_GPU_MINOR_NUMS_ENV_KEY); + if (null != allocatedGpus && !allocatedGpus.isEmpty()) { + // We allocated some GPUs, set YARN_CONTAINER_EXECUTOR_GPU_ENABLED + // to true. 
+ runCommand.addDockerRunEnvars( + "YARN_CONTAINER_EXECUTOR_GPU_ENABLED", "true"); + runCommand.addDockerRunEnvars("PATH", "$PATH:/usr/local/nvidia/bin"); + } + List allDirs = new ArrayList<>(containerLocalDirs); allDirs.addAll(filecacheDirs); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java index f79f4ed08c8..aab44de0a25 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java @@ -31,6 +31,7 @@ private static final String RUN_COMMAND = "run"; private final String image; private List overrrideCommandWithArgs; + private List additionalEnvars = new ArrayList<>(); /** The following are mandatory: */ public DockerRunCommand(String containerId, String user, String image) { @@ -114,6 +115,10 @@ public DockerRunCommand setOverrideCommandWithArgs( return this; } + public void addDockerRunEnvars(String key, String value) { + additionalEnvars.add(key + "=" + value); + } + @Override public String getCommandWithArguments() { List argList = new ArrayList<>(); @@ -125,6 +130,10 @@ public String getCommandWithArguments() { argList.addAll(overrrideCommandWithArgs); } - return StringUtils.join(" ", argList); + String cmd = StringUtils.join(" ", argList); + for (String envar : additionalEnvars) { + cmd = cmd +"\n" + envar; + } + return cmd; } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java index ab23456cd1a..8fd031ce1b3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java @@ -37,6 +37,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Time; import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest; import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl; @@ -57,6 +58,7 @@ import org.apache.hadoop.yarn.proto.YarnServiceProtos.StartContainerRequestProto; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; import org.apache.hadoop.yarn.server.records.Version; import 
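
With the DockerRunCommand change above, each envar added via addDockerRunEnvars() is emitted on its own line after the docker arguments, and the native parse_docker_command_file() (further down) reads those trailing lines back. A GPU container's command file would then look roughly like:

    run --name=container_x_y ... <image>
    YARN_CONTAINER_EXECUTOR_GPU_ENABLED=true
    PATH=$PATH:/usr/local/nvidia/bin
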
org.apache.hadoop.yarn.server.records.impl.pb.VersionPBImpl; import org.apache.hadoop.yarn.server.utils.LeveldbIterator; @@ -135,6 +137,9 @@ private static final String LOG_DELETER_KEY_PREFIX = "LogDeleters/"; + private static final String CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX = + "/assignedResources_"; + private static final byte[] EMPTY_VALUE = new byte[0]; private DB db; @@ -275,6 +280,13 @@ private RecoveredContainerState loadContainerState(ContainerId containerId, rcs.setWorkDir(asString(entry.getValue())); } else if (suffix.equals(CONTAINER_LOG_DIR_KEY_SUFFIX)) { rcs.setLogDir(asString(entry.getValue())); + } else if (suffix.startsWith(CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX)) { + String resourceType = suffix.substring( + CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX.length()); + ResourceMappings.AssignedResources assignedResources = + ResourceMappings.AssignedResources.fromBytes(entry.getValue()); + rcs.getResourceMappings().addAssignedResources(resourceType, + assignedResources); } else { LOG.warn("the container " + containerId + " will be killed because of the unknown key " + key @@ -1074,6 +1086,33 @@ public void removeLogDeleter(ApplicationId appId) throws IOException { } } + @Override + public void storeAssignedResources(ContainerId containerId, + String resourceType, List assignedResources) throws IOException { + if (LOG.isDebugEnabled()) { + LOG.debug("storeAssignedResources: containerId=" + containerId + + ", assignedResources=" + StringUtils.join(",", assignedResources)); + } + + String keyResChng = CONTAINERS_KEY_PREFIX + containerId.toString() + + CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX + resourceType; + try { + WriteBatch batch = db.createWriteBatch(); + try { + ResourceMappings.AssignedResources res = new ResourceMappings.AssignedResources(); + res.updateAssignedResources(assignedResources); + + // New value will overwrite old values for the same key + batch.put(bytes(keyResChng), res.toBytes()); + db.write(batch); + } finally { + batch.close(); + } + } catch (DBException e) { + throw new IOException(e); + } + } + @SuppressWarnings("deprecation") private void cleanupDeprecatedFinishedApps() { try { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java index 4bcdf5cd7c5..9025a6e9ff8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java @@ -229,6 +229,11 @@ public void removeLogDeleter(ApplicationId appId) throws IOException { } @Override + public void storeAssignedResources(ContainerId containerId, + String resourceType, List assignedResources) throws IOException { + } + + @Override protected void initStorage(Configuration conf) throws IOException { } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java 
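
For reference, storeAssignedResources() keys each record under the container's prefix plus "/assignedResources_" and the resource type, and loadContainerState() matches that suffix back during recovery. Assuming the pre-existing CONTAINERS_KEY_PREFIX of "ContainerManager/containers/", a stored GPU assignment would sit under a leveldb key shaped like:

    ContainerManager/containers/container_x_y/assignedResources_gpu
        -> Java-serialized List<String>, e.g. ["0", "1"]
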
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java index 9dd1eb06fe9..8109dff4993 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java @@ -42,6 +42,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LocalizedResourceProto; import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto; import org.apache.hadoop.yarn.server.api.records.MasterKey; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; @Private @Unstable @@ -87,6 +88,7 @@ public NMStateStoreService(String name) { int version; private RecoveredContainerType recoveryType = RecoveredContainerType.RECOVER; + ResourceMappings resMappings = new ResourceMappings(); public RecoveredContainerStatus getStatus() { return status; @@ -162,6 +164,10 @@ public RecoveredContainerType getRecoveryType() { public void setRecoveryType(RecoveredContainerType recoveryType) { this.recoveryType = recoveryType; } + + public ResourceMappings getResourceMappings() { + return resMappings; + } } public static class LocalResourceTrackerState { @@ -601,6 +607,8 @@ public abstract void storeLogDeleter(ApplicationId appId, public abstract void removeLogDeleter(ApplicationId appId) throws IOException; + public abstract void storeAssignedResources(ContainerId containerId, + String resourceType, List assignedResources) throws IOException; protected abstract void initStorage(Configuration conf) throws IOException; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/scheduler/allocators/LocalResourceAllocators.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/scheduler/allocators/LocalResourceAllocators.java new file mode 100644 index 00000000000..3da3515a9d5 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/scheduler/allocators/LocalResourceAllocators.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
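
LocalResourceAllocators (defined immediately below) is a simple static holder: NodeManager.serviceInit() earlier in this patch publishes the allocator when GPU support is enabled, and ResourceHandlerModule (also above) reads it back while assembling the handler chain. In sketch form:

    // At NM start-up, when yarn.nodemanager.resource.gpu.enabled is true:
    LocalResourceAllocators.setGpuResourceAllocator(new GpuResourceAllocator(context));
    // Later, when the resource handler chain is built:
    GpuResourceAllocator allocator = LocalResourceAllocators.getGpuResourceAllocator();
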
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.scheduler.allocators; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.GpuResourceAllocator; + +/** + * Manages local resource allocators such as GPU resource allocator. + */ +public class LocalResourceAllocators { + private static GpuResourceAllocator gpuResourceAllocator; + + public static GpuResourceAllocator getGpuResourceAllocator() { + return gpuResourceAllocator; + } + + public static void setGpuResourceAllocator(GpuResourceAllocator allocator) { + gpuResourceAllocator = allocator; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c index 3625d261a93..988091c178c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c @@ -80,6 +80,7 @@ static const char* TC_READ_STATS_OPTS [] = { "-s", "-b", NULL}; struct passwd *user_detail = NULL; FILE* LOGFILE = NULL; + FILE* ERRORFILE = NULL; static uid_t nm_uid = -1; @@ -1301,8 +1302,7 @@ char* sanitize_docker_command(const char *line) { return output; } -char* parse_docker_command_file(const char* command_file) { - +char* parse_docker_command_file(const char* command_file, char** docker_run_envars) { size_t len = 0; char *line = NULL; ssize_t read; @@ -1319,6 +1319,31 @@ char* parse_docker_command_file(const char* command_file) { fflush(ERRORFILE); exit(ERROR_READING_DOCKER_FILE); } + if (docker_run_envars) { + // Read envars to "k1=v1 k2=v2 ..." form. + int envLen = 0; + char* env = NULL; + + while ((read = getline(&env, &len, stream)) > 0) { + envLen += (read + 1); + + *docker_run_envars = realloc(*docker_run_envars, envLen); + if (envLen == read + 1) { + // initialize array for the first use + (*docker_run_envars)[0] = 0; + } + + if (NULL == *docker_run_envars) { + fprintf(ERRORFILE, "failed to allocate memory for docker run envars"); + } + + // Escape last delim + if ('\n' == env[read - 1]) { + env[read - 1] = ' '; + } + strcat(*docker_run_envars, env); + } + } fclose(stream); char* ret = sanitize_docker_command(line); @@ -1332,7 +1357,7 @@ char* parse_docker_command_file(const char* command_file) { } int run_docker(const char *command_file) { - char* docker_command = parse_docker_command_file(command_file); + char* docker_command = parse_docker_command_file(command_file, NULL); char* docker_binary = get_section_value(DOCKER_BINARY_KEY, &executor_cfg); docker_binary = check_docker_binary(docker_binary); @@ -1498,8 +1523,19 @@ int launch_docker_container_as_user(const char * user, const char *app_id, gid_t user_gid = getegid(); uid_t prev_uid = geteuid(); - char *docker_command = parse_docker_command_file(command_file); - char *docker_binary = get_section_value(DOCKER_BINARY_KEY, &executor_cfg); + // Additional envars for docker run .. 
+ char *additional_envs = NULL; + + char *docker_command = parse_docker_command_file(command_file, &additional_envs); + char *docker_binary; + char *nvidia_docker_binary = NULL; + + if (additional_envs && + (NULL != strstr(additional_envs, GPU_ENABLED_ENV_VAR))) { + nvidia_docker_binary = get_section_value(NVIDIA_DOCKER_BINARY_KEY, &executor_cfg); + } + + docker_binary = get_section_value(DOCKER_BINARY_KEY, &executor_cfg); docker_binary = check_docker_binary(docker_binary); fprintf(LOGFILE, "Creating script paths...\n"); @@ -1539,7 +1575,12 @@ int launch_docker_container_as_user(const char * user, const char *app_id, goto cleanup; } - snprintf(docker_command_with_binary, EXECUTOR_PATH_MAX, "%s %s", docker_binary, docker_command); + snprintf(docker_command_with_binary, EXECUTOR_PATH_MAX, "%s %s %s", + additional_envs, + // Only use nvidia-docker in run if it is required + (nvidia_docker_binary ? nvidia_docker_binary : docker_binary), + docker_command); + fprintf(LOGFILE, "Docker run command, %s", docker_command_with_binary); fprintf(LOGFILE, "Launching docker container...\n"); FILE* start_docker = popen(docker_command_with_binary, "r"); @@ -2330,3 +2371,12 @@ int traffic_control_read_state(char *command_file) { int traffic_control_read_stats(char *command_file) { return run_traffic_control(TC_READ_STATS_OPTS, command_file); } + +/** + * FIXME: (wangda) it's better to move executor_cfg out of container-executor.c + * Now initialize of executor_cfg and data structures are stored inside + * container-executor which is not a good design. + */ +struct configuration* get_cfg() { + return &executor_cfg; +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h index 118c6cf7257..027e0b1442a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h @@ -47,7 +47,8 @@ enum operations { RUN_AS_USER_DELETE = 9, RUN_AS_USER_LAUNCH_DOCKER_CONTAINER = 10, RUN_DOCKER = 11, - RUN_AS_USER_LIST = 12 + RUN_AS_USER_LIST = 12, + UPDATE_CGROUPS_PARAM = 13, }; #define NM_GROUP_KEY "yarn.nodemanager.linux-container-executor.group" @@ -60,8 +61,10 @@ enum operations { #define BANNED_USERS_KEY "banned.users" #define ALLOWED_SYSTEM_USERS_KEY "allowed.system.users" #define DOCKER_BINARY_KEY "docker.binary" +#define NVIDIA_DOCKER_BINARY_KEY "nvidia_docker.binary" #define DOCKER_SUPPORT_ENABLED_KEY "feature.docker.enabled" #define TC_SUPPORT_ENABLED_KEY "feature.tc.enabled" +#define GPU_ENABLED_ENV_VAR "YARN_CONTAINER_EXECUTOR_GPU_ENABLED=true" #define TMP_DIR "tmp" extern struct passwd *user_detail; @@ -262,3 +265,5 @@ int run_docker(const char *command_file); * Sanitize docker commands. Returns NULL if there was any failure. 
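
Because additional_envs is prepended to the string handed to popen(), the envars behave like shell-style assignments on the docker invocation, and the presence of the GPU marker switches the binary to the configured nvidia_docker.binary. The launched command therefore looks roughly like this (binary path illustrative):

    YARN_CONTAINER_EXECUTOR_GPU_ENABLED=true PATH=$PATH:/usr/local/nvidia/bin /usr/bin/nvidia-docker run --name=container_x_y ... <image>
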
*/ char* sanitize_docker_command(const char *line); + +struct configuration* get_cfg(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c index b2187c9daf0..cd85972b23d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c @@ -20,6 +20,8 @@ #include "configuration.h" #include "container-executor.h" #include "util.h" +#include "modules/gpu/gpu-module.h" +#include "modules/cgroups/cgroups-operations.h" #include #include @@ -235,6 +237,8 @@ static struct { int container_pid; int signal; const char *docker_command_file; + const char *cgroups_param_path; + const char* cgroups_param_value; } cmd_input; static int validate_run_as_user_commands(int argc, char **argv, int *operation); @@ -253,6 +257,14 @@ static int validate_arguments(int argc, char **argv , int *operation) { return INVALID_ARGUMENT_NUMBER; } + /* + * Check if it is a known module, if yes, redirect to module + */ + if (strcmp("gpu", argv[1]) == 0) { + return handle_gpu_request(&update_cgroups_parameters, "gpu", argc - 1, + &argv[1]); + } + if (strcmp("--checksetup", argv[1]) == 0) { *operation = CHECK_SETUP; return 0; @@ -332,6 +344,7 @@ static int validate_arguments(int argc, char **argv , int *operation) { return FEATURE_DISABLED; } } + /* Now we have to validate 'run as user' operations that don't use a 'long option' - we should fix this at some point. The validation/argument parsing here is extensive enough that it done in a separate function */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.c new file mode 100644 index 00000000000..47814eca01d --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.c @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "configuration.h" +#include "container-executor.h" +#include "utils/command-line-parser.h" +#include "utils/string-utils.h" +#include "modules/common/module-configs.h" +#include "modules/common/constants.h" +#include "modules/cgroups/cgroups-operations.h" +#include "util.h" + +#include +#include +#include +#include +#include + +#define CGROUPS_ROOT_KEY CONFIGS_MODULES_PREFIX "cgroups.root" +#define CGROUPS_YARN_HIERARCHY_KEY CONFIGS_MODULES_PREFIX "cgroups.yarn-hierarchy" + +int get_cgroups_path_to_write( + const char* controller_name, + const char* param_name, + const char* group_id, + char* output_path) { + const char* cgroups_root = get_section_value(CGROUPS_ROOT_KEY, get_cfg()); + const char* yarn_hierarchy_name = get_section_value(CGROUPS_YARN_HIERARCHY_KEY, get_cfg()); + + // Make sure it is defined. + if (!cgroups_root || strlen(cgroups_root) == 0) { + fprintf(LOGFILE, "%s is not defined in container-executor.cfg", + CGROUPS_ROOT_KEY); + return -1; + } + + // Make sure it is defined. + if (!yarn_hierarchy_name || strlen(yarn_hierarchy_name) == 0) { + fprintf(LOGFILE, "%s is not defined in container-executor.cfg", + CGROUPS_YARN_HIERARCHY_KEY); + return -1; + } + + // Make a path. + // CGroups path should not be too long. + sprintf(output_path, "%s/%s/%s/%s/%s.%s", + cgroups_root, controller_name, yarn_hierarchy_name, + group_id, controller_name, param_name); + + return 0; +} + +int update_cgroups_parameters( + const char* controller_name, + const char* param_name, + const char* group_id, + const char* value) { +#ifndef __linux + fprintf(LOGFILE, "Failed to update cgroups parameters, not supported\n"); + return -1; +#endif + + char full_path[4096]; + int rc = get_cgroups_path_to_write(controller_name, param_name, + group_id, full_path); + + if (0 != rc) { + fprintf(LOGFILE, + "Failed to get cgroups path to write, it should be a configuration issue"); + return -1; + } + + // Make sure file exist + struct stat sb; + if (stat(full_path, &sb) != 0) { + fprintf(LOGFILE, "CGroups: Could not find file to write, %s", full_path); + return -1; + } + + fprintf(LOGFILE, "CGroups: Updating cgroups, path=%s, value=%s", + full_path, value); + + // Write values to file + FILE *f; + f = fopen(full_path, "a"); + if (!f) { + fprintf(LOGFILE, "CGroups: Failed to open cgroups file, %s", full_path); + return -1; + } + if (fprintf(f, "%s", value) < 0) { + fprintf(LOGFILE, "CGroups: Failed to write cgroups file, %s", full_path); + return -1; + } + if (fclose(f) != 0) { + fprintf(LOGFILE, "CGroups: Failed to close cgroups file, %s", full_path); + return -1; + } + + return 0; +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.h new file mode 100644 index 00000000000..05539003709 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.h @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
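
Plugging representative values into the sprintf format above (the header that follows uses the same examples), update_cgroups_parameters("devices", "deny", "container_x_y", "a *:* rwm") appends to a path of this shape, with root and hierarchy taken from container-executor.cfg (values illustrative):

    /sys/fs/cgroup/devices/hadoop-yarn/container_x_y/devices.deny   <- "a *:* rwm"
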
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef __FreeBSD__ +#define _WITH_GETLINE +#endif + +#ifndef _CGROUPS_OPERATIONS_H_ +#define _CGROUPS_OPERATIONS_H_ + +/** + * Handle gpu requests: + * - controller_name: e.g. devices + * - param_name: e.g. deny + * - group_id: e.g. container_x_y + * - value: e.g. "a *:* rwm" + * + * return 0 if succeeded + */ +int update_cgroups_parameters( + const char* controller_name, + const char* param_name, + const char* group_id, + const char* value); + + /** + * Get CGroups path to update. Visible for testing. + * Return 0 if succeeded + */ + int get_cgroups_path_to_write( + const char* controller_name, + const char* param_name, + const char* group_id, + char* output_path); + +#endif \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/constants.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/constants.h new file mode 100644 index 00000000000..5c8c4e939ee --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/constants.h @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* FreeBSD protects the getline() prototype. See getline(3) for more */ +#ifdef __FreeBSD__ +#define _WITH_GETLINE +#endif + +#ifndef _MODULES_COMMON_CONSTANTS_H_ +#define _MODULES_COMMON_CONSTANTS_H_ + +#define CONFIGS_MODULES_PREFIX "yarn.container-executor.modules." 
+#endif
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/module-configs.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/module-configs.c
new file mode 100644
index 00000000000..bd694bee138
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/module-configs.c
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util.h"
+#include "configuration.h"
+#include "container-executor.h"
+#include "modules/common/constants.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int module_enabled(const char* module_name) {
+  char* module_enabled_config_key;
+  int buffer_len = strlen(CONFIGS_MODULES_PREFIX) + strlen(module_name)
+      + strlen(".enabled") + 1;
+
+  module_enabled_config_key = malloc(buffer_len);
+
+  if (!module_enabled_config_key) {
+    fprintf(LOGFILE, "Failed to allocate memory for module_enabled_config_key\n");
+    return 0;
+  }
+
+  // Build "<prefix><module>.enabled", e.g.
+  // yarn.container-executor.modules.gpu.enabled
+  strcat(strcpy(module_enabled_config_key, CONFIGS_MODULES_PREFIX), module_name);
+  strcat(module_enabled_config_key, ".enabled");
+
+  char* enabled_str = get_section_value(module_enabled_config_key, get_cfg());
+  int rc = 0;
+  if (enabled_str && 0 == strcmp(enabled_str, "true")) {
+    rc = 1;
+  } else {
+    fprintf(LOGFILE, "Module %s is disabled\n", module_name);
+  }
+
+  free(enabled_str);
+  free(module_enabled_config_key);
+  return rc;
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/module-configs.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/module-configs.h
new file mode 100644
index 00000000000..65e868738be
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/common/module-configs.h
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __FreeBSD__
+#define _WITH_GETLINE
+#endif
+
+#ifndef _MODULES_COMMON_MODULE_CONFIGS_H_
+#define _MODULES_COMMON_MODULE_CONFIGS_H_
+
+/**
+ * Check whether the module with the given name is enabled in
+ * container-executor.cfg. Returns 1 if enabled, 0 if disabled.
+ */
+int module_enabled(const char* module_name);
+
+#endif
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.c
new file mode 100644
index 00000000000..dd796f13fe7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.c
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "configuration.h"
+#include "container-executor.h"
+#include "utils/command-line-parser.h"
+#include "utils/string-utils.h"
+#include "modules/gpu/gpu-module.h"
+#include "modules/cgroups/cgroups-operations.h"
+#include "modules/common/module-configs.h"
+#include "modules/common/constants.h"
+#include "util.h"
+
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define EXCLUDED_GPUS_OPTION "excluded_gpus"
+#define CONTAINER_ID_OPTION "container_id"
+#define GPU_MAJOR_NUMBER_CONFIG_KEY CONFIGS_MODULES_PREFIX "gpu.major-device-number"
+#define GPU_ALLOWED_DEVICES_MINOR_NUMBERS CONFIGS_MODULES_PREFIX "gpu.allowed-device-minor-numbers"
+
+static int internal_handle_gpu_request(
+    update_cgroups_parameters_func update_cgroups_parameters_func_p,
+    int n_minor_devices_to_block, int minor_devices[],
+    const char* container_id) {
+  if (n_minor_devices_to_block <= 0) {
+    // No device to block, just return.
+    return 0;
+  }
+
+  // Get major device number from cfg; if not set, 195 (Nvidia) is used as
+  // the default value.
+  int major_device_number;
+  char* major_number_str = get_section_value(GPU_MAJOR_NUMBER_CONFIG_KEY, get_cfg());
+  if (!major_number_str || 0 == strlen(major_number_str)) {
+    // Default major number of Nvidia devices.
+    major_device_number = 195;
+  } else {
+    major_device_number = atoi(major_number_str);
+  }
+
+  // Get allowed minor device numbers from cfg; if not set, all minor
+  // devices can be used by YARN.
+  char* allowed_minor_numbers_str = get_section_value(GPU_ALLOWED_DEVICES_MINOR_NUMBERS,
+    get_cfg());
+  int* allowed_minor_numbers;
+  int n_allowed_minor_numbers = 0;
+  if (!allowed_minor_numbers_str || 0 == strlen(allowed_minor_numbers_str)) {
+    allowed_minor_numbers = NULL;
+  } else {
+    int rc = get_numbers_split_by_comma(allowed_minor_numbers_str,
+      &allowed_minor_numbers,
+      &n_allowed_minor_numbers);
+    if (0 != rc) {
+      fprintf(LOGFILE,
+        "Failed to get allowed minor device numbers from cfg, value=%s\n",
+        allowed_minor_numbers_str);
+      return -1;
+    }
+
+    // Make sure we only try to block devices that are on the allowed list.
+    for (int i = 0; i < n_minor_devices_to_block; i++) {
+      int found = 0;
+      for (int j = 0; j < n_allowed_minor_numbers; j++) {
+        if (minor_devices[i] == allowed_minor_numbers[j]) {
+          found = 1;
+          break;
+        }
+      }
+
+      if (!found) {
+        fprintf(LOGFILE,
+          "Trying to blacklist device with minor-number=%d which is not on allowed list\n",
+          minor_devices[i]);
+        free(allowed_minor_numbers);
+        return -1;
+      }
+    }
+    free(allowed_minor_numbers);
+  }
+
+  // Use cgroup helpers to blacklist devices.
+  for (int i = 0; i < n_minor_devices_to_block; i++) {
+    char param_value[128];
+    snprintf(param_value, 128, "c %d:%d rwm", major_device_number,
+      minor_devices[i]);
+
+    int rc = update_cgroups_parameters_func_p("devices", "deny",
+      container_id, param_value);
+
+    if (0 != rc) {
+      fprintf(LOGFILE, "CGroups: Failed to update cgroups\n");
+      return -1;
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * Format of the GPU request command line:
+ *
+ * c-e gpu --excluded_gpus 0,1,3 --container_id container_x_y
+ */
+int handle_gpu_request(update_cgroups_parameters_func func,
+    const char* module_name, int argc, char** argv) {
+  if (!module_enabled("gpu")) {
+    fprintf(LOGFILE,
+      "Please make sure gpu module is enabled before using it.\n");
+    return -1;
+  }
+
+  static struct option long_options[] = {
+    {EXCLUDED_GPUS_OPTION, required_argument, 0, 'e' },
+    {CONTAINER_ID_OPTION, required_argument, 0, 'c' },
+    {0, 0, 0, 0}
+  };
+
+  int rc = 0;
+  int c = 0;
+  int option_index = 0;
+
+  int* minor_devices = NULL;
+  char container_id[128];
+  int n_minor_devices_to_block = 0;
+
+  while ((c = getopt_long(argc, argv, "e:c:", long_options, &option_index)) != -1) {
+    switch (c) {
+      case 'e':
+        rc = get_numbers_split_by_comma(optarg, &minor_devices,
+          &n_minor_devices_to_block);
+        if (0 != rc) {
+          fprintf(LOGFILE,
+            "Failed to get minor devices number from command line, value=%s\n",
+            optarg);
+          return -1;
+        }
+        break;
+      case 'c':
+        if (!validate_container_id(optarg)) {
+          fprintf(LOGFILE,
+            "Specified container_id=%s is invalid\n", optarg);
+          return -1;
+        }
+        if (strlen(optarg) >= sizeof(container_id)) {
+          fprintf(LOGFILE,
+            "Specified container_id=%s is too long\n", optarg);
+          return -1;
+        }
+        strcpy(container_id, optarg);
+        break;
+      default:
+        fprintf(LOGFILE,
+          "Unknown option %c (%d) in gpu command, option-index=%d\n",
+          c, c, optind);
+        fflush(LOGFILE);
+        return -1;
+    }
+  }
+
+  return internal_handle_gpu_request(func, n_minor_devices_to_block,
+    minor_devices,
+    container_id);
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.h
new file mode 100644
index 00000000000..898bb97fcf6
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.h
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __FreeBSD__
+#define _WITH_GETLINE
+#endif
+
+#ifndef _MODULES_GPU_GPU_MODULE_H_
+#define _MODULES_GPU_GPU_MODULE_H_
+
+// Function pointer type for the cgroups updater; lets unit tests stub out
+// the real cgroups writer.
+typedef int (*update_cgroups_parameters_func)(const char*, const char*,
+   const char*, const char*);
+
+/**
+ * Handle gpu requests.
+ */
+int handle_gpu_request(update_cgroups_parameters_func func,
+   const char* module_name, int argc, char** argv);
+
+#endif
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/command-line-parser.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/command-line-parser.c
new file mode 100644
index 00000000000..d240c8d508b
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/command-line-parser.c
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/command-line-parser.h"
+#include "util.h"
+#include "container-executor.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct parsed_command_line* parse_commandline_opts(int argc, char** argv, int n_known_parameters,
+    const char** known_parameters, const int required[], const int has_values[]) {
+  // Validate inputs first: all option-related arrays must be non-null.
+  if (!(known_parameters && required && has_values)) {
+    fprintf(LOGFILE,
+      "Please make sure known_parameters / required / has_values are set\n");
+    return NULL;
+  }
+
+  struct parsed_command_line* opts = malloc(sizeof(struct parsed_command_line));
+  if (!opts) {
+    fprintf(LOGFILE, "Failed to malloc parsed_command_line\n");
+    return NULL;
+  }
+
+  opts->keys = malloc(sizeof(char*) * (argc + 1));
+  opts->values = malloc(sizeof(char*) * (argc + 1));
+
+  if (!opts->keys || !opts->values) {
+    fprintf(LOGFILE, "Failed to malloc keys or values of opts\n");
+    goto fail;
+  }
+
+  // Start parsing the command line.
+  int input_argv_idx = 0;
+  opts->n_options = 0;
+
+  while (input_argv_idx < argc) {
+    // Get the parameter name.
+    char* param_name = argv[input_argv_idx];
+
+    // Make sure param_name starts with "--".
+    if (0 != strncmp("--", param_name, 2)) {
+      fprintf(LOGFILE, "option %s does not start with \"--\"\n", param_name);
+      goto fail;
+    }
+
+    // Strip the "--" prefix.
+    param_name += 2;
+
+    int param_idx = -1;
+
+    for (int i = 0; i < n_known_parameters; i++) {
+      if (0 == strcmp(known_parameters[i], param_name)) {
+        param_idx = i;
+        break;
+      }
+    }
+
+    if (param_idx < 0) {
+      fprintf(LOGFILE, "cannot find parameter %s in known parameters\n", param_name);
+      goto fail;
+    }
+
+    opts->keys[opts->n_options] = param_name;
+    opts->values[opts->n_options] = NULL;
+
+    // Check whether a value must follow the parameter.
+    if (has_values[param_idx]) {
+      // Parse the value.
+      input_argv_idx++;
+
+      if (input_argv_idx >= argc) {
+        fprintf(LOGFILE, "unexpected end of command line while parsing param=%s\n",
+          param_name);
+        goto fail;
+      }
+
+      opts->values[opts->n_options] = argv[input_argv_idx];
+    }
+
+    opts->n_options++;
+    input_argv_idx++;
+  }
+
+  // Make sure all required parameters are set.
+  for (int i = 0; i < n_known_parameters; i++) {
+    if (required[i]) {
+      const char* required_key = known_parameters[i];
+
+      int found = 0;
+      for (int j = 0; j < opts->n_options; j++) {
+        if (0 == strcmp(opts->keys[j], required_key)) {
+          found = 1;
+          break;
+        }
+      }
+
+      if (!found) {
+        fprintf(LOGFILE, "%s is required but not specified in command line.\n",
+          required_key);
+        goto fail;
+      }
+    }
+  }
+
+  return opts;
+
+fail:
+  free(opts->keys);
+  free(opts->values);
+  free(opts);
+  return NULL;
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/command-line-parser.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/command-line-parser.h
new file mode 100644
index 00000000000..4a99061b866
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/command-line-parser.h
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __FreeBSD__
+#define _WITH_GETLINE
+#endif
+
+#ifndef _UTILS_COMMAND_LINE_PARSER_H_
+#define _UTILS_COMMAND_LINE_PARSER_H_
+
+struct parsed_command_line {
+  int n_options;
+  char** keys;
+  char** values;
+};
+
+/*
+ * Returns the parsed command-line options.
+ * As usual, pass:
+ * - argc / argv
+ *
+ * In addition, specify:
+ * - known_parameters, without the "--" prefix
+ * - whether each parameter is required (1 means required)
+ * - whether each option is followed by a value (1 means it has a value)
+ */
+struct parsed_command_line* parse_commandline_opts(int argc, char** argv,
+    int n_known_parameters, const char** known_parameters,
+    const int required[], const int has_values[]);
+
+#endif
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/string-utils.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/string-utils.c
new file mode 100644
index 00000000000..62251cd35ce
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/string-utils.c
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Returns 1 if all chars in the input string are digits, 0 otherwise.
+ */
+static int all_numbers(char* input) {
+  if (strlen(input) == 0) {
+    return 0;
+  }
+
+  for (int i = 0; i < strlen(input); i++) {
+    if (!isdigit((unsigned char) input[i])) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+int get_numbers_split_by_comma(char* input, int** numbers, int* ret_n_numbers) {
+  int n_numbers = 1;
+  for (int i = 0; i < strlen(input); i++) {
+    if (input[i] == ',') {
+      n_numbers++;
+    }
+  }
+
+  int* arr = malloc(sizeof(int) * n_numbers);
+  if (!arr) {
+    return -1;
+  }
+
+  // strtok modifies its input, so work on a copy (+1 for the trailing NUL).
+  char* input_cpy = malloc(strlen(input) + 1);
+  if (!input_cpy) {
+    free(arr);
+    return -1;
+  }
+  strcpy(input_cpy, input);
+
+  char* p = strtok(input_cpy, ",");
+  int idx = 0;
+  while (p != NULL) {
+    arr[idx] = atoi(p);
+    p = strtok(NULL, ",");
+    idx++;
+  }
+
+  free(input_cpy);
+
+  // Report the number of tokens actually parsed (strtok skips empty fields).
+  *numbers = arr;
+  *ret_n_numbers = idx;
+
+  return 0;
+}
+
+int validate_container_id(char* input) {
+  /*
+   * Two different forms of container_id:
+   * container_e17_1410901177871_0001_01_000005
+   * container_1410901177871_0001_01_000005
+   */
+  int valid = 1;
+  char* input_cpy = malloc(strlen(input) + 1);
+  if (!input_cpy) {
+    return 0;
+  }
+  strcpy(input_cpy, input);
+
+  char* p = strtok(input_cpy, "_");
+  int idx = 0;
+  while (p != NULL) {
+    if (0 == idx) {
+      if (0 != strcmp("container", p)) {
+        valid = 0;
+        break;
+      }
+    } else if (1 == idx) {
+      // This field is either the epoch, e[n][n]..., or all digits.
+      if (!all_numbers(p)) {
+        if (p[0] != 'e' || !all_numbers(p + 1)) {
+          valid = 0;
+          break;
+        }
+      }
+    } else {
+      // All other fields must be numeric.
+      if (!all_numbers(p)) {
+        valid = 0;
+        break;
+      }
+    }
+
+    p = strtok(NULL, "_");
+    idx++;
+  }
+  free(input_cpy);
+
+  // A valid id has 5 or 6 fields split by '_'.
+  if (valid && (idx > 6 || idx < 5)) {
+    valid = 0;
+  }
+  return valid;
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/string-utils.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/string-utils.h
new file mode 100644
index 00000000000..25f54a2016c
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/string-utils.h
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __FreeBSD__
+#define _WITH_GETLINE
+#endif
+
+#ifndef _UTILS_STRING_UTILS_H_
+#define _UTILS_STRING_UTILS_H_
+
+/*
+ * Get numbers split by comma from an input string.
+ * Returns 0 on success.
+ */
+int get_numbers_split_by_comma(char* input, int** numbers, int* n_numbers);
+
+/*
+ * Validate a container id string, e.g. container_1410901177871_0001_01_000005.
+ * Returns 1 if valid, 0 otherwise.
+ */
+int validate_container_id(char* input);
+
+#endif
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/cgroups/test-cgroups-module.cc b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/cgroups/test-cgroups-module.cc
new file mode 100644
index 00000000000..a8c9ddcb22c
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/cgroups/test-cgroups-module.cc
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <gtest/gtest.h>
+#include <sstream>
+
+extern "C" {
+#include "configuration.h"
+#include "container-executor.h"
+#include "modules/cgroups/cgroups-operations.h"
+#include "test/test-container-executor-common.h"
+#include "util.h"
+}
+
+namespace ContainerExecutor {
+
+class TestCGroupsModule : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    if (mkdirs(TEST_ROOT, 0755) != 0) {
+      exit(1);
+    }
+  }
+
+  virtual void TearDown() {}
+};
+
+TEST_F(TestCGroupsModule, test_cgroups_get_path_without_define_root) {
+  printf("\nTesting %s\n", __func__);
+
+  // Write config file.
+  char *filename = TEST_ROOT "/test_cgroups_get_path_without_root.cfg";
+  FILE *file = fopen(filename, "w");
+
+  ASSERT_TRUE(file) << "FAIL: Could not open configuration file: " << filename
+                    << "\n";
+  fprintf(file,
+      "yarn.container-executor.modules.cgroups.yarn-hierarchy=yarn\n");
+  fclose(file);
+
+  // Read config file
+  read_executor_config(filename);
+  char cgroup_full_path[4096];
+
+  int rc = get_cgroups_path_to_write("devices", "deny", "container_1",
+                                     cgroup_full_path);
+
+  ASSERT_NE(0, rc) << "Should fail.\n";
+}
+
+TEST_F(TestCGroupsModule, test_cgroups_get_path_without_define_yarn_hierarchy) {
+  printf("\nTesting %s\n", __func__);
+
+  // Write config file.
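+  // Only cgroups.root is defined here; yarn-hierarchy is intentionally
+  // omitted, so path construction must fail.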
+  char *filename = TEST_ROOT "/test_cgroups_get_path_without_yarn_hierarchy.cfg";
+  FILE *file = fopen(filename, "w");
+
+  ASSERT_TRUE(file) << "FAIL: Could not open configuration file: " << filename
+                    << "\n";
+  fprintf(file,
+      "yarn.container-executor.modules.cgroups.root=/sys/fs/cgroups\n");
+  fclose(file);
+
+  // Read config file
+  read_executor_config(filename);
+  char cgroup_full_path[4096];
+
+  int rc = get_cgroups_path_to_write("devices", "deny", "container_1",
+                                     cgroup_full_path);
+
+  ASSERT_NE(0, rc) << "Should fail.\n";
+}
+
+TEST_F(TestCGroupsModule, test_cgroups_get_path_succeeded) {
+  printf("\nTesting %s\n", __func__);
+
+  // Write config file.
+  char *filename = TEST_ROOT "/test_cgroups_get_path.cfg";
+  FILE *file = fopen(filename, "w");
+
+  ASSERT_TRUE(file) << "FAIL: Could not open configuration file\n";
+  fprintf(file,
+      "yarn.container-executor.modules.cgroups.root=/sys/fs/cgroups\n");
+  fprintf(file,
+      "yarn.container-executor.modules.cgroups.yarn-hierarchy=yarn\n");
+  fclose(file);
+
+  // Read config file
+  read_executor_config(filename);
+  char cgroup_full_path[4096];
+
+  int rc = get_cgroups_path_to_write("devices", "deny", "container_1",
+                                     cgroup_full_path);
+  ASSERT_EQ(0, rc) << "Should succeed.\n";
+
+  const char *EXPECTED =
+      "/sys/fs/cgroups/devices/yarn/container_1/devices.deny";
+
+  ASSERT_STREQ(EXPECTED, cgroup_full_path)
+      << "Returned cgroup path to write is not the expected one\n";
+}
+} // namespace ContainerExecutor
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/gpu/test-gpu-module.cc b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/gpu/test-gpu-module.cc
new file mode 100644
index 00000000000..46289370b8d
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/gpu/test-gpu-module.cc
@@ -0,0 +1,192 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <gtest/gtest.h>
+#include <sstream>
+
+extern "C" {
+#include "configuration.h"
+#include "container-executor.h"
+#include "modules/cgroups/cgroups-operations.h"
+#include "modules/gpu/gpu-module.h"
+#include "test/test-container-executor-common.h"
+#include "util.h"
+}
+
+namespace ContainerExecutor {
+
+class TestGpuModule : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    if (mkdirs(TEST_ROOT, 0755) != 0) {
+      exit(1);
+    }
+  }
+
+  virtual void TearDown() {
+  }
+};
+
+// Records every (controller, param, group, value) tuple passed to the
+// cgroups updater so tests can assert on them.
+static std::vector<const char*> cgroups_parameters_invoked;
+
+static int mock_update_cgroups_parameters(
+    const char* controller_name,
+    const char* param_name,
+    const char* group_id,
+    const char* value) {
+  char* buf = (char*) malloc(128);
+  strcpy(buf, controller_name);
+  cgroups_parameters_invoked.push_back(buf);
+
+  buf = (char*) malloc(128);
+  strcpy(buf, param_name);
+  cgroups_parameters_invoked.push_back(buf);
+
+  buf = (char*) malloc(128);
+  strcpy(buf, group_id);
+  cgroups_parameters_invoked.push_back(buf);
+
+  buf = (char*) malloc(128);
+  strcpy(buf, value);
+  cgroups_parameters_invoked.push_back(buf);
+  return 0;
+}
+
+static void verify_param_updated_to_cgroups(
+    int argc, const char** argv) {
+  ASSERT_EQ(argc, (int) cgroups_parameters_invoked.size());
+
+  int offset = 0;
+  while (offset < argc) {
+    ASSERT_STREQ(argv[offset], cgroups_parameters_invoked[offset]);
+    offset++;
+  }
+}
+
+static void write_and_load_gpu_module_to_cfg(const char* cfg_filepath, int enabled) {
+  FILE *file = fopen(cfg_filepath, "w");
+  if (file == NULL) {
+    printf("FAIL: Could not open configuration file: %s\n", cfg_filepath);
+    exit(1);
+  }
+  if (enabled) {
+    fprintf(file, "yarn.container-executor.modules.gpu.enabled=true\n");
+  } else {
+    fprintf(file, "yarn.container-executor.modules.gpu.enabled=false\n");
+  }
+  fclose(file);
+
+  // Read config file
+  read_executor_config(cfg_filepath);
+}
+
+static void test_gpu_module_enabled_disabled(int enabled) {
+  // Write config file.
+  char *filename = TEST_ROOT "/test_cgroups_module_enabled_disabled.cfg";
+  write_and_load_gpu_module_to_cfg(filename, enabled);
+
+  char* argv[] = { "gpu", "--excluded_gpus", "0,1",
+                   "--container_id",
+                   "container_1498064906505_0001_01_000001" };
+
+  int rc = handle_gpu_request(&mock_update_cgroups_parameters,
+                              "gpu", 5, argv);
+
+  int EXPECTED_RC;
+  if (enabled) {
+    EXPECTED_RC = 0;
+  } else {
+    EXPECTED_RC = -1;
+  }
+  ASSERT_EQ(EXPECTED_RC, rc);
+}
+
+TEST_F(TestGpuModule, test_verify_gpu_module_calls_group_parameter) {
+  // Write config file.
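+  // The gpu module must be enabled in the cfg, otherwise handle_gpu_request
+  // bails out early.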
+  char *filename = TEST_ROOT "/test_verify_gpu_module_calls_group_parameter.cfg";
+  write_and_load_gpu_module_to_cfg(filename, 1);
+
+  char* container_id = "container_1498064906505_0001_01_000001";
+  char* argv[] = { "gpu", "--excluded_gpus", "0,1",
+                   "--container_id",
+                   container_id };
+
+  /* Test case 1: block 2 devices */
+  cgroups_parameters_invoked.clear();
+  int rc = handle_gpu_request(&mock_update_cgroups_parameters,
+                              "gpu", 5, argv);
+  ASSERT_EQ(0, rc) << "Should succeed.\n";
+
+  // Verify cgroups parameters
+  const char* expected_cgroups_argv[] = { "devices", "deny", container_id, "c 195:0 rwm",
+      "devices", "deny", container_id, "c 195:1 rwm"};
+  verify_param_updated_to_cgroups(8, expected_cgroups_argv);
+
+  /* Test case 2: block 0 devices */
+  cgroups_parameters_invoked.clear();
+  char* argv_1[] = { "gpu", "--container_id", container_id };
+  rc = handle_gpu_request(&mock_update_cgroups_parameters,
+                          "gpu", 3, argv_1);
+  ASSERT_EQ(0, rc) << "Should succeed.\n";
+
+  // Verify cgroups parameters
+  verify_param_updated_to_cgroups(0, NULL);
+}
+
+TEST_F(TestGpuModule, test_illegal_cli_parameters) {
+  // Write config file.
+  char *filename = TEST_ROOT "/test_illegal_cli_parameters.cfg";
+  write_and_load_gpu_module_to_cfg(filename, 1);
+
+  // Illegal container id - 1
+  char* argv[] = { "gpu", "--excluded_gpus", "0,1",
+                   "--container_id", "xxxx" };
+  int rc = handle_gpu_request(&mock_update_cgroups_parameters,
+                              "gpu", 5, argv);
+  ASSERT_NE(0, rc) << "Should fail.\n";
+
+  // Illegal container id - 2
+  char* argv_1[] = { "gpu", "--excluded_gpus", "0,1",
+                     "--container_id", "container_1" };
+  rc = handle_gpu_request(&mock_update_cgroups_parameters,
+                          "gpu", 5, argv_1);
+  ASSERT_NE(0, rc) << "Should fail.\n";
+}
+
+TEST_F(TestGpuModule, test_gpu_module_disabled) {
+  test_gpu_module_enabled_disabled(0);
+}
+
+TEST_F(TestGpuModule, test_gpu_module_enabled) {
+  test_gpu_module_enabled_disabled(1);
+}
+} // namespace ContainerExecutor
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor-common.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor-common.h
new file mode 100644
index 00000000000..d3536252025
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor-common.h
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #ifdef __APPLE__
+ #include <CoreFoundation/CFString.h>
+ #include <CoreFoundation/CFPreferences.h>
+
+ #define TMPDIR "/private/tmp"
+ #define RELTMPDIR "../.."
+ #else
+ #define RELTMPDIR ".."
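+ /* Non-Apple platforms keep test state under /tmp; TEST_ROOT below builds on TMPDIR. */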
+ #define TMPDIR "/tmp" + #endif + + #define TEST_ROOT TMPDIR "/test-container-executor" + + #define DONT_TOUCH_FILE "dont-touch-me" + #define NM_LOCAL_DIRS TEST_ROOT "/local-1%" TEST_ROOT "/local-2%" \ + TEST_ROOT "/local-3%" TEST_ROOT "/local-4%" TEST_ROOT "/local-5" + #define NM_LOG_DIRS TEST_ROOT "/logs/userlogs" + #define ARRAY_SIZE 1000 \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c index 4a84bf0045e..4735e1a0bfa 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c @@ -18,6 +18,7 @@ #include "configuration.h" #include "container-executor.h" #include "util.h" +#include "test/test-container-executor-common.h" #include #include @@ -30,25 +31,6 @@ #include #include -#ifdef __APPLE__ -#include -#include - -#define TMPDIR "/private/tmp" -#define RELTMPDIR "../.." -#else -#define RELTMPDIR ".." -#define TMPDIR "/tmp" -#endif - -#define TEST_ROOT TMPDIR "/test-container-executor" - -#define DONT_TOUCH_FILE "dont-touch-me" -#define NM_LOCAL_DIRS TEST_ROOT "/local-1%" TEST_ROOT "/local-2%" \ - TEST_ROOT "/local-3%" TEST_ROOT "/local-4%" TEST_ROOT "/local-5" -#define NM_LOG_DIRS TEST_ROOT "/logs/userlogs" -#define ARRAY_SIZE 1000 - static char* username = NULL; static char* yarn_username = NULL; static char** local_dirs = NULL; @@ -1118,7 +1100,6 @@ void test_sanitize_docker_command() { free(command); } } - // This test is expected to be executed either by a regular // user or by root. If executed by a regular user it doesn't // test all the functions that would depend on changing the @@ -1284,10 +1265,7 @@ int main(int argc, char **argv) { test_check_user(1); #endif - run("rm -fr " TEST_ROOT); - test_trim_function(); - printf("\nFinished tests\n"); free(current_username); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test_main.cc b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test_main.cc index d59a3f22a13..44c9b1bc5c0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test_main.cc +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test_main.cc @@ -20,10 +20,13 @@ #include
#include -FILE* ERRORFILE = stderr; -FILE* LOGFILE = stdout; +extern "C" { +#include "util.h" +} int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); + ERRORFILE = stderr; + LOGFILE = stdout; + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index fb90ab4f55d..1ec15632f8b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -108,6 +108,7 @@ import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.util.timeline.TimelineUtils; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -393,22 +394,8 @@ public void testNMRecoveryForAppFinishedWithLogAggregationFailure() cm.stop(); } - @Test - public void testContainerResizeRecovery() throws Exception { - conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true); - conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true); - NMStateStoreService stateStore = new NMMemoryStateStoreService(); - stateStore.init(conf); - stateStore.start(); - Context context = createContext(conf, stateStore); - ContainerManagerImpl cm = createContainerManager(context, delSrvc); - cm.init(conf); - cm.start(); - // add an application by starting a container - ApplicationId appId = ApplicationId.newInstance(0, 1); - ApplicationAttemptId attemptId = - ApplicationAttemptId.newInstance(appId, 1); - ContainerId cid = ContainerId.newContainerId(attemptId, 1); + private void commonLaunchContainer(ApplicationId appId, ContainerId cid, + ContainerManagerImpl cm) throws Exception { Map containerEnv = new HashMap<>(); setFlowContext(containerEnv, "app_name1", appId); Map serviceData = Collections.emptyMap(); @@ -452,12 +439,35 @@ public void testContainerResizeRecovery() throws Exception { context, cm, cid, clc, null); assertTrue(startResponse.getFailedRequests().isEmpty()); assertEquals(1, context.getApplications().size()); - Application app = context.getApplications().get(appId); - assertNotNull(app); // make sure the container reaches RUNNING state waitForNMContainerState(cm, cid, org.apache.hadoop.yarn.server.nodemanager .containermanager.container.ContainerState.RUNNING); + + } + + @Test + public void testContainerResizeRecovery() throws Exception { + conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true); + conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true); + NMStateStoreService stateStore = new NMMemoryStateStoreService(); + stateStore.init(conf); + stateStore.start(); + context = createContext(conf, stateStore); + ContainerManagerImpl cm = createContainerManager(context, delSrvc); + cm.init(conf); + cm.start(); + // add an application by starting a container + ApplicationId appId = 
ApplicationId.newInstance(0, 1); + ApplicationAttemptId attemptId = + ApplicationAttemptId.newInstance(appId, 1); + ContainerId cid = ContainerId.newContainerId(attemptId, 1); + + commonLaunchContainer(appId, cid, cm); + + Application app = context.getApplications().get(appId); + assertNotNull(app); + Resource targetResource = Resource.newInstance(2048, 2); IncreaseContainersResourceResponse increaseResponse = increaseContainersResource(context, cm, cid, targetResource); @@ -480,6 +490,52 @@ public void testContainerResizeRecovery() throws Exception { } @Test + public void testResourceMappingRecoveryForContainer() throws Exception { + conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true); + conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true); + NMStateStoreService stateStore = new NMMemoryStateStoreService(); + stateStore.init(conf); + stateStore.start(); + context = createContext(conf, stateStore); + ContainerManagerImpl cm = createContainerManager(context, delSrvc); + cm.init(conf); + cm.start(); + + // add an application by starting a container + ApplicationId appId = ApplicationId.newInstance(0, 1); + ApplicationAttemptId attemptId = + ApplicationAttemptId.newInstance(appId, 1); + ContainerId cid = ContainerId.newContainerId(attemptId, 1); + + commonLaunchContainer(appId, cid, cm); + + Application app = context.getApplications().get(appId); + assertNotNull(app); + + // store resource mapping of the container + stateStore.storeAssignedResources(cid, "gpu", + Arrays.asList("1", "2", "3")); + + cm.stop(); + context = createContext(conf, stateStore); + cm = createContainerManager(context); + cm.init(conf); + cm.start(); + assertEquals(1, context.getApplications().size()); + app = context.getApplications().get(appId); + assertNotNull(app); + + Container nmContainer = context.getContainers().get(cid); + Assert.assertNotNull(nmContainer); + List assignedResource = + nmContainer.getResourceMappings().getAssignedResources("gpu"); + Assert.assertNotNull(assignedResource); + Assert.assertEquals(3, assignedResource.size()); + Assert.assertTrue( + assignedResource.containsAll(Arrays.asList("1", "2", "3"))); + } + + @Test public void testContainerCleanupOnShutdown() throws Exception { ApplicationId appId = ApplicationId.newInstance(0, 1); ApplicationAttemptId attemptId = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestGpuResourceHandler.java new file mode 100644 index 00000000000..c83000305fb --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestGpuResourceHandler.java @@ -0,0 +1,305 @@ +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import 
org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static org.mockito.Matchers.anyList; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestGpuResourceHandler { + private CGroupsHandler mockCGroupsHandler; + private PrivilegedOperationExecutor mockPrivilegedExecutor; + private GpuResourceHandlerImpl gpuResourceHandler; + private NMStateStoreService mockNMStateStore; + private ConcurrentHashMap runningContainersMap; + + @Before + public void setup() { + mockCGroupsHandler = mock(CGroupsHandler.class); + mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class); + mockNMStateStore = mock(NMStateStoreService.class); + + Context nmctx = mock(Context.class); + when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore); + runningContainersMap = new ConcurrentHashMap<>(); + when(nmctx.getContainers()).thenReturn(runningContainersMap); + GpuResourceAllocator gpuResourceAllocator = new GpuResourceAllocator(nmctx); + + gpuResourceHandler = new GpuResourceHandlerImpl(mockCGroupsHandler, + mockPrivilegedExecutor, gpuResourceAllocator); + } + + @Test + public void testBootStrap() throws ResourceHandlerException { + Configuration conf = new YarnConfiguration(); + + gpuResourceHandler.bootstrap(conf); + verify(mockCGroupsHandler, times(1)).initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + } + + private static ContainerId getContainerId(int id) { + return ContainerId.newContainerId(ApplicationAttemptId + .newInstance(ApplicationId.newInstance(1234L, 1), 1), id); + } + + private static Container mockContainerWithGpuRequest(int id, int numGpuRequest) { + Container c = mock(Container.class); + when(c.getContainerId()).thenReturn(getContainerId(id)); + ContainerLaunchContext clc = mock(ContainerLaunchContext.class); + Map envs = new HashMap<>(); + when(clc.getEnvironment()).thenReturn(envs); + envs.put("REQUESTED_GPU_NUM", String.valueOf(numGpuRequest)); + when(c.getLaunchContext()).thenReturn(clc); + return c; + } + + private void verifyDeniedDevices(ContainerId containerId, List deniedDevices) + throws ResourceHandlerException, PrivilegedOperationException { + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, containerId.toString()); + + if (null != deniedDevices && !deniedDevices.isEmpty()) { + verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation( + new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays + 
+            .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
+                containerId.toString(),
+                GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
+                StringUtils.join(",", deniedDevices))), true);
+    }
+  }
+
+  @Test
+  public void testAllocation() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Start container 1, which asks for 3 GPUs */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+
+    // Only device=4 will be blocked.
+    verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+    /* Start container 2, which asks for 2 GPUs. Expected to fail */
+    boolean failedToAllocate = false;
+    try {
+      gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
+    } catch (ResourceHandlerException e) {
+      failedToAllocate = true;
+    }
+    Assert.assertTrue(failedToAllocate);
+
+    /* Start container 3, which asks for 1 GPU. Should succeed */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
+
+    // devices = 0/1/3 will be blocked
+    verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
+
+    /* Start container 4, which asks for 0 GPUs. Should succeed */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
+
+    // All devices will be blocked
+    verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
+
+    /* Release container-1, expect its GPUs to become available again */
+    gpuResourceHandler.postComplete(getContainerId(1));
+
+    verify(mockCGroupsHandler, times(1)).createCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+    Assert.assertEquals(3,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Release container-3, expect its GPU to become available again */
+    gpuResourceHandler.postComplete(getContainerId(3));
+
+    verify(mockCGroupsHandler, times(1)).createCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @Test
+  public void testAllocationWithoutAllowedGpus() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "");
+    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(0,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Start container 1, which asks for 0 GPUs */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
+    verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+
+    /* Start container 2, which asks for 1 GPU. Expected to fail */
+    boolean failedToAllocate = false;
+    try {
+      gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
+    } catch (ResourceHandlerException e) {
+      failedToAllocate = true;
+    }
+    Assert.assertTrue(failedToAllocate);
+
+    /* Release container 1 */
+    gpuResourceHandler.postComplete(getContainerId(1));
+
+    verify(mockCGroupsHandler, times(1)).createCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+    Assert.assertEquals(0,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @Test
+  public void testAllocationStored() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Start container 1, which asks for 3 GPUs */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+
+    verify(mockNMStateStore).storeAssignedResources(getContainerId(1), "gpu",
+        Arrays.asList("0", "1", "3"));
+
+    // Only device=4 will be blocked.
+    verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+    /* Start container 2, which asks for 0 GPUs. Should succeed */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 0));
+
+    verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
+
+    // storeAssignedResources will not be invoked.
+    verify(mockNMStateStore, never()).storeAssignedResources(
+        eq(getContainerId(2)), eq("gpu"), anyList());
+  }
+
+  @Test
+  public void testRecoverResourceAllocation() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    Container nmContainer = mock(Container.class);
+    ResourceMappings rmap = new ResourceMappings();
+    ResourceMappings.AssignedResources ar =
+        new ResourceMappings.AssignedResources();
+    ar.updateAssignedResources(Arrays.asList("1", "3"));
+    rmap.addAssignedResources("gpu", ar);
+    when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+    runningContainersMap.put(getContainerId(1), nmContainer);
+
+    // TEST CASE
+    // Reacquiring the container restores the state of the GPU resource
+    // allocator.
+    gpuResourceHandler.reacquireContainer(getContainerId(1));
+
+    Map<Integer, ContainerId> deviceAllocationMapping =
+        gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+    Assert.assertEquals(2, deviceAllocationMapping.size());
+    Assert.assertTrue(
+        deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+    Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+
+    // TEST CASE
+    // Try to reacquire a container but a requested device is not in the
+    // allowed list.
+    nmContainer = mock(Container.class);
+    rmap = new ResourceMappings();
+    ar = new ResourceMappings.AssignedResources();
+    // id=5 is not in allowed list.
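+    // (The allowed list configured above is 0,1,3,4.)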
+ ar.updateAssignedResources(Arrays.asList("4", "5")); + rmap.addAssignedResources("gpu", ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + boolean caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is not in allowed list", + caughtException); + + // Make sure internal state not changed. + deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + + // TEST CASE + // Try to reacquire a container but requested device is already assigned. + nmContainer = mock(Container.class); + rmap = new ResourceMappings(); + ar = new ResourceMappings.AssignedResources(); + // id=3 is already assigned + ar.updateAssignedResources(Arrays.asList("4", "3")); + rmap.addAssignedResources("gpu", ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is not in allowed list", + caughtException); + + // Make sure internal state not changed. + deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java index 15c0e84eaae..76774da23ff 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java @@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; public class NMMemoryStateStoreService extends NMStateStoreService { private Map apps; @@ -118,6 +119,7 @@ public synchronized void removeApplication(ApplicationId appId) rcsCopy.setRemainingRetryAttempts(rcs.getRemainingRetryAttempts()); rcsCopy.setWorkDir(rcs.getWorkDir()); rcsCopy.setLogDir(rcs.getLogDir()); + rcsCopy.resMappings = rcs.getResourceMappings(); result.add(rcsCopy); } return result; @@ -417,6 +419,16 @@ public synchronized void 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
index 15c0e84eaae..76774da23ff 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
@@ -41,6 +41,7 @@
 import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 
 public class NMMemoryStateStoreService extends NMStateStoreService {
   private Map<ApplicationId, ContainerManagerApplicationProto> apps;
@@ -118,6 +119,7 @@ public synchronized void removeApplication(ApplicationId appId)
       rcsCopy.setRemainingRetryAttempts(rcs.getRemainingRetryAttempts());
       rcsCopy.setWorkDir(rcs.getWorkDir());
       rcsCopy.setLogDir(rcs.getLogDir());
+      rcsCopy.resMappings = rcs.getResourceMappings();
       result.add(rcsCopy);
     }
     return result;
@@ -417,6 +419,16 @@ public synchronized void removeLogDeleter(ApplicationId appId)
     logDeleterState.remove(appId);
   }
 
+  @Override
+  public synchronized void storeAssignedResources(ContainerId containerId,
+      String resourceType, List<String> assignedResources) throws IOException {
+    ResourceMappings.AssignedResources ar =
+        new ResourceMappings.AssignedResources();
+    ar.updateAssignedResources(assignedResources);
+    containerStates.get(containerId).getResourceMappings()
+        .addAssignedResources(resourceType, ar);
+  }
+
   private static class TrackerState {
     Map<Path, LocalResourceProto> inProgressMap = new HashMap<>();
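Note that storeAssignedResources() above replaces whatever was previously stored under the same resource type, since ResourceMappings keys assignments by type; the leveldb test below depends on exactly this last-write-wins behavior. The stand-in below illustrates the semantic; it is illustrative only and is not the ResourceMappings class in the patch.

import java.util.*;

// Illustrative stand-in for ResourceMappings: addAssignedResources() behaves
// like a map put, so storing "gpu" twice keeps only the latest list, which is
// what testStateStoreForResourceMapping asserts after a restart.
class ResourceMappingsSketch {
  private final Map<String, List<String>> byType = new HashMap<>();

  void addAssignedResources(String resourceType, List<String> assigned) {
    byType.put(resourceType, new ArrayList<>(assigned)); // overwrite per type
  }

  List<String> getAssignedResources(String resourceType) {
    return byType.getOrDefault(resourceType, Collections.<String>emptyList());
  }

  public static void main(String[] args) {
    ResourceMappingsSketch m = new ResourceMappingsSketch();
    m.addAssignedResources("gpu", Arrays.asList("1", "2", "3"));
    m.addAssignedResources("gpu", Arrays.asList("1", "2", "4")); // overwrites
    m.addAssignedResources("fpga", Arrays.asList("3", "4", "5", "6"));
    System.out.println(m.getAssignedResources("gpu"));  // [1, 2, 4]
    System.out.println(m.getAssignedResources("fpga")); // [3, 4, 5, 6]
  }
}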
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index 69094741faa..61e12f2230a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -33,10 +33,11 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
@@ -948,18 +950,8 @@ protected DB openDatabase(Configuration conf) {
     store.close();
   }
 
-  @Test
-  public void testUnexpectedKeyDoesntThrowException() throws IOException {
-    // test empty when no state
-    List<RecoveredContainerState> recoveredContainers = stateStore
-        .loadContainersState();
-    assertTrue(recoveredContainers.isEmpty());
-
+  private StartContainerRequest storeMockContainer(ContainerId containerId)
+      throws IOException {
     // create a container request
-    ApplicationId appId = ApplicationId.newInstance(1234, 3);
-    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId,
-        4);
-    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
     LocalResource lrsrc = LocalResource.newInstance(
         URL.newInstance("hdfs", "somehost", 12345, "/some/path/to/rsrc"),
         LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, 123L,
@@ -993,8 +985,24 @@ public void testUnexpectedKeyDoesntThrowException() throws IOException {
         "tokenservice");
     StartContainerRequest containerReq = StartContainerRequest.newInstance(clc,
         containerToken);
     stateStore.storeContainer(containerId, 0, containerReq);
+    return containerReq;
+  }
+
+  @Test
+  public void testUnexpectedKeyDoesntThrowException() throws IOException {
+    // test empty when no state
+    List<RecoveredContainerState> recoveredContainers = stateStore
+        .loadContainersState();
+    assertTrue(recoveredContainers.isEmpty());
+
+    ApplicationId appId = ApplicationId.newInstance(1234, 3);
+    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId,
+        4);
+    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
+
+    StartContainerRequest startContainerRequest =
+        storeMockContainer(containerId);
 
     // add a invalid key
     byte[] invalidKey = ("ContainerManager/containers/"
@@ -1007,7 +1014,7 @@ public void testUnexpectedKeyDoesntThrowException() throws IOException {
     assertEquals(RecoveredContainerStatus.REQUESTED, rcs.getStatus());
     assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
     assertEquals(false, rcs.getKilled());
-    assertEquals(containerReq, rcs.getStartRequest());
+    assertEquals(startContainerRequest, rcs.getStartRequest());
     assertTrue(rcs.getDiagnostics().isEmpty());
     assertEquals(RecoveredContainerType.KILL, rcs.getRecoveryType());
     // assert unknown keys are cleaned up finally
@@ -1016,6 +1023,45 @@ public void testUnexpectedKeyDoesntThrowException() throws IOException {
     assertNull(stateStore.getDB().get(invalidKey));
   }
 
+  @Test
+  public void testStateStoreForResourceMapping() throws IOException {
+    // test empty when no state
+    List<RecoveredContainerState> recoveredContainers = stateStore
+        .loadContainersState();
+    assertTrue(recoveredContainers.isEmpty());
+
+    ApplicationId appId = ApplicationId.newInstance(1234, 3);
+    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId,
+        4);
+    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
+    storeMockContainer(containerId);
+
+    // Store resource mappings
+    stateStore.storeAssignedResources(containerId, "gpu",
+        Arrays.asList("1", "2", "3"));
+    // This will overwrite the "gpu" entry above
+    stateStore.storeAssignedResources(containerId, "gpu",
+        Arrays.asList("1", "2", "4"));
+    stateStore.storeAssignedResources(containerId, "fpga",
+        Arrays.asList("3", "4", "5", "6"));
+
+    // restart the state store and verify the mappings are recovered
+    restartStateStore();
+    recoveredContainers = stateStore.loadContainersState();
+    assertEquals(1, recoveredContainers.size());
+    RecoveredContainerState rcs = recoveredContainers.get(0);
+    List<String> res = rcs.getResourceMappings().getAssignedResources("gpu");
+    Assert.assertNotNull(res);
+    Assert.assertEquals(3, res.size());
+    Assert.assertTrue(res.containsAll(Arrays.asList("1", "2", "4")));
+
+    res = rcs.getResourceMappings().getAssignedResources("fpga");
+    Assert.assertNotNull(res);
+    Assert.assertEquals(4, res.size());
+    Assert.assertTrue(res.containsAll(Arrays.asList("3", "4", "5", "6")));
+  }
+
   private static class NMTokenSecretManagerForTest extends
       BaseNMTokenSecretManager {
     public MasterKey generateKey() {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
index 022baeac492..cc94288595e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
@@ -37,6 +37,7 @@
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceSet;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 
@@ -235,4 +236,9 @@ public void sendKillEvent(int exitStatus, String description) {
   public boolean isRecovering() {
     return false;
   }
+
+  @Override
+  public ResourceMappings getResourceMappings() {
+    return null;
+  }
 }
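For orientation, the leveldb test above implies one stored record per container and resource type under the container's key prefix. The sketch below simulates that key space with a TreeMap; the "ContainerManager/containers/" prefix matches the invalid-key test, but the "/assignedResources/<type>" suffix and the exact container-ID rendering are assumptions for illustration, not details taken from this patch.

import java.util.*;

// Plain-Java stand-in for the leveldb key space: one key per container and
// resource type, so a repeated write to the same type overwrites in place,
// and prefix iteration recovers every type for a container.
public class KeyLayoutSketch {
  public static void main(String[] args) {
    SortedMap<String, List<String>> db = new TreeMap<>();
    String prefix = "ContainerManager/containers/container_1234_0003_04_000005";

    db.put(prefix + "/assignedResources/gpu", Arrays.asList("1", "2", "3"));
    // Same key, so this overwrites, matching the store-twice case in the test.
    db.put(prefix + "/assignedResources/gpu", Arrays.asList("1", "2", "4"));
    db.put(prefix + "/assignedResources/fpga", Arrays.asList("3", "4", "5", "6"));

    // Recovery scans every key under the container's prefix.
    db.subMap(prefix + "/", prefix + "0").forEach(
        (k, v) -> System.out.println(k.substring(prefix.length()) + " -> " + v));
  }
}

Prefix iteration over a sorted key space is what lets loadContainersState() hand back a full ResourceMappings for each recovered container, which reacquireContainer() then replays into the GPU allocator, tying the store tests here to the recovery tests in TestGpuResourceHandler.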