diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index be63233dfc8..9af8ee005ac 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1307,6 +1307,45 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT = NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit"; + /** + * Prefix for gpu configurations. Work in progress: This configuration + * parameter may be changed/removed in the future. + */ + @Private + public static final String NM_GPU_RESOURCE_PREFIX = NM_PREFIX + + "resource.gpu."; + /** + * This setting controls if resource handling for GPU operations is enabled. + */ + @Private + public static final String NM_GPU_RESOURCE_ENABLED = + NM_GPU_RESOURCE_PREFIX + "enabled"; + + @Private + public static final String NM_GPU_ALLOWED_DEVICES = + NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices"; + @Private + public static final String AUTOMATICALLY_DISCOVER_GPU_DEVICES = + ""; + + /** + * This setting controls where to find GPU binaries + * (such as nvidia-smi). + */ + @Private + public static final String NM_GPU_PATH_TO_EXEC = + NM_GPU_RESOURCE_PREFIX + "path-to-executables"; + + @Private + public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = ""; + + /** + * GPU as a resource is disabled by default. 
+ **/ + @Private + public static final boolean DEFAULT_NM_GPU_RESOURCE_ENABLED = false; + + /** NM Webapp address.**/ public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address"; public static final int DEFAULT_NM_WEBAPP_PORT = 8042; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index afde222c54b..2f3553bbe20 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -3297,4 +3297,20 @@ false + + + Enable GPU on this node manager or not + + yarn.nodemanager.resource.gpu.enabled + false + + + + + Specify GPU devices which can be managed by YARN + + yarn.nodemanager.resource.gpu.allowed-gpu-devices + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 3e919c5cdad..e4413a99f28 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -26,6 +26,8 @@ import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceDiscoverPlugin; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -65,6 +67,7 @@ import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; @@ -72,6 +75,7 @@ import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.apache.hadoop.yarn.server.nodemanager.scheduler.allocators.LocalResourceAllocators; import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; @@ -325,6 +329,22 @@ protected void serviceInit(Configuration conf) throws Exception { this.aclsManager = new ApplicationACLsManager(conf); + boolean isDistSchedulingEnabled = + conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED, + YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED); + + this.context = createNMContext(containerTokenSecretManager, + nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf); + + // Initialize local allocators + // This has to be done before initialize ContainerExecutor + if (conf.getBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, + YarnConfiguration.DEFAULT_NM_GPU_RESOURCE_ENABLED)) { + LocalResourceAllocators.setGpuResourceAllocator( + new GpuResourceAllocator(context)); + 
GpuResourceDiscoverPlugin.getInstance().setConf(conf); + } + ContainerExecutor exec = ReflectionUtils.newInstance( conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class, ContainerExecutor.class), conf); @@ -345,13 +365,6 @@ protected void serviceInit(Configuration conf) throws Exception { getNodeHealthScriptRunner(conf), dirsHandler); addService(nodeHealthChecker); - boolean isDistSchedulingEnabled = - conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED, - YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED); - - this.context = createNMContext(containerTokenSecretManager, - nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf); - ((NMContext)context).setContainerExecutor(exec); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java index 8402a16339d..db0b2251578 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java @@ -51,6 +51,7 @@ TC_READ_STATS("--tc-read-stats"), ADD_PID_TO_CGROUP(""), //no CLI switch supported yet. RUN_DOCKER_CMD("--run-docker"), + GPU("--module-gpu"), LIST_AS_USER(""); //no CLI switch supported yet. 
private final String option; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java index 3c61cd4b5be..f70e7baf0c9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java @@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -28,6 +29,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.scheduler.allocators.LocalResourceAllocators; import org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler; import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler; @@ -95,6 +97,19 @@ public static CGroupsHandler getCGroupsHandler() { return cGroupsHandler; } + private static GpuResourceHandlerImpl getGpuResourceHandler( + Configuration conf) throws ResourceHandlerException { + boolean 
cgroupsGpuEnabled = conf.getBoolean( + YarnConfiguration.NM_GPU_RESOURCE_ENABLED, + YarnConfiguration.DEFAULT_NM_GPU_RESOURCE_ENABLED); + if (cgroupsGpuEnabled) { + return new GpuResourceHandlerImpl(getInitializedCGroupsHandler(conf), + PrivilegedOperationExecutor.getInstance(conf), + LocalResourceAllocators.getGpuResourceAllocator()); + } + return null; + } + private static CGroupsCpuResourceHandlerImpl getCGroupsCpuResourceHandler( Configuration conf) throws ResourceHandlerException { boolean cgroupsCpuEnabled = @@ -213,6 +228,7 @@ private static void initializeConfiguredResourceHandlerChain( addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf)); addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf)); addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf)); + addHandlerIfNotNull(handlerList, getGpuResourceHandler(conf)); resourceHandlerChain = new ResourceHandlerChain(handlerList); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuDeviceInformation.java new file mode 100644 index 00000000000..3e580718a83 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuDeviceInformation.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import java.util.List; + +/** + * All GPU Device Information in the system. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class GpuDeviceInformation { + List gpus; + String driverVersion = "N/A"; + + // More fields like topology information could be added when needed. + // ... 
+ + public List getGpus() { + return gpus; + } + + public void setGpus(List gpus) { + this.gpus = gpus; + } + + public String getDriverVersion() { + return driverVersion; + } + + public void setDriverVersion(String driverVersion) { + this.driverVersion = driverVersion; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("=== Gpus in the system ===\n").append("\tVersion:").append( + getDriverVersion()).append("\n"); + for (PerGpuDeviceInformation gpu : gpus) { + sb.append("\t").append(gpu.toString()).append("\n"); + } + return sb.toString(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuDeviceInformationParser.java new file mode 100644 index 00000000000..bd83ac44e43 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuDeviceInformationParser.java @@ -0,0 +1,243 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +/** + * Parse XML and get GPU device information + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class GpuDeviceInformationParser { + private static final Logger LOG = + LoggerFactory.getLogger(GpuDeviceInformationParser.class); + + // 0(K) + private static final float ABSOLUTE_ZERO = -459.67f; + + private static String getFirstSubElementValueByTag(Element parentElement, + String tag) { + NodeList list = parentElement.getChildNodes(); + for (int i = 0; i < list.getLength(); i++) { + Node n = list.item(i); + if (n instanceof Element) { + Element e = (Element) n; + if (e.getTagName().equals(tag)) { + return ((Text) e.getFirstChild()).getData().trim(); + } + } + } + + return null; + } + + private static int convertStringToNumber(String str) { + if (null != str) 
{ + return Integer.parseInt(str); + } + return -1; + } + + private static Element getFirstElementByTag(Element parentElement, String tag) { + if (parentElement == null) { + return null; + } + NodeList list = parentElement.getElementsByTagName(tag); + if (list == null || list.getLength() == 0) { + return null; + } + + return (Element) list.item(0); + } + + private static float getNumberValueFromString(String str) { + // String format like "16384 MiB", "0 %" + if (null != str) { + try { + return Float.parseFloat(str.split(" ")[0]); + } catch (NumberFormatException e) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "Exception when trying to parse GPU memory Usage, str=" + str, e); + } + return -1; + } + } + return -1; + } + + private static String sanitizeXmlInput(String xmlStr) throws IOException { + // Some platform has output like + // + // This could cause parse failure, so plan here is completely remove + // gpuDeviceInformations = new ArrayList<>(); + gpuDeviceInformation.setGpus(gpuDeviceInformations); + + // Parse XML and get each GPU device info. 
+ NodeList nodeList = root.getElementsByTagName("gpu"); + for (int i = 0; i < nodeList.getLength(); i++) { + Element gpu = (Element) nodeList.item(i); + PerGpuDeviceInformation perGpuDeviceInformation = + new PerGpuDeviceInformation(); + + // Product Name + String productName = getFirstSubElementValueByTag(gpu, "product_name"); + if (null != productName) { + perGpuDeviceInformation.setProductName(productName); + } + + // UUID + String uuid = getFirstSubElementValueByTag(gpu, "uuid"); + if (null != uuid) { + perGpuDeviceInformation.setUuid(uuid); + } + + // Minor Number + perGpuDeviceInformation.setMinorNumber(convertStringToNumber( + getFirstSubElementValueByTag(gpu, "minor_number"))); + + // Handle Gpu Memory Usage + Element memUsageElement = getFirstElementByTag(gpu, "bar1_memory_usage"); + if (null != memUsageElement) { + // Used memory comes from the "used" sub-element, not "total". + int usedMem = (int) getNumberValueFromString( + getFirstSubElementValueByTag(memUsageElement, "used")); + int freeMem = (int) getNumberValueFromString( + getFirstSubElementValueByTag(memUsageElement, "free")); + if (usedMem >= 0) { + perGpuDeviceInformation.setUsedMemoryMiB(usedMem); + } + if (freeMem >= 0) { + perGpuDeviceInformation.setAvailMemoryMiB(freeMem); + } + } + + // Handle GPU utilization + Element utilization = getFirstElementByTag(gpu, "utilization"); + if (null != utilization) { + float gpuUtilPercentage = getNumberValueFromString( + getFirstSubElementValueByTag(utilization, "gpu_util")); + if (gpuUtilPercentage >= 0) { + perGpuDeviceInformation.setGpuUtilizationPercentage( + gpuUtilPercentage); + } + } + + // Handle Temp + Element temp = getFirstElementByTag(gpu, "temperature"); + if (null != temp) { + float curTemp = getNumberValueFromString( + getFirstSubElementValueByTag(temp, "gpu_temp")); + + // It is not possible for a GPU temperature to be below -459.67 (F) or + // -273.15 (C), unless it is used in another universe. 
+ if (curTemp > ABSOLUTE_ZERO) { + perGpuDeviceInformation.setCurrentGpuTemp(curTemp); + } + + float slowTemp = getNumberValueFromString( + getFirstSubElementValueByTag(temp, "gpu_temp_slow_threshold")); + if (slowTemp > ABSOLUTE_ZERO) { + perGpuDeviceInformation.setSlowThresholdGpuTemp(slowTemp); + } + + float maxGpuTemp = getNumberValueFromString( + getFirstSubElementValueByTag(temp, "gpu_temp_max_threshold")); + if (maxGpuTemp > ABSOLUTE_ZERO) { + perGpuDeviceInformation.setMaxGpuTemp(maxGpuTemp); + } + } + + // Check if GPU information is valid or not + if (perGpuDeviceInformation.getMinorNumber() < 0) { + throw new YarnException( + "Cannot parse minor number from GPU information, " + + "please double check. Xml=" + gpu.toString()); + } + + gpuDeviceInformations.add(perGpuDeviceInformation); + } + + return gpuDeviceInformation; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java new file mode 100644 index 00000000000..e3dd0b395c6 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java @@ -0,0 +1,210 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +/** + * Allocate GPU resources according to requirements + */ +public class GpuResourceAllocator { + final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class); + private final static String RESOURCE_TYPE = "gpu"; + + private Set allowedGpuDevices = new TreeSet<>(); + private Map usedDevices = new TreeMap<>(); + private Context nmContext; + + public GpuResourceAllocator(Context ctx) { + this.nmContext = ctx; + } + + /** + * Contains allowed and denied devices with minor number. 
+ * Denied devices will be useful for cgroups devices module to do blacklisting + */ + static class GpuAllocation { + private Set allowed = Collections.emptySet(); + private Set denied = Collections.emptySet(); + + GpuAllocation(Set allowed, Set denied) { + if (allowed != null) { + this.allowed = ImmutableSet.copyOf(allowed); + } + if (denied != null) { + this.denied = ImmutableSet.copyOf(denied); + } + } + + public Set getAllowedGPUs() { + return allowed; + } + + public Set getDeniedGPUs() { + return denied; + } + } + + /** + * Add GPU to allowed list + * @param minorNumber minor number of the GPU device. + */ + public synchronized void addGpu(int minorNumber) { + allowedGpuDevices.add(minorNumber); + } + + private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices, + ContainerId containerId) { + return "Failed to find enough GPUs, requestor=" + containerId + + ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus=" + + getAvailableGpus(); + } + + @VisibleForTesting + public synchronized int getAvailableGpus() { + return allowedGpuDevices.size() - usedDevices.size(); + } + + public synchronized void recoverAssignedGpus(ContainerId containerId) + throws ResourceHandlerException { + Container c = nmContext.getContainers().get(containerId); + if (null == c) { + throw new ResourceHandlerException( + "This shouldn't happen, cannot find container with id=" + + containerId); + } + + for (Serializable deviceId : c.getResourceMappings().getAssignedResources( + RESOURCE_TYPE)){ + if (!(deviceId instanceof String)) { + throw new ResourceHandlerException( + "Trying to recover device id, however it" + + " is not String, this shouldn't happen"); + } + + int devId = Integer.parseInt((String)deviceId); + + // Make sure it is in allowed GPU device. 
+ if (!allowedGpuDevices.contains(devId)) { + throw new ResourceHandlerException("Try to recover device id = " + devId + + " however it is not in allowed device list:" + StringUtils + .join(",", allowedGpuDevices)); + } + + // Make sure it is not occupied by anybody else + if (usedDevices.containsKey(devId)) { + throw new ResourceHandlerException("Try to recover device id = " + devId + + " however it is already assigned to container=" + usedDevices + .get(devId) + ", please double check what happened."); + } + + usedDevices.put(devId, containerId); + } + } + + /** + * Assign GPUs to the requestor. + * @param numRequestedGpuDevices How many GPUs to request + * @param containerId who is requesting the resources + * @return GpuAllocation holding the minor numbers of the assigned GPUs + * and of the denied GPUs + * @throws ResourceHandlerException When there are not enough available + * GPUs or the assignment cannot be recorded in the state store + */ + public synchronized GpuAllocation assignGpus(int numRequestedGpuDevices, + ContainerId containerId) throws ResourceHandlerException { + // Assign Gpus to container if requested some. 
+ if (numRequestedGpuDevices > 0) { + if (numRequestedGpuDevices > getAvailableGpus()) { + throw new ResourceHandlerException( + getResourceHandlerExceptionMessage(numRequestedGpuDevices, + containerId)); + } + + Set assignedGpus = new HashSet<>(); + + for (int deviceNum : allowedGpuDevices) { + if (!usedDevices.containsKey(deviceNum)) { + usedDevices.put(deviceNum, containerId); + assignedGpus.add(deviceNum); + if (assignedGpus.size() == numRequestedGpuDevices) { + break; + } + } + } + + // Record in state store if we allocated anything + if (!assignedGpus.isEmpty()) { + List allocatedDevices = new ArrayList<>(); + for (int gpu : assignedGpus) { + allocatedDevices.add(String.valueOf(gpu)); + } + try { + nmContext.getNMStateStore().storeAssignedResources(containerId, + RESOURCE_TYPE, allocatedDevices); + } catch (IOException e) { + throw new ResourceHandlerException(e); + } + } + + return new GpuAllocation(assignedGpus, + Sets.difference(allowedGpuDevices, assignedGpus)); + } + return new GpuAllocation(null, allowedGpuDevices); + } + + /** + * Clean up all Gpus assigned to containerId + * @param containerId containerId + */ + public synchronized void cleanupAssignGpus(ContainerId containerId) { + Iterator> iter = + usedDevices.entrySet().iterator(); + while (iter.hasNext()) { + if (iter.next().getValue().equals(containerId)) { + iter.remove(); + } + } + } + + @VisibleForTesting + public synchronized Map getDeviceAllocationMapping() { + return new HashMap<>(usedDevices); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceDiscoverPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceDiscoverPlugin.java new file mode 100644 index 00000000000..9df2042de66 --- /dev/null +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceDiscoverPlugin.java @@ -0,0 +1,196 @@ +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class GpuResourceDiscoverPlugin { + public static final Logger LOG = LoggerFactory.getLogger( + GpuResourceDiscoverPlugin.class); + @VisibleForTesting + protected static final String BINARY_NAME = "nvidia-smi"; + // command should not run more than 10 sec. 
+ private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; + private static final int MAX_REPEATED_ERROR_ALLOWED = 10; + private static GpuResourceDiscoverPlugin instance; + + static { + instance = new GpuResourceDiscoverPlugin(); + } + + private Configuration conf = null; + private boolean featureEnabled = false; + private String pathOfGpuBinary = BINARY_NAME; + private Map environment = new HashMap<>(); + + private int numOfErrorExecutionSinceLastSucceed = 0; + + private void validateConfOrThrowException() throws YarnException { + if (conf == null) { + throw new YarnException("Please initialize (call setConf) before use " + + GpuResourceDiscoverPlugin.class.getSimpleName()); + } + + if (!featureEnabled) { + throw new YarnException("Feature doesn't enabled, please check [" + + YarnConfiguration.NM_GPU_RESOURCE_ENABLED + "] setting."); + } + } + + /** + * Get GPU device information from system. + * This need to be called after setConf. + * + * Please note that this only works on *NIX platform, so external caller + * need to make sure this. 
+ * + * @return GpuDeviceInformation + * @throws YarnException when any error happens, or when the detection + * binary has already failed too many times in a row + */ + public synchronized GpuDeviceInformation getGpuDeviceInformation() + throws YarnException { + validateConfOrThrowException(); + + if (numOfErrorExecutionSinceLastSucceed >= MAX_REPEATED_ERROR_ALLOWED) { + // Actually skip the execution as the message promises: stop invoking + // the binary after too many consecutive failures. The counter is + // reset to 0 by setConf. + String skipMsg = + "Failed to execute GPU device information detection script for " + + MAX_REPEATED_ERROR_ALLOWED + " times, skip following executions."; + LOG.error(skipMsg); + throw new YarnException(skipMsg); + } + + String output; + try { + output = Shell.execCommand(environment, + new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); + GpuDeviceInformation info = GpuDeviceInformationParser.fromXmlString( + output); + numOfErrorExecutionSinceLastSucceed = 0; + return info; + } catch (IOException e) { + numOfErrorExecutionSinceLastSucceed++; + String msg = "Failed to execute " + pathOfGpuBinary; + LOG.warn(msg, e); + throw new YarnException(e); + } catch (YarnException e) { + numOfErrorExecutionSinceLastSucceed++; + String msg = "Failed to parse xml output"; + LOG.warn(msg, e); + throw e; + } + } + + /** + * Get list of minor device numbers of Gpu devices usable by YARN. + * + * @return List of minor device numbers of Gpu devices. + * @throws YarnException when any issue happens + */ + public synchronized List getMinorNumbersOfGpusUsableByYarn() + throws YarnException { + validateConfOrThrowException(); + + String allowedDevicesStr = conf.get( + YarnConfiguration.NM_GPU_ALLOWED_DEVICES, + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); + + List minorNumbers = new ArrayList<>(); + + if (allowedDevicesStr.equals( + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) { + // Get gpu device information from system. 
+ GpuDeviceInformation info = getGpuDeviceInformation(); + if (info.getGpus() != null) { + for (PerGpuDeviceInformation gpu : info.getGpus()) { + minorNumbers.add(gpu.getMinorNumber()); + } + } + } else{ + for (String s : allowedDevicesStr.split(",")) { + if (s.trim().length() > 0) { + minorNumbers.add(Integer.valueOf(s.trim())); + } + } + LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr); + } + + return minorNumbers; + } + + private void setDefaultEnvironment() { + // When this is not set, set environment to locate binary. + // By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when + // launched by nvidia-docker. + environment.put("PATH", "$PATH:/usr/bin/:/bin/:/usr/local/nvidia/bin"); + } + + public synchronized void setConf(Configuration conf) { + this.conf = conf; + numOfErrorExecutionSinceLastSucceed = 0; + featureEnabled = conf.getBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, + YarnConfiguration.DEFAULT_NM_GPU_RESOURCE_ENABLED); + + if (featureEnabled) { + String dir = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, + YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC); + if (dir.isEmpty()) { + setDefaultEnvironment(); + } else { + // Validate file existence + File binaryPath = new File(dir, BINARY_NAME); + if (!binaryPath.exists()) { + // When binary not exist, use default setting. 
+ LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath() + + ", please double check [" + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + "] setting."); + setDefaultEnvironment(); + } else { + // Validated + pathOfGpuBinary = binaryPath.getAbsolutePath(); + } + } + + // Try to discover GPU information once and print + try { + LOG.info("Trying to discover GPU information ..."); + GpuDeviceInformation info = getGpuDeviceInformation(); + LOG.info(info.toString()); + } catch (YarnException e) { + LOG.warn( + "Failed to discover GPU information from system, exception message:" + + e.getMessage() + ", continue ..."); + } + } + } + + public synchronized Configuration getConf() { + return conf; + } + + @VisibleForTesting + protected Map getEnvironmentToRunCommand() { + return environment; + } + + @VisibleForTesting + protected String getPathOfGpuBinary() { + return pathOfGpuBinary; + } + + public static GpuResourceDiscoverPlugin getInstance() { + return instance; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java new file mode 100644 index 00000000000..a6316ed7cac --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -0,0 +1,165 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class GpuResourceHandlerImpl implements ResourceHandler { 
+ final static Log LOG = LogFactory + .getLog(GpuResourceHandlerImpl.class); + + private final String REQUEST_GPU_NUM_ENV_KEY = "REQUESTED_GPU_NUM"; + + // This will be used by container-executor to add necessary clis + public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus"; + public static final String CONTAINER_ID_CLI_OPTION = "--container_id"; + + private GpuResourceAllocator gpuAllocator; + private CGroupsHandler cGroupsHandler; + private PrivilegedOperationExecutor privilegedOperationExecutor; + + public GpuResourceHandlerImpl(CGroupsHandler cGroupsHandler, + PrivilegedOperationExecutor privilegedOperationExecutor, + GpuResourceAllocator gpuResourceAllocator) { + this.cGroupsHandler = cGroupsHandler; + this.privilegedOperationExecutor = privilegedOperationExecutor; + gpuAllocator = gpuResourceAllocator; + } + + @Override + public List bootstrap(Configuration configuration) + throws ResourceHandlerException { + List minorNumbersOfUsableGpus; + try { + minorNumbersOfUsableGpus = GpuResourceDiscoverPlugin.getInstance() + .getMinorNumbersOfGpusUsableByYarn(); + } catch (YarnException e) { + LOG.error("Exception when trying to get usable GPU device", e); + throw new ResourceHandlerException(e); + } + + for (int minorNumber : minorNumbersOfUsableGpus) { + gpuAllocator.addGpu(minorNumber); + } + + // And initialize cgroups + this.cGroupsHandler.initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + + return null; + } + + private int getRequestedGpu(Container container) { + // TODO, use YARN-3926 after it merged + ContainerLaunchContext clc = container.getLaunchContext(); + Map envs = clc.getEnvironment(); + if (null != envs.get(REQUEST_GPU_NUM_ENV_KEY)) { + return Integer.parseInt(envs.get(REQUEST_GPU_NUM_ENV_KEY)); + } + return 0; + } + + @Override + public synchronized List preStart(Container container) + throws ResourceHandlerException { + String containerIdStr = container.getContainerId().toString(); + + int requestedGpu = 
getRequestedGpu(container); + + // Assign Gpus to container if requested some. + GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus( + requestedGpu, container.getContainerId()); + + // Create device cgroups for the container + cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + try { + // Execute c-e to setup GPU isolation before launch the container + PrivilegedOperation privilegedOperation = new PrivilegedOperation( + PrivilegedOperation.OperationType.GPU, Arrays + .asList(CONTAINER_ID_CLI_OPTION, containerIdStr)); + if (!allocation.getDeniedGPUs().isEmpty()) { + privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION, + StringUtils.join(",", allocation.getDeniedGPUs()))); + } + + privilegedOperationExecutor.executePrivilegedOperation( + privilegedOperation, true); + } catch (PrivilegedOperationException e) { + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + LOG.warn("Could not update cgroup for container", e); + throw new ResourceHandlerException(e); + } + + List ret = new ArrayList<>(); + ret.add(new PrivilegedOperation( + PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, + PrivilegedOperation.CGROUP_ARG_PREFIX + + cGroupsHandler.getPathForCGroupTasks( + CGroupsHandler.CGroupController.DEVICES, containerIdStr))); + + return ret; + } + + @VisibleForTesting + public GpuResourceAllocator getGpuAllocator() { + return gpuAllocator; + } + + @Override + public List reacquireContainer(ContainerId containerId) + throws ResourceHandlerException { + gpuAllocator.recoverAssignedGpus(containerId); + return null; + } + + @Override + public synchronized List postComplete( + ContainerId containerId) throws ResourceHandlerException { + gpuAllocator.cleanupAssignGpus(containerId); + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerId.toString()); + return null; + } + + @Override + public List teardown() throws 
ResourceHandlerException { + return null; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/PerGpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/PerGpuDeviceInformation.java new file mode 100644 index 00000000000..25005fba382 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/PerGpuDeviceInformation.java @@ -0,0 +1,125 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Capture single GPU device information such as memory size, temperature, + * utilization. 
+ */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class PerGpuDeviceInformation { + + String productName = "N/A"; + String uuid = "N/A"; + int minorNumber = -1; + + long usedMemoryMiB = -1; + long availMemoryMiB = -1; + + float gpuUtilization = -1; + + float currentGpuTemp = Float.MIN_VALUE; + float maxGpuTemp = Float.MIN_VALUE; + float slowThresholdGpuTemp = Float.MIN_VALUE; + + public String getUuid() { + return uuid; + } + + public void setUuid(String uuid) { + this.uuid = uuid; + } + + public String getProductName() { + return productName; + } + + public void setProductName(String productName) { + this.productName = productName; + } + + public int getMinorNumber() { + return minorNumber; + } + + public void setMinorNumber(int minorNumber) { + this.minorNumber = minorNumber; + } + + public long getUsedMemoryMiB() { + return usedMemoryMiB; + } + + public void setUsedMemoryMiB(long usedMemoryMiB) { + this.usedMemoryMiB = usedMemoryMiB; + } + + public long getAvailMemoryMiB() { + return availMemoryMiB; + } + + public void setAvailMemoryMiB(long availMemoryMiB) { + this.availMemoryMiB = availMemoryMiB; + } + + public float getGpuUtilization() { + return gpuUtilization; + } + + public void setGpuUtilizationPercentage(float gpuUtilization) { + this.gpuUtilization = gpuUtilization; + } + + public float getCurrentGpuTemp() { + return currentGpuTemp; + } + + public void setCurrentGpuTemp(float currentGpuTemp) { + this.currentGpuTemp = currentGpuTemp; + } + + public float getMaxGpuTemp() { + return maxGpuTemp; + } + + public void setMaxGpuTemp(float maxGpuTemp) { + this.maxGpuTemp = maxGpuTemp; + } + + public float getSlowThresholdGpuTemp() { + return slowThresholdGpuTemp; + } + + public void setSlowThresholdGpuTemp(float slowThresholdGpuTemp) { + this.slowThresholdGpuTemp = slowThresholdGpuTemp; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("ProductName=").append(productName).append(", 
MinorNumber=") + .append(minorNumber).append(", TotalMemory=").append( + availMemoryMiB + usedMemoryMiB).append("MiB"); + return sb.toString(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/scheduler/allocators/LocalResourceAllocators.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/scheduler/allocators/LocalResourceAllocators.java new file mode 100644 index 00000000000..d72acae5fac --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/scheduler/allocators/LocalResourceAllocators.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.scheduler.allocators; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator; + +/** + * Manages local resource allocators such as GPU resource allocator. 
+ */ +public class LocalResourceAllocators { + private static GpuResourceAllocator gpuResourceAllocator; + + public static GpuResourceAllocator getGpuResourceAllocator() { + return gpuResourceAllocator; + } + + public static void setGpuResourceAllocator(GpuResourceAllocator allocator) { + gpuResourceAllocator = allocator; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuDeviceInformationParser.java new file mode 100644 index 00000000000..7f692314573 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuDeviceInformationParser.java @@ -0,0 +1,20 @@ +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; + +public class TestGpuDeviceInformationParser { + @Test + public void testParse() throws IOException, YarnException { + File f = new File("src/test/resources/nvidia-smi-sample-output.xml"); + String s = FileUtils.readFileToString(f, "UTF-8"); + + GpuDeviceInformation info = GpuDeviceInformationParser.fromXmlString(s); + Assert.assertEquals(2, info.getGpus().size()); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceDiscoverPlugin.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceDiscoverPlugin.java new file mode 100644 index 00000000000..42de7fd8bf4 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceDiscoverPlugin.java @@ -0,0 +1,108 @@ +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; +
/**
 * Tests how {@link GpuResourceDiscoverPlugin} resolves the GPU binary from
 * configuration and how it derives the list of usable GPU minor numbers.
 */
public class TestGpuResourceDiscoverPlugin {
  /** Scratch directory of this test class, recreated before every test. */
  private String getTestParentFolder() {
    File f = new File(
        "target/temp/" + TestGpuResourceDiscoverPlugin.class
            .getName());
    return f.getAbsolutePath();
  }

  /** Create (or truncate) an empty file. */
  private void touchFile(File f) throws IOException {
    FileOutputStream out = new FileOutputStream(f);
    try {
      // Nothing to write; opening the stream is what creates the file.
    } finally {
      out.close();
    }
  }

  @Before
  public void before() throws IOException {
    String folder = getTestParentFolder();
    File f = new File(folder);
    FileUtils.deleteDirectory(f);
    f.mkdirs();
  }

  @Test
  public void testLinuxGpuResourceDiscoverPluginConfig() throws IOException {
    // test case 1, check default setting.
    Configuration conf = new Configuration(false);
    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
    GpuResourceDiscoverPlugin plugin =
        new GpuResourceDiscoverPlugin();
    plugin.setConf(conf);
    Assert.assertEquals(GpuResourceDiscoverPlugin.BINARY_NAME,
        plugin.getPathOfGpuBinary());
    Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
    Assert.assertTrue(
        plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));

    // test case 2, check mandatory set path.
    File fakeBinary = new File(getTestParentFolder(),
        GpuResourceDiscoverPlugin.BINARY_NAME);
    touchFile(fakeBinary);
    conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
    plugin = new GpuResourceDiscoverPlugin();
    plugin.setConf(conf);
    Assert.assertEquals(fakeBinary.getAbsolutePath(),
        plugin.getPathOfGpuBinary());
    Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));

    // test case 3, check mandatory set path, but binary doesn't exist so
    // default path will be used.
    fakeBinary.delete();
    plugin = new GpuResourceDiscoverPlugin();
    plugin.setConf(conf);
    Assert.assertEquals(GpuResourceDiscoverPlugin.BINARY_NAME,
        plugin.getPathOfGpuBinary());
    Assert.assertTrue(
        plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
  }

  @Test
  public void testGpuDiscover() throws YarnException {
    // This test runs the real GPU binary, so it only runs when explicitly
    // requested with -DrunGpuDiscoverUnitTest=true.
    Assume.assumeTrue(Boolean.valueOf(
        System.getProperty("runGpuDiscoverUnitTest")));
    Configuration conf = new Configuration(false);
    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
    GpuResourceDiscoverPlugin plugin =
        new GpuResourceDiscoverPlugin();
    plugin.setConf(conf);
    GpuDeviceInformation info = plugin.getGpuDeviceInformation();

    Assert.assertTrue(info.getGpus().size() > 0);
    Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
        info.getGpus().size());
  }

  @Test
  public void getNumberOfUsableGpusFromConfig() throws YarnException {
    Configuration conf = new Configuration(false);
    conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true);
    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
    GpuResourceDiscoverPlugin plugin =
        new GpuResourceDiscoverPlugin();
    plugin.setConf(conf);

    List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
    Assert.assertEquals(4, minorNumbers.size());

    // Four entries means valid indexes are 0..3; the last configured minor
    // number (4) sits at index 3.
    Assert.assertEquals(0, minorNumbers.get(0).intValue());
    Assert.assertEquals(1, minorNumbers.get(1).intValue());
    Assert.assertEquals(2, minorNumbers.get(2).intValue());
    Assert.assertEquals(4, minorNumbers.get(3).intValue());
  }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java new file mode 100644 index 00000000000..1fa0d9296ac --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -0,0 +1,335 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static org.mockito.Matchers.anyList; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; 
+import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestGpuResourceHandler { + private CGroupsHandler mockCGroupsHandler; + private PrivilegedOperationExecutor mockPrivilegedExecutor; + private GpuResourceHandlerImpl gpuResourceHandler; + private NMStateStoreService mockNMStateStore; + private ConcurrentHashMap runningContainersMap; + + @Before + public void setup() { + mockCGroupsHandler = mock(CGroupsHandler.class); + mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class); + mockNMStateStore = mock(NMStateStoreService.class); + + Context nmctx = mock(Context.class); + when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore); + runningContainersMap = new ConcurrentHashMap<>(); + when(nmctx.getContainers()).thenReturn(runningContainersMap); + GpuResourceAllocator gpuResourceAllocator = new GpuResourceAllocator(nmctx); + + gpuResourceHandler = new GpuResourceHandlerImpl(mockCGroupsHandler, + mockPrivilegedExecutor, gpuResourceAllocator); + } + + @Test + public void testBootStrap() throws ResourceHandlerException { + Configuration conf = new YarnConfiguration(); + conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0"); + + GpuResourceDiscoverPlugin.getInstance().setConf(conf); + + gpuResourceHandler.bootstrap(conf); + verify(mockCGroupsHandler, times(1)).initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + } + + private static ContainerId getContainerId(int id) { + return ContainerId.newContainerId(ApplicationAttemptId + .newInstance(ApplicationId.newInstance(1234L, 1), 1), id); + } + + private static Container mockContainerWithGpuRequest(int id, int numGpuRequest) { + Container c = mock(Container.class); + when(c.getContainerId()).thenReturn(getContainerId(id)); + ContainerLaunchContext clc = mock(ContainerLaunchContext.class); + Map envs = new HashMap<>(); + 
when(clc.getEnvironment()).thenReturn(envs); + envs.put("REQUESTED_GPU_NUM", String.valueOf(numGpuRequest)); + when(c.getLaunchContext()).thenReturn(clc); + return c; + } + + private void verifyDeniedDevices(ContainerId containerId, List deniedDevices) + throws ResourceHandlerException, PrivilegedOperationException { + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, containerId.toString()); + + if (null != deniedDevices && !deniedDevices.isEmpty()) { + verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation( + new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays + .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION, + containerId.toString(), + GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION, + StringUtils.join(",", deniedDevices))), true); + } + } + + @Test + public void testAllocation() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4"); + conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true); + GpuResourceDiscoverPlugin.getInstance().setConf(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, asks 3 containers */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3)); + + // Only device=4 will be blocked. + verifyDeniedDevices(getContainerId(1), Arrays.asList(4)); + + /* Start container 2, asks 2 containers. 
Excepted to fail */ + boolean failedToAllocate = false; + try { + gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2)); + } catch (ResourceHandlerException e) { + failedToAllocate = true; + } + Assert.assertTrue(failedToAllocate); + + /* Start container 3, ask 1 container, succeeded */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1)); + + // devices = 0/1/3 will be blocked + verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3)); + + /* Start container 4, ask 0 container, succeeded */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0)); + + // All devices will be blocked + verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4)); + + /* Release container-1, expect cgroups deleted */ + gpuResourceHandler.postComplete(getContainerId(1)); + + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString()); + Assert.assertEquals(3, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Release container-3, expect cgroups deleted */ + gpuResourceHandler.postComplete(getContainerId(3)); + + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString()); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + } + + @Test + public void testAllocationWithoutAllowedGpus() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, ""); + conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true); + GpuResourceDiscoverPlugin.getInstance().setConf(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(0, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, asks 0 containers */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0)); + verifyDeniedDevices(getContainerId(1), Collections.emptyList()); + + /* Start container 2, asks 1 
containers. Excepted to fail */ + boolean failedToAllocate = false; + try { + gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1)); + } catch (ResourceHandlerException e) { + failedToAllocate = true; + } + Assert.assertTrue(failedToAllocate); + + /* Release container 1, expect cgroups deleted */ + gpuResourceHandler.postComplete(getContainerId(1)); + + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString()); + Assert.assertEquals(0, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + } + + @Test + public void testAllocationStored() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4"); + conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true); + GpuResourceDiscoverPlugin.getInstance().setConf(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, asks 3 containers */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3)); + + verify(mockNMStateStore).storeAssignedResources(getContainerId(1), "gpu", + Arrays.asList("0", "1", "3")); + + // Only device=4 will be blocked. + verifyDeniedDevices(getContainerId(1), Arrays.asList(4)); + + /* Start container 2, ask 0 container, succeeded */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 0)); + + verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4)); + + // Store assigned resource will not be invoked. 
+ verify(mockNMStateStore, never()).storeAssignedResources( + eq(getContainerId(2)), eq("gpu"), anyList()); + } + + /** + * Verifies that reacquireContainer rebuilds the GPU allocator's device + * allocation mapping from a container's stored resource mappings, and that + * reacquire attempts which throw ResourceHandlerException leave that + * mapping unchanged. + */ + @Test + public void testRecoverResourceAllocation() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4"); + conf.setBoolean(YarnConfiguration.NM_GPU_RESOURCE_ENABLED, true); + GpuResourceDiscoverPlugin.getInstance().setConf(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + Container nmContainer = mock(Container.class); + ResourceMappings rmap = new ResourceMappings(); + ResourceMappings.AssignedResources ar = + new ResourceMappings.AssignedResources(); + ar.updateAssignedResources(Arrays.asList("1", "3")); + rmap.addAssignedResources("gpu", ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(1), nmContainer); + + // TEST CASE + // Reacquiring a container restores the state of the GPU resource allocator. + gpuResourceHandler.reacquireContainer(getContainerId(1)); + + Map deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + + // TEST CASE + // Try to reacquire a container but requested device is not in allowed list. + nmContainer = mock(Container.class); + rmap = new ResourceMappings(); + ar = new ResourceMappings.AssignedResources(); + // id=5 is not in allowed list. 
+ ar.updateAssignedResources(Arrays.asList("4", "5")); + rmap.addAssignedResources("gpu", ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + // NOTE(review): the mock above is registered under container id 2, but + // container id 1 is reacquired below; confirm which id is intended. + boolean caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is not in allowed list", + caughtException); + + // Make sure internal state is not changed. + deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + + // TEST CASE + // Try to reacquire a container but requested device is already assigned. + nmContainer = mock(Container.class); + rmap = new ResourceMappings(); + ar = new ResourceMappings.AssignedResources(); + // id=3 is already assigned + ar.updateAssignedResources(Arrays.asList("4", "3")); + rmap.addAssignedResources("gpu", ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + // NOTE(review): same id mismatch as the previous case (registered as + // container 2, reacquired as container 1); confirm which id is intended. + caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is already assigned", + caughtException); + + // Make sure internal state is not changed.
+ deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-output.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-output.xml new file mode 100644 index 00000000000..70e3e25fbb9 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-output.xml @@ -0,0 +1,547 @@ + + + + + + + Wed Sep 6 21:52:51 2017 + 375.66 + 2 + + Tesla P100-PCIE-12GB + Tesla + Disabled + Disabled + Disabled + Disabled + 1920 + + N/A + N/A + + 0320717030197 + GPU-28604e81-21ec-cc48-6759-bf2648b22e16 + 0 + 86.00.3A.00.02 + No + 0x400 + 900-2H400-0110-030 + + H400.0202.00.01 + 1.1 + 4.1 + N/A + + + N/A + N/A + + + None + + + 04 + 00 + 0000 + 15F710DE + 0000:04:00.0 + 11DA10DE + + + 3 + 3 + + + 16x + 16x + + + + N/A + N/A + + 0 + 0 KB/s + 0 KB/s + + N/A + P0 + + Active + Not Active + Not Active + Not Active + Not Active + Not Active + + + 12193 MiB + 0 MiB + 12193 MiB + + + 16384 MiB + 2 MiB + 16382 MiB + + Default + + 0 % + 0 % + 0 % + 0 % + + + 0 + 0 + 0 ms + + + Enabled + Enabled + + + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + + + + 0 + + + + + 0 + + + + No + + + 31 C + 85 C + 82 C + + + P0 + Supported + 24.84 W + 250.00 W + 250.00 W + 250.00 W + 125.00 W + 250.00 W + + + 405 MHz + 405 MHz + 715 MHz + 835 MHz + + + 1189 MHz + 715 MHz + + + 1189 MHz + 715 MHz + + + 1328 MHz + 1328 MHz + 715 MHz + 1328 MHz + + + N/A + N/A + + + + 715 MHz + 1328 MHz + 
1316 MHz + 1303 MHz + 1290 MHz + 1278 MHz + 1265 MHz + 1252 MHz + 1240 MHz + 1227 MHz + 1215 MHz + 1202 MHz + 1189 MHz + 1177 MHz + 1164 MHz + 1151 MHz + 1139 MHz + 1126 MHz + 1113 MHz + 1101 MHz + 1088 MHz + 1075 MHz + 1063 MHz + 1050 MHz + 1037 MHz + 1025 MHz + 1012 MHz + 999 MHz + 987 MHz + 974 MHz + 961 MHz + 949 MHz + 936 MHz + 923 MHz + 911 MHz + 898 MHz + 885 MHz + 873 MHz + 860 MHz + 847 MHz + 835 MHz + 822 MHz + 810 MHz + 797 MHz + 784 MHz + 772 MHz + 759 MHz + 746 MHz + 734 MHz + 721 MHz + 708 MHz + 696 MHz + 683 MHz + 670 MHz + 658 MHz + 645 MHz + 632 MHz + 620 MHz + 607 MHz + 594 MHz + 582 MHz + 569 MHz + 556 MHz + 544 MHz + + + + + + + + + + Tesla P100-PCIE-12GB + Tesla + Disabled + Disabled + Disabled + Disabled + 1920 + + N/A + N/A + + 0320717031755 + GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3 + 1 + 86.00.3A.00.02 + No + 0x8200 + 900-2H400-0110-030 + + H400.0202.00.01 + 1.1 + 4.1 + N/A + + + N/A + N/A + + + None + + + 82 + 00 + 0000 + 15F710DE + 0000:82:00.0 + 11DA10DE + + + 3 + 3 + + + 16x + 16x + + + + N/A + N/A + + 0 + 0 KB/s + 0 KB/s + + N/A + P0 + + Active + Not Active + Not Active + Not Active + Not Active + Not Active + + + 12193 MiB + 0 MiB + 12193 MiB + + + 16384 MiB + 2 MiB + 16382 MiB + + Default + + 0 % + 0 % + 0 % + 0 % + + + 0 + 0 + 0 ms + + + Enabled + Enabled + + + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + 0 + 0 + N/A + 0 + 0 + 0 + 0 + + + + + + 0 + + + + + 0 + + + + No + + + 34 C + 85 C + 82 C + + + P0 + Supported + 25.54 W + 250.00 W + 250.00 W + 250.00 W + 125.00 W + 250.00 W + + + 405 MHz + 405 MHz + 715 MHz + 835 MHz + + + 1189 MHz + 715 MHz + + + 1189 MHz + 715 MHz + + + 1328 MHz + 1328 MHz + 715 MHz + 1328 MHz + + + N/A + N/A + + + + 715 MHz + 1328 MHz + 1316 MHz + 1303 MHz + 1290 MHz + 1278 MHz + 1265 MHz + 1252 MHz + 1240 MHz + 1227 MHz + 1215 MHz + 1202 MHz + 1189 MHz + 1177 MHz + 1164 MHz + 1151 MHz + 1139 MHz + 1126 MHz + 1113 MHz + 1101 MHz + 1088 MHz + 
1075 MHz + 1063 MHz + 1050 MHz + 1037 MHz + 1025 MHz + 1012 MHz + 999 MHz + 987 MHz + 974 MHz + 961 MHz + 949 MHz + 936 MHz + 923 MHz + 911 MHz + 898 MHz + 885 MHz + 873 MHz + 860 MHz + 847 MHz + 835 MHz + 822 MHz + 810 MHz + 797 MHz + 784 MHz + 772 MHz + 759 MHz + 746 MHz + 734 MHz + 721 MHz + 708 MHz + 696 MHz + 683 MHz + 670 MHz + 658 MHz + 645 MHz + 632 MHz + 620 MHz + 607 MHz + 594 MHz + 582 MHz + 569 MHz + 556 MHz + 544 MHz + + + + + + + + + \ No newline at end of file