diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index 0bedf63df14..96e7d41f9a7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -18,12 +18,15 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.ContainerId; -import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; @@ -36,10 +39,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - public class GpuResourceHandlerImpl implements ResourceHandler { final static Log LOG = LogFactory .getLog(GpuResourceHandlerImpl.class); @@ -64,20 +63,14 @@ public GpuResourceHandlerImpl(Context nmContext, @Override public List bootstrap(Configuration configuration) - throws ResourceHandlerException { - List usableGpus; - try { - usableGpus = GpuDiscoverer.getInstance() - .getGpusUsableByYarn(); - if (usableGpus == null || usableGpus.isEmpty()) { - String message = "GPU is enabled on the NodeManager, but couldn't find " - + "any usable GPU devices, please double check configuration."; - LOG.error(message); - throw new ResourceHandlerException(message); - } - } catch (YarnException e) { - LOG.error("Exception when trying to get usable GPU device", e); - throw new ResourceHandlerException(e); + throws ResourceHandlerException{ + List usableGpus = + GpuDiscoverer.getInstance().getGpusUsableByYarn(); + + if (usableGpus.isEmpty()) { + String message = "GPU is enabled on the NodeManager, but couldn't find " + + "any usable GPU devices, please double check configuration."; + LOG.warn(message); } for (GpuDevice gpu : usableGpus) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java index 0bc241dcf88..d077de0dec8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java @@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.slf4j.Logger; @@ -96,7 +97,7 @@ public synchronized void initialize(Context context) ResourcePlugin plugin = null; if (resourceName.equals(GPU_URI)) { - plugin = new GpuResourcePlugin(); + plugin = new GpuResourcePlugin(GpuDiscoverer.getInstance()); } else if (resourceName.equals(FPGA_URI)) { plugin = new FpgaResourcePlugin(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..511449aa8dd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -18,28 +18,26 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; -import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; @InterfaceAudience.Private @InterfaceStability.Unstable @@ -54,10 +52,6 @@ // launched by nvidia-docker. private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( "/usr/bin", "/bin", "/usr/local/nvidia/bin"); - - // command should not run more than 10 sec. - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; - private static final int MAX_REPEATED_ERROR_ALLOWED = 10; private static GpuDiscoverer instance; static { @@ -65,19 +59,11 @@ } private Configuration conf = null; + private NvidiaBinaryHelper nvidiaBinaryHelper; private String pathOfGpuBinary = null; private Map environment = new HashMap<>(); - private GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); - private int numOfErrorExecutionSinceLastSucceed = 0; - GpuDeviceInformation lastDiscoveredGpuInformation = null; - - private void validateConfOrThrowException() throws YarnException { - if (conf == null) { - throw new YarnException("Please initialize (call initialize) before use " - + GpuDiscoverer.class.getSimpleName()); - } - } + private GpuDeviceInformation lastDiscoveredGpuInformation = null; /** * Get GPU device information from system. @@ -90,61 +76,18 @@ private void validateConfOrThrowException() throws YarnException { * @throws YarnException when any error happens */ public synchronized GpuDeviceInformation getGpuDeviceInformation() - throws YarnException { - validateConfOrThrowException(); - - if (null == pathOfGpuBinary) { - throw new YarnException( - "Failed to find GPU discovery executable, please double check " - + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); - } - - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { - String msg = - "Failed to execute GPU device information detection script for " - + MAX_REPEATED_ERROR_ALLOWED - + " times, skip following executions."; - LOG.error(msg); - throw new YarnException(msg); - } - - String output; - try { - output = Shell.execCommand(environment, - new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); - GpuDeviceInformation info = parser.parseXml(output); - numOfErrorExecutionSinceLastSucceed = 0; - lastDiscoveredGpuInformation = info; - return info; - } catch (IOException e) { - numOfErrorExecutionSinceLastSucceed++; - String msg = - "Failed to execute " + pathOfGpuBinary + " exception message:" + e - .getMessage() + ", continue ..."; - if (LOG.isDebugEnabled()) { - LOG.debug(msg); - } - throw new YarnException(e); - } catch (YarnException e) { - numOfErrorExecutionSinceLastSucceed++; - String msg = "Failed to parse xml output" + e.getMessage(); - if (LOG.isDebugEnabled()) { - LOG.warn(msg, e); - } - throw e; - } + throws YarnException, IOException { + lastDiscoveredGpuInformation = + nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary); + return lastDiscoveredGpuInformation; } /** * Get list of GPU devices usable by YARN. * * @return List of GPU devices - * @throws YarnException when any issue happens */ - public synchronized List getGpusUsableByYarn() - throws YarnException { - validateConfOrThrowException(); - + public synchronized List getGpusUsableByYarn() { String allowedDevicesStr = conf.get( YarnConfiguration.NM_GPU_ALLOWED_DEVICES, YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); @@ -162,8 +105,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() + " details, as an alternative, admin can specify " + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " manually to enable GPU isolation."; - LOG.error(msg); - throw new YarnException(msg); + LOG.warn(msg); + return gpuDevices; } if (lastDiscoveredGpuInformation.getGpus() != null) { @@ -175,17 +118,14 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() } } } else{ - for (String s : allowedDevicesStr.split(",")) { - if (s.trim().length() > 0) { - String[] kv = s.trim().split(":"); - if (kv.length != 2) { - throw new YarnException( - "Illegal format, it should be index:minor_number format, now it=" - + s); + for (String deviceId : allowedDevicesStr.split(",")) { + deviceId = deviceId.trim(); + if (deviceId.length() > 0) { + try { + addGpuDevice(deviceId, gpuDevices); + } catch (YarnException e) { + LOG.warn(e.getMessage()); } - - gpuDevices.add( - new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1]))); } } LOG.info("Allowed GPU devices:" + gpuDevices); @@ -194,9 +134,42 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() return gpuDevices; } - public synchronized void initialize(Configuration conf) throws YarnException { + private void addGpuDevice(String deviceId, List gpuDevices) + throws YarnException { + String[] indexAndMinorNumber = splitDeviceIdToParts(deviceId); + int index = parseGpuDeviceIdPart(indexAndMinorNumber, 0, "Index"); + int minorNumber = + parseGpuDeviceIdPart(indexAndMinorNumber, 1, "Minor number"); + gpuDevices.add(new GpuDevice(index, minorNumber)); + } + + private String[] splitDeviceIdToParts(String deviceId) throws YarnException { + String[] indexAndMinorNumber = deviceId.split(":"); + if (indexAndMinorNumber.length != 2) { + throw new YarnException(String.format( + "Illegal format of configuration parameter %s, it should be " + + "index:minor_number format. Current value is: %s", + YarnConfiguration.NM_GPU_ALLOWED_DEVICES, deviceId)); + + } + return indexAndMinorNumber; + } + + private int parseGpuDeviceIdPart(String[] indexAndMinorNumber, int partIndex, + String partName) throws YarnException { + try { + return Integer.parseInt(indexAndMinorNumber[partIndex]); + } catch (NumberFormatException e) { + throw new YarnException(String.format( + "%s part of the allowed device is not a number, in setting %s", + partName, YarnConfiguration.NM_GPU_ALLOWED_DEVICES)); + } + } + + public synchronized void initialize(Configuration conf, + NvidiaBinaryHelper nvidiaBinaryHelper) { this.conf = conf; - numOfErrorExecutionSinceLastSucceed = 0; + this.nvidiaBinaryHelper = nvidiaBinaryHelper; String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC); if (pathToExecutable.isEmpty()) { @@ -224,15 +197,20 @@ public synchronized void initialize(Configuration conf) throws YarnException { + "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME); } } else{ - // If path specified by user is a directory, use + // If path specified by user is a directory, use default binary file name if (binaryPath.isDirectory()) { binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME); LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME + " under the directory, updated path-to-executable:" + binaryPath .getAbsolutePath()); + pathOfGpuBinary = binaryPath.getAbsolutePath(); + } else if (getFileNameFromFile(binaryPath).equals(DEFAULT_BINARY_NAME)) { + // If path exists but file name is incorrect don't execute the file + LOG.warn( + "Please check the configuration value of {}. " + + "It should point to an {} binary.", + YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME); } - // Validated - pathOfGpuBinary = binaryPath.getAbsolutePath(); } // Try to discover GPU information once and print @@ -240,7 +218,7 @@ public synchronized void initialize(Configuration conf) throws YarnException { LOG.info("Trying to discover GPU information ..."); GpuDeviceInformation info = getGpuDeviceInformation(); LOG.info(info.toString()); - } catch (YarnException e) { + } catch (YarnException | IOException e) { String msg = "Failed to discover GPU information from system, exception message:" + e.getMessage() + " continue..."; @@ -248,6 +226,10 @@ public synchronized void initialize(Configuration conf) throws YarnException { } } + private String getFileNameFromFile(File binaryPath) { + return binaryPath.toPath().getFileName().toString(); + } + @VisibleForTesting protected Map getEnvironmentToRunCommand() { return environment; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index 796eb25b431..2bb2e4c0eed 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -21,7 +21,6 @@ import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.slf4j.Logger; @@ -37,17 +36,15 @@ LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); @Override - public void updateConfiguredResource(Resource res) throws YarnException { + public void updateConfiguredResource(Resource res) { LOG.info("Initializing configured GPU resources for the NodeManager."); List usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn(); - if (null == usableGpus || usableGpus.isEmpty()) { + if (usableGpus.isEmpty()) { String message = "GPU is enabled, but couldn't find any usable GPUs on the " + "NodeManager."; - LOG.error(message); - // No gpu can be used by YARN. - throw new YarnException(message); + LOG.warn(message); } long nUsableGpus = usableGpus.size(); @@ -55,7 +52,7 @@ public void updateConfiguredResource(Resource res) throws YarnException { Map configuredResourceTypes = ResourceUtils.getResourceTypes(); if (!configuredResourceTypes.containsKey(GPU_URI)) { - throw new YarnException("Found " + nUsableGpus + " usable GPUs, however " + LOG.warn("Found " + nUsableGpus + " usable GPUs, however " + GPU_URI + " resource-type is not configured inside" + " resource-types.xml, please configure it to enable GPU feature or" diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index e49d2f24bd9..32c131ea57a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -18,7 +18,9 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; -import org.apache.hadoop.yarn.api.records.ContainerId; +import java.io.IOException; +import java.util.List; + import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; @@ -32,19 +34,33 @@ import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; - -import java.util.List; -import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class GpuResourcePlugin implements ResourcePlugin { + + public static final int MAX_REPEATED_ERROR_ALLOWED = 10; + + private int numOfErrorExecutionSinceLastSucceed = 0; + + public static final Logger LOG = LoggerFactory.getLogger( + GpuResourcePlugin.class); + private GpuResourceHandlerImpl gpuResourceHandler = null; private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null; private DockerCommandPlugin dockerCommandPlugin = null; + private GpuDiscoverer gpuDiscoverer; + + public GpuResourcePlugin(GpuDiscoverer gpuDiscoverer){ + this.gpuDiscoverer = gpuDiscoverer; + } @Override public synchronized void initialize(Context context) throws YarnException { + numOfErrorExecutionSinceLastSucceed = 0; resourceDiscoverHandler = new GpuNodeResourceUpdateHandler(); - GpuDiscoverer.getInstance().initialize(context.getConf()); + gpuDiscoverer.initialize(context.getConf(), + new NvidiaBinaryHelper()); dockerCommandPlugin = GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( context.getConf()); @@ -78,8 +94,21 @@ public DockerCommandPlugin getDockerCommandPluginInstance() { @Override public NMResourceInfo getNMResourceInfo() throws YarnException { - GpuDeviceInformation gpuDeviceInformation = - GpuDiscoverer.getInstance().getGpuDeviceInformation(); + checkErrorNumber(); + GpuDeviceInformation gpuDeviceInformation; + try{ + gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation(); + numOfErrorExecutionSinceLastSucceed = 0; + } catch (YarnException e) { + LOG.error(e.getMessage(), e); + numOfErrorExecutionSinceLastSucceed++; + throw e; + } catch (IOException e) { + numOfErrorExecutionSinceLastSucceed++; + LOG.error(e.getMessage(), e); + throw new YarnException(e); + } + GpuResourceAllocator gpuResourceAllocator = gpuResourceHandler.getGpuAllocator(); List totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); @@ -90,6 +119,17 @@ public NMResourceInfo getNMResourceInfo() throws YarnException { assignedGpuDevices); } + private void checkErrorNumber() throws YarnException { + if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + String msg = + "Failed to execute GPU device information detection script for " + + MAX_REPEATED_ERROR_ALLOWED + + " times, skip following executions."; + LOG.error(msg); + throw new YarnException(msg); + } + } + @Override public String toString() { return GpuResourcePlugin.class.getName(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java new file mode 100644 index 00000000000..1f890c51eae --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import java.io.IOException; +import java.util.HashMap; + +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; + +public class NvidiaBinaryHelper { + + // command should not run more than 10 sec. + private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; + + private GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + + /** + * @param pathOfGpuBinary The path of the binary + * @return the GpuDeviceInformation parsed from the nvidia-smi output + * @throws IOException if the binary output is not readable + * @throws YarnException if the pathOfGpuBinary is null, + * or the output parse failed + */ + public synchronized GpuDeviceInformation getGpuDeviceInformation( + String pathOfGpuBinary) throws IOException, YarnException { + + if (null == pathOfGpuBinary) { + throw new YarnException( + "Failed to find GPU discovery executable, please double check " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); + } + + String output = Shell.execCommand(new HashMap<>(), + new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); + return parser.parseXml(output); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index fff9068442f..19fdf115236 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -18,6 +18,18 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; +import static org.mockito.ArgumentMatchers.*; +import static org.mockito.Mockito.*; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; @@ -37,6 +49,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; @@ -45,26 +58,6 @@ import org.junit.Before; import org.junit.Test; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyList; -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; - public class TestGpuResourceHandler { private CGroupsHandler mockCGroupsHandler; private PrivilegedOperationExecutor mockPrivilegedExecutor; @@ -72,6 +65,8 @@ private NMStateStoreService mockNMStateStore; private ConcurrentHashMap runningContainersMap; + private NvidiaBinaryHelper nvidiaBinaryHelper; + @Before public void setup() { CustomResourceTypesConfigurationProvider. @@ -91,6 +86,7 @@ public void setup() { gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler, mockPrivilegedExecutor); + nvidiaBinaryHelper = new NvidiaBinaryHelper(); } @Test @@ -98,7 +94,7 @@ public void testBootStrap() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); verify(mockCGroupsHandler, times(1)).initializeCGroupController( @@ -162,7 +158,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled) throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -251,7 +247,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -280,14 +276,9 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() public void testAllocationWithoutAllowedGpus() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); - try { - gpuResourceHandler.bootstrap(conf); - Assert.fail("Should fail because no GPU available"); - } catch (ResourceHandlerException e) { - // Expected because of no resource available - } + gpuResourceHandler.bootstrap(conf); /* Start container 1, asks 0 containers */ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0)); @@ -315,7 +306,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception { public void testAllocationStored() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -363,7 +354,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, mockPrivilegedExecutor); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuNULLStateResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -383,7 +374,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { public void testRecoverResourceAllocation() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 4abb633a69a..209e1ca8f2e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -18,22 +18,30 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; +import static org.mockito.ArgumentMatchers.any; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; import org.junit.Assert; import org.junit.Assume; import org.junit.Before; import org.junit.Test; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.List; +import org.mockito.Mockito; public class TestGpuDiscoverer { + + private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper(); + private String getTestParentFolder() { File f = new File("target/temp/" + TestGpuDiscoverer.class.getName()); return f.getAbsolutePath(); @@ -60,7 +68,7 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { // test case 1, check default setting. Configuration conf = new Configuration(false); GpuDiscoverer plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, plugin.getPathOfGpuBinary()); Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH")); @@ -73,16 +81,17 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { touchFile(fakeBinary); conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); Assert.assertEquals(fakeBinary.getAbsolutePath(), plugin.getPathOfGpuBinary()); Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH")); - // test case 3, check mandatory set path, but binary doesn't exist so default + // test case 3, check mandatory set path, + // but binary doesn't exist so default // path will be used. fakeBinary.delete(); plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, plugin.getPathOfGpuBinary()); Assert.assertTrue( @@ -90,14 +99,14 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { } @Test - public void testGpuDiscover() throws YarnException { + public void testGpuDiscover() throws YarnException, IOException { // Since this is more of a performance unit test, only run if // RunUserLimitThroughput is set (-DRunUserLimitThroughput=true) Assume.assumeTrue( Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest"))); Configuration conf = new Configuration(false); GpuDiscoverer plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); GpuDeviceInformation info = plugin.getGpuDeviceInformation(); Assert.assertTrue(info.getGpus().size() > 0); @@ -106,36 +115,68 @@ public void testGpuDiscover() throws YarnException { } @Test - public void getNumberOfUsableGpusFromConfig() throws YarnException { + public void testMisconfiguredUsableGpus(){ Configuration conf = new Configuration(false); // Illegal format - conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3"); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, + "0:0,1:1,2:2,3,:4,6:,:,7:abc,abc:8,abc:abc"); GpuDiscoverer plugin = new GpuDiscoverer(); - try { - plugin.initialize(conf); - plugin.getGpusUsableByYarn(); - Assert.fail("Illegal format, should fail."); - } catch (YarnException e) { - // Expected - } - - // Valid format + + plugin.initialize(conf, binaryHelper); + List gpus = plugin.getGpusUsableByYarn(); + + Assert.assertEquals(3, gpus.size()); + } + + @Test + public void testConfiguredUsableGpus() { + + Configuration conf = new Configuration(false); + GpuDiscoverer plugin = new GpuDiscoverer(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4"); - plugin = new GpuDiscoverer(); - plugin.initialize(conf); + + plugin.initialize(conf, binaryHelper); List usableGpuDevices = plugin.getGpusUsableByYarn(); Assert.assertEquals(4, usableGpuDevices.size()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex()); - Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex()); + Assert.assertEquals(0, usableGpuDevices.get(0).getIndex()); + Assert.assertEquals(1,usableGpuDevices.get(1).getIndex()); + Assert.assertEquals(2, usableGpuDevices.get(2).getIndex()); + Assert.assertEquals(3, usableGpuDevices.get(3).getIndex()); + + Assert.assertEquals(0, usableGpuDevices.get(0).getMinorNumber()); + Assert.assertEquals(1, usableGpuDevices.get(1).getMinorNumber()); + Assert.assertEquals(2, usableGpuDevices.get(2).getMinorNumber()); + Assert.assertEquals(4, usableGpuDevices.get(3).getMinorNumber()); + } - Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber()); - Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber()); + @Test + public void testAutoDiscoveredGpus() throws IOException, YarnException { + int gpuMinorNumber = 19; + + NvidiaBinaryHelper mockBinaryHelper = + Mockito.mock(NvidiaBinaryHelper.class); + //Auto discovery is the default config value + Configuration conf = new Configuration(false); + GpuDiscoverer plugin = new GpuDiscoverer(); + + GpuDeviceInformation gpuDevice = new GpuDeviceInformation(); + PerGpuDeviceInformation perGpuDeviceInformation = + new PerGpuDeviceInformation(); + perGpuDeviceInformation.setMinorNumber(gpuMinorNumber); + gpuDevice.setGpus(Arrays.asList(perGpuDeviceInformation)); + Mockito.when(mockBinaryHelper.getGpuDeviceInformation(any())) + .thenReturn(gpuDevice); + + plugin.initialize(conf, mockBinaryHelper); + + List usableGpuDevices = plugin.getGpusUsableByYarn(); + Assert.assertEquals(1, usableGpuDevices.size()); + Assert.assertEquals(0, usableGpuDevices.get(0).getIndex()); + Assert.assertEquals(gpuMinorNumber, + usableGpuDevices.get(0).getMinorNumber()); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java new file mode 100644 index 00000000000..e71d9a6e6d1 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin.MAX_REPEATED_ERROR_ALLOWED; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; + +import java.io.IOException; + +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.NodeManager; +import org.junit.Test; +import org.mockito.Mockito; + +public class TestGpuResourcePlugin { + + @Test + public void testDiscoveryFailure() throws YarnException, IOException { + + int numberOfGpuDiscovererFaultyCalls = MAX_REPEATED_ERROR_ALLOWED + 1; + int expectedFailuresCount = MAX_REPEATED_ERROR_ALLOWED; + + NodeManager.NMContext context = mock(NodeManager.NMContext.class); + GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class); + YarnConfiguration conf = new YarnConfiguration(); + + Mockito.when(context.getConf()).thenReturn(conf); + + Mockito.when(gpuDiscoverer.getGpuDeviceInformation()) + .thenThrow(new YarnException()); + + GpuResourcePlugin plugin = new GpuResourcePlugin(gpuDiscoverer); + + plugin.initialize(context); + + for (int i = 0; i <= numberOfGpuDiscovererFaultyCalls; i++) { + try { + plugin.getNMResourceInfo(); + } catch (YarnException e) { + // NOOP + } + } + + Mockito.verify(gpuDiscoverer, times(expectedFailuresCount)) + .getGpuDeviceInformation(); + } +}