diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index 0bedf63df14..e5da3bd9e6c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -63,23 +63,17 @@ public GpuResourceHandlerImpl(Context nmContext, } @Override - public List bootstrap(Configuration configuration) - throws ResourceHandlerException { - List usableGpus; - try { - usableGpus = GpuDiscoverer.getInstance() - .getGpusUsableByYarn(); - if (usableGpus == null || usableGpus.isEmpty()) { - String message = "GPU is enabled on the NodeManager, but couldn't find " - + "any usable GPU devices, please double check configuration."; - LOG.error(message); - throw new ResourceHandlerException(message); - } - } catch (YarnException e) { - LOG.error("Exception when trying to get usable GPU device", e); - throw new ResourceHandlerException(e); + public List bootstrap(Configuration configuration) + throws ResourceHandlerException{ + List usableGpus = + GpuDiscoverer.getInstance().getGpusUsableByYarn(); + + if (usableGpus.isEmpty()) { + String message = "GPU is enabled on the NodeManager, but couldn't find " + + "any usable GPU devices, please double check configuration."; + LOG.warn(message); } - + for (GpuDevice gpu : usableGpus) { gpuAllocator.addGpu(gpu); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java index 0bc241dcf88..d077de0dec8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java @@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.slf4j.Logger; @@ -96,7 +97,7 @@ public synchronized void initialize(Context context) ResourcePlugin plugin = null; if (resourceName.equals(GPU_URI)) { - plugin = new GpuResourcePlugin(); + plugin = new GpuResourcePlugin(GpuDiscoverer.getInstance()); } else if (resourceName.equals(FPGA_URI)) { plugin = new FpgaResourcePlugin(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..6b74afe5578 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -18,28 +18,26 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; -import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; @InterfaceAudience.Private @InterfaceStability.Unstable @@ -54,10 +52,6 @@ // launched by nvidia-docker. private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( "/usr/bin", "/bin", "/usr/local/nvidia/bin"); - - // command should not run more than 10 sec. - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; - private static final int MAX_REPEATED_ERROR_ALLOWED = 10; private static GpuDiscoverer instance; static { @@ -65,19 +59,11 @@ } private Configuration conf = null; + private NvidiaBinaryHelper nvidiaBinaryHelper; private String pathOfGpuBinary = null; private Map environment = new HashMap<>(); - private GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); - - private int numOfErrorExecutionSinceLastSucceed = 0; - GpuDeviceInformation lastDiscoveredGpuInformation = null; - - private void validateConfOrThrowException() throws YarnException { - if (conf == null) { - throw new YarnException("Please initialize (call initialize) before use " - + GpuDiscoverer.class.getSimpleName()); - } - } + + private GpuDeviceInformation lastDiscoveredGpuInformation = null; /** * Get GPU device information from system. @@ -90,60 +76,18 @@ private void validateConfOrThrowException() throws YarnException { * @throws YarnException when any error happens */ public synchronized GpuDeviceInformation getGpuDeviceInformation() - throws YarnException { - validateConfOrThrowException(); - - if (null == pathOfGpuBinary) { - throw new YarnException( - "Failed to find GPU discovery executable, please double check " - + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); - } - - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { - String msg = - "Failed to execute GPU device information detection script for " - + MAX_REPEATED_ERROR_ALLOWED - + " times, skip following executions."; - LOG.error(msg); - throw new YarnException(msg); - } - - String output; - try { - output = Shell.execCommand(environment, - new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); - GpuDeviceInformation info = parser.parseXml(output); - numOfErrorExecutionSinceLastSucceed = 0; - lastDiscoveredGpuInformation = info; - return info; - } catch (IOException e) { - numOfErrorExecutionSinceLastSucceed++; - String msg = - "Failed to execute " + pathOfGpuBinary + " exception message:" + e - .getMessage() + ", continue ..."; - if (LOG.isDebugEnabled()) { - LOG.debug(msg); - } - throw new YarnException(e); - } catch (YarnException e) { - numOfErrorExecutionSinceLastSucceed++; - String msg = "Failed to parse xml output" + e.getMessage(); - if (LOG.isDebugEnabled()) { - LOG.warn(msg, e); - } - throw e; - } + throws YarnException, IOException { + lastDiscoveredGpuInformation = + nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary); + return lastDiscoveredGpuInformation; } /** * Get list of GPU devices usable by YARN. * * @return List of GPU devices - * @throws YarnException when any issue happens */ - public synchronized List getGpusUsableByYarn() - throws YarnException { - validateConfOrThrowException(); + public synchronized List getGpusUsableByYarn() { String allowedDevicesStr = conf.get( YarnConfiguration.NM_GPU_ALLOWED_DEVICES, @@ -162,8 +106,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() + " details, as an alternative, admin can specify " + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " manually to enable GPU isolation."; - LOG.error(msg); - throw new YarnException(msg); + LOG.warn(msg); + return gpuDevices; } if (lastDiscoveredGpuInformation.getGpus() != null) { @@ -175,17 +119,14 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() } } } else{ - for (String s : allowedDevicesStr.split(",")) { - if (s.trim().length() > 0) { - String[] kv = s.trim().split(":"); - if (kv.length != 2) { - throw new YarnException( - "Illegal format, it should be index:minor_number format, now it=" - + s); + for (String deviceId : allowedDevicesStr.split(",")) { + deviceId = deviceId.trim(); + if (deviceId.length() > 0) { + try { + addGpuDevice(deviceId, gpuDevices); + } catch (YarnException e) { + LOG.warn(e.getMessage()); } - - gpuDevices.add( - new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1]))); } } LOG.info("Allowed GPU devices:" + gpuDevices); @@ -194,9 +135,42 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() return gpuDevices; } - public synchronized void initialize(Configuration conf) throws YarnException { + private void addGpuDevice(String deviceId, List gpuDevices) + throws YarnException { + String[] indexAndMinorNumber = splitDeviceIdToParts(deviceId); + int index = parseGpuDeviceIdPart(indexAndMinorNumber, 0, "Index"); + int minorNumber = + parseGpuDeviceIdPart(indexAndMinorNumber, 1, "Minor number"); + gpuDevices.add(new GpuDevice(index, minorNumber)); + } + + private String[] splitDeviceIdToParts(String deviceId) throws YarnException { + String[] indexAndMinorNumber = deviceId.split(":"); + if (indexAndMinorNumber.length != 2) { + throw new YarnException(String.format( + "Illegal format of configuration param %s, it should be " + + "index:minor_number format. Current value is: %s", + YarnConfiguration.NM_GPU_ALLOWED_DEVICES, deviceId)); + + } + return indexAndMinorNumber; + } + + private int parseGpuDeviceIdPart(String[] indexAndMinorNumber, int partIndex, + String partName) throws YarnException { + try { + return Integer.parseInt(indexAndMinorNumber[partIndex]); + } catch (NumberFormatException e) { + throw new YarnException(String.format( + "%s part of the allowed device is not a number, in setting %s", + partName, YarnConfiguration.NM_GPU_ALLOWED_DEVICES)); + } + } + + public synchronized void initialize(Configuration conf, + NvidiaBinaryHelper nvidiaBinaryHelper) { this.conf = conf; - numOfErrorExecutionSinceLastSucceed = 0; + this.nvidiaBinaryHelper = nvidiaBinaryHelper; String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC); if (pathToExecutable.isEmpty()) { @@ -224,15 +198,20 @@ public synchronized void initialize(Configuration conf) throws YarnException { + "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME); } } else{ - // If path specified by user is a directory, use + // If path specified by user is a directory, use default binary file name if (binaryPath.isDirectory()) { binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME); LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME + " under the directory, updated path-to-executable:" + binaryPath .getAbsolutePath()); + pathOfGpuBinary = binaryPath.getAbsolutePath(); + } else if (binaryPath.toPath().getFileName().toString().equals(DEFAULT_BINARY_NAME)){ + //If path exists but file name is incorrect don't execute the file + LOG.warn("Specified path is not pointing to an {} executable." + + " Please check [{}] setting.", + DEFAULT_BINARY_NAME, + YarnConfiguration.NM_GPU_PATH_TO_EXEC); } - // Validated - pathOfGpuBinary = binaryPath.getAbsolutePath(); } // Try to discover GPU information once and print @@ -240,7 +219,7 @@ public synchronized void initialize(Configuration conf) throws YarnException { LOG.info("Trying to discover GPU information ..."); GpuDeviceInformation info = getGpuDeviceInformation(); LOG.info(info.toString()); - } catch (YarnException e) { + } catch (YarnException | IOException e) { String msg = "Failed to discover GPU information from system, exception message:" + e.getMessage() + " continue..."; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index 796eb25b431..2141399f5e0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -37,17 +37,15 @@ LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); @Override - public void updateConfiguredResource(Resource res) throws YarnException { + public void updateConfiguredResource(Resource res) { LOG.info("Initializing configured GPU resources for the NodeManager."); List usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn(); - if (null == usableGpus || usableGpus.isEmpty()) { + if (usableGpus.isEmpty()) { String message = "GPU is enabled, but couldn't find any usable GPUs on the " + "NodeManager."; - LOG.error(message); - // No gpu can be used by YARN. - throw new YarnException(message); + LOG.warn(message); } long nUsableGpus = usableGpus.size(); @@ -55,7 +53,7 @@ public void updateConfiguredResource(Resource res) throws YarnException { Map configuredResourceTypes = ResourceUtils.getResourceTypes(); if (!configuredResourceTypes.containsKey(GPU_URI)) { - throw new YarnException("Found " + nUsableGpus + " usable GPUs, however " + LOG.warn("Found " + nUsableGpus + " usable GPUs, however " + GPU_URI + " resource-type is not configured inside" + " resource-types.xml, please configure it to enable GPU feature or" diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index e49d2f24bd9..d4b6fa06184 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -18,7 +18,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; -import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; @@ -32,19 +31,36 @@ import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; import java.util.List; -import java.util.Map; public class GpuResourcePlugin implements ResourcePlugin { + + public static final int MAX_REPEATED_ERROR_ALLOWED = 10; + + private int numOfErrorExecutionSinceLastSucceed = 0; + + public static final Logger LOG = LoggerFactory.getLogger( + GpuResourcePlugin.class); + private GpuResourceHandlerImpl gpuResourceHandler = null; private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null; private DockerCommandPlugin dockerCommandPlugin = null; + private GpuDiscoverer gpuDiscoverer; + + public GpuResourcePlugin(GpuDiscoverer gpuDiscoverer){ + this.gpuDiscoverer = gpuDiscoverer; + } @Override public synchronized void initialize(Context context) throws YarnException { + numOfErrorExecutionSinceLastSucceed = 0; resourceDiscoverHandler = new GpuNodeResourceUpdateHandler(); - GpuDiscoverer.getInstance().initialize(context.getConf()); + gpuDiscoverer.initialize(context.getConf(), + new NvidiaBinaryHelper()); dockerCommandPlugin = GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( context.getConf()); @@ -78,8 +94,21 @@ public DockerCommandPlugin getDockerCommandPluginInstance() { @Override public NMResourceInfo getNMResourceInfo() throws YarnException { - GpuDeviceInformation gpuDeviceInformation = - GpuDiscoverer.getInstance().getGpuDeviceInformation(); + checkErrorNumber(); + GpuDeviceInformation gpuDeviceInformation; + try{ + gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation(); + numOfErrorExecutionSinceLastSucceed = 0; + } catch (YarnException e) { + LOG.error(e.getMessage(), e); + numOfErrorExecutionSinceLastSucceed++; + throw e; + } catch (IOException e) { + numOfErrorExecutionSinceLastSucceed++; + LOG.error(e.getMessage(), e); + throw new YarnException(e); + } + GpuResourceAllocator gpuResourceAllocator = gpuResourceHandler.getGpuAllocator(); List totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); @@ -90,6 +119,17 @@ public NMResourceInfo getNMResourceInfo() throws YarnException { assignedGpuDevices); } + private void checkErrorNumber() throws YarnException { + if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + String msg = + "Failed to execute GPU device information detection script for " + + MAX_REPEATED_ERROR_ALLOWED + + " times, skip following executions."; + LOG.error(msg); + throw new YarnException(msg); + } + } + @Override public String toString() { return GpuResourcePlugin.class.getName(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index fff9068442f..b8351e65c3e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; @@ -71,6 +72,8 @@ private GpuResourceHandlerImpl gpuResourceHandler; private NMStateStoreService mockNMStateStore; private ConcurrentHashMap runningContainersMap; + + private NvidiaBinaryHelper nvidiaBinaryHelper; @Before public void setup() { @@ -91,6 +94,7 @@ public void setup() { gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler, mockPrivilegedExecutor); + nvidiaBinaryHelper = new NvidiaBinaryHelper(); } @Test @@ -98,7 +102,7 @@ public void testBootStrap() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); verify(mockCGroupsHandler, times(1)).initializeCGroupController( @@ -162,7 +166,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled) throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -251,7 +255,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -280,14 +284,9 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() public void testAllocationWithoutAllowedGpus() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); - try { - gpuResourceHandler.bootstrap(conf); - Assert.fail("Should fail because no GPU available"); - } catch (ResourceHandlerException e) { - // Expected because of no resource available - } + gpuResourceHandler.bootstrap(conf); /* Start container 1, asks 0 containers */ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0)); @@ -315,7 +314,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception { public void testAllocationStored() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -363,7 +362,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, mockPrivilegedExecutor); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuNULLStateResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -383,7 +382,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { public void testRecoverResourceAllocation() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + GpuDiscoverer.getInstance().initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 4abb633a69a..d765507817e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -23,17 +23,25 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; import org.junit.Assert; import org.junit.Assume; import org.junit.Before; import org.junit.Test; +import org.mockito.Mockito; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.util.Arrays; import java.util.List; +import static org.mockito.Matchers.any; + public class TestGpuDiscoverer { + + private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper(); + private String getTestParentFolder() { File f = new File("target/temp/" + TestGpuDiscoverer.class.getName()); return f.getAbsolutePath(); @@ -60,7 +68,7 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { // test case 1, check default setting. Configuration conf = new Configuration(false); GpuDiscoverer plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, plugin.getPathOfGpuBinary()); Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH")); @@ -73,7 +81,7 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { touchFile(fakeBinary); conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); Assert.assertEquals(fakeBinary.getAbsolutePath(), plugin.getPathOfGpuBinary()); Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH")); @@ -82,7 +90,7 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { // path will be used. fakeBinary.delete(); plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, plugin.getPathOfGpuBinary()); Assert.assertTrue( @@ -90,52 +98,81 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { } @Test - public void testGpuDiscover() throws YarnException { + public void testGpuDiscover() throws YarnException, IOException { // Since this is more of a performance unit test, only run if // RunUserLimitThroughput is set (-DRunUserLimitThroughput=true) Assume.assumeTrue( Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest"))); Configuration conf = new Configuration(false); GpuDiscoverer plugin = new GpuDiscoverer(); - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); GpuDeviceInformation info = plugin.getGpuDeviceInformation(); Assert.assertTrue(info.getGpus().size() > 0); Assert.assertEquals(plugin.getGpusUsableByYarn().size(), info.getGpus().size()); } - + @Test - public void getNumberOfUsableGpusFromConfig() throws YarnException { + public void testMisconfiguredUsableGpus(){ Configuration conf = new Configuration(false); // Illegal format - conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3"); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3,:4,6:,:,7:abc,abc:8,abc:abc"); GpuDiscoverer plugin = new GpuDiscoverer(); - try { - plugin.initialize(conf); - plugin.getGpusUsableByYarn(); - Assert.fail("Illegal format, should fail."); - } catch (YarnException e) { - // Expected - } - - // Valid format + + plugin.initialize(conf, binaryHelper); + List gpus = plugin.getGpusUsableByYarn(); + + Assert.assertEquals(3, gpus.size()); + } + + @Test + public void testConfiguredUsableGpus() { + + Configuration conf = new Configuration(false); + GpuDiscoverer plugin = new GpuDiscoverer(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4"); - plugin = new GpuDiscoverer(); - plugin.initialize(conf); + + plugin.initialize(conf, binaryHelper); List usableGpuDevices = plugin.getGpusUsableByYarn(); Assert.assertEquals(4, usableGpuDevices.size()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex()); - Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex()); + Assert.assertEquals(0, usableGpuDevices.get(0).getIndex()); + Assert.assertEquals(1,usableGpuDevices.get(1).getIndex()); + Assert.assertEquals(2, usableGpuDevices.get(2).getIndex()); + Assert.assertEquals(3, usableGpuDevices.get(3).getIndex()); + + Assert.assertEquals(0, usableGpuDevices.get(0).getMinorNumber()); + Assert.assertEquals(1, usableGpuDevices.get(1).getMinorNumber()); + Assert.assertEquals(2, usableGpuDevices.get(2).getMinorNumber()); + Assert.assertEquals(4, usableGpuDevices.get(3).getMinorNumber()); + } + + @Test + public void testAutoDiscoveredGpus() throws IOException, YarnException { + int gpuMinorNumber = 19; + + NvidiaBinaryHelper mockBinaryHelper = Mockito.mock(NvidiaBinaryHelper.class); + //Auto discovery is the default config value + Configuration conf = new Configuration(false); + GpuDiscoverer plugin = new GpuDiscoverer(); + + GpuDeviceInformation gpuDevice = new GpuDeviceInformation(); + PerGpuDeviceInformation perGpuDeviceInformation = new PerGpuDeviceInformation(); + perGpuDeviceInformation.setMinorNumber(gpuMinorNumber); + gpuDevice.setGpus(Arrays.asList(perGpuDeviceInformation)); + Mockito.when(mockBinaryHelper.getGpuDeviceInformation(any())).thenReturn(gpuDevice); + + plugin.initialize(conf, mockBinaryHelper); + + List usableGpuDevices = plugin.getGpusUsableByYarn(); + Assert.assertEquals(1, usableGpuDevices.size()); + + Assert.assertEquals(0, usableGpuDevices.get(0).getIndex()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber()); - Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber()); + Assert.assertEquals(gpuMinorNumber, usableGpuDevices.get(0).getMinorNumber()); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java index 915f312b059..e71d9a6e6d1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java @@ -18,9 +18,48 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin.MAX_REPEATED_ERROR_ALLOWED; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; + +import java.io.IOException; + +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.NodeManager; +import org.junit.Test; +import org.mockito.Mockito; + public class TestGpuResourcePlugin { - - public void testPlugin(){ - GpuResourcePlugin plugin = new GpuResourcePlugin(); + + @Test + public void testDiscoveryFailure() throws YarnException, IOException { + + int numberOfGpuDiscovererFaultyCalls = MAX_REPEATED_ERROR_ALLOWED + 1; + int expectedFailuresCount = MAX_REPEATED_ERROR_ALLOWED; + + NodeManager.NMContext context = mock(NodeManager.NMContext.class); + GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class); + YarnConfiguration conf = new YarnConfiguration(); + + Mockito.when(context.getConf()).thenReturn(conf); + + Mockito.when(gpuDiscoverer.getGpuDeviceInformation()) + .thenThrow(new YarnException()); + + GpuResourcePlugin plugin = new GpuResourcePlugin(gpuDiscoverer); + + plugin.initialize(context); + + for (int i = 0; i <= numberOfGpuDiscovererFaultyCalls; i++) { + try { + plugin.getNMResourceInfo(); + } catch (YarnException e) { + // NOOP + } + } + + Mockito.verify(gpuDiscoverer, times(expectedFailuresCount)) + .getGpuDeviceInformation(); } }