diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 6e3cf1315ce..54ac40c766b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -19,8 +19,9 @@
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
@@ -58,6 +59,8 @@
   // command should not run more than 10 sec.
   private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
   private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
+  private static final String CORRECT_FORMAT_MESSAGE = "The correct format " +
+      "should be: index:minor_number";
   private static GpuDiscoverer instance;
 
   static {
@@ -79,6 +82,13 @@ private void validateConfOrThrowException() throws YarnException {
     }
   }
 
+  private String createIllegalFormatMessage(String device,
+      String allowedDevicesStr) {
+    return String.format("Illegal format of GPU device: %s, " +
+            "the config value as a whole was: '%s'! " + CORRECT_FORMAT_MESSAGE,
+        device, allowedDevicesStr);
+  }
+
   /**
    * Get GPU device information from system.
    * This need to be called after initialize.
@@ -153,48 +163,96 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
 
     if (allowedDevicesStr.equals(
         YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
-      // Get gpu device information from system.
-      if (null == lastDiscoveredGpuInformation) {
-        String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
-            + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
-            + ", however automatically discovering "
-            + "GPU information failed, please check NodeManager log for more"
-            + " details, as an alternative, admin can specify "
-            + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
-            + " manually to enable GPU isolation.";
-        LOG.error(msg);
-        throw new YarnException(msg);
+      parseGpuDevicesFromAutoDiscoveredGpuInfo(gpuDevices);
+    } else {
+      gpuDevices = parseGpuDevicesFromUserDefinedValues(allowedDevicesStr);
+    }
+
+    return gpuDevices;
+  }
+
+  private void parseGpuDevicesFromAutoDiscoveredGpuInfo(
+      List<GpuDevice> gpuDevices) throws YarnException {
+    if (lastDiscoveredGpuInformation == null) {
+      String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
+          + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
+          + ", however automatically discovering "
+          + "GPU information failed, please check NodeManager log for more"
+          + " details, as an alternative, admin can specify "
+          + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+          + " manually to enable GPU isolation.";
+      LOG.error(msg);
+      throw new YarnException(msg);
+    }
+
+    if (lastDiscoveredGpuInformation.getGpus() != null) {
+      for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size();
+           i++) {
+        List<PerGpuDeviceInformation> gpuInfos =
+            lastDiscoveredGpuInformation.getGpus();
+        gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
       }
+    }
+  }
 
-      if (lastDiscoveredGpuInformation.getGpus() != null) {
-        for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size();
-             i++) {
-          List<PerGpuDeviceInformation> gpuInfos =
-              lastDiscoveredGpuInformation.getGpus();
-          gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
+  /**
+   * @param devices allowed devices coming from the config.
+   *          Individual devices should be separated by commas.
+   *          The format of individual devices should be:
+   *          {@code index:minor_number}
+   * @return List of GpuDevices
+   * @throws YarnException when the value is empty or malformed, or when a
+   * GPU device is defined twice (the first duplicate is in the message).
+   */
+  private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices)
+      throws YarnException {
+    if (devices.trim().isEmpty()) {
+      throw new YarnException(
+          YarnConfiguration.NM_GPU_ALLOWED_DEVICES +
+              " is set to an empty value! Please specify " +
+              YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES +
+              " to enable auto-discovery or " +
+              "please enter the GPU device IDs manually! " +
+              CORRECT_FORMAT_MESSAGE);
+    }
+    List<GpuDevice> gpuDevices = Lists.newArrayList();
+    for (String device : devices.split(",")) {
+      if (device.trim().length() > 0) {
+        String[] splitByColon = device.trim().split(":");
+        if (splitByColon.length != 2) {
+          throw new YarnException(createIllegalFormatMessage(device, devices));
         }
-      }
-    } else{
-      for (String s : allowedDevicesStr.split(",")) {
-        if (s.trim().length() > 0) {
-          String[] kv = s.trim().split(":");
-          if (kv.length != 2) {
-            throw new YarnException(
-                "Illegal format, it should be index:minor_number format, now it="
-                    + s);
-          }
-
-          gpuDevices.add(
-              new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1])));
+
+        GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
+        if (gpuDevices.contains(gpuDevice)) {
+          throw new YarnException("GPU device " + gpuDevice +
+              " has a duplicate definition! " +
+              "Please double-check the configuration " +
+              YarnConfiguration.NM_GPU_ALLOWED_DEVICES +
+              "! Current value of the configuration is: " + devices);
         }
+        // Keep the validated device; without this the result is always empty.
+        gpuDevices.add(gpuDevice);
       }
-      LOG.info("Allowed GPU devices:" + gpuDevices);
     }
+    LOG.info("Allowed GPU devices:" + gpuDevices);
 
     return gpuDevices;
   }
 
-  public synchronized void initialize(Configuration conf) throws YarnException {
+  private GpuDevice parseGpuDevice(String device, String[] splitByColon,
+      String allowedDevicesStr) throws YarnException {
+    try {
+      int index = Integer.parseInt(splitByColon[0]);
+      int minorNumber = Integer.parseInt(splitByColon[1]);
+      return new GpuDevice(index, minorNumber);
+    } catch (NumberFormatException e) {
+      throw new YarnException(createIllegalFormatMessage(device,
+          allowedDevicesStr), e);
+    }
+  }
+
+  public synchronized void initialize(Configuration conf) {
     this.conf = conf;
     numOfErrorExecutionSinceLastSucceed = 0;
     String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index 4abb633a69a..fa5d8791db6 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -23,17 +23,26 @@
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
-import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.ExpectedException;
 
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.util.List;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
 public class TestGpuDiscoverer {
+  @Rule
+  public ExpectedException exception = ExpectedException.none();
+
   private String getTestParentFolder() {
     File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
     return f.getAbsolutePath();
@@ -61,10 +70,10 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
     Configuration conf = new Configuration(false);
     GpuDiscoverer plugin = new GpuDiscoverer();
     plugin.initialize(conf);
-    Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+    assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
         plugin.getPathOfGpuBinary());
-    Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
-    Assert.assertTrue(
+    assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+    assertTrue(
         plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
 
     // test case 2, check mandatory set path.
@@ -74,18 +83,18 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
     conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
     plugin = new GpuDiscoverer();
     plugin.initialize(conf);
-    Assert.assertEquals(fakeBinary.getAbsolutePath(),
+    assertEquals(fakeBinary.getAbsolutePath(),
         plugin.getPathOfGpuBinary());
-    Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+    assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
 
     // test case 3, check mandatory set path, but binary doesn't exist so default
     // path will be used.
     fakeBinary.delete();
     plugin = new GpuDiscoverer();
     plugin.initialize(conf);
-    Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+    assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
         plugin.getPathOfGpuBinary());
-    Assert.assertTrue(
+    assertTrue(
         plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
   }
 
@@ -100,42 +109,191 @@ public void testGpuDiscover() throws YarnException {
     plugin.initialize(conf);
     GpuDeviceInformation info = plugin.getGpuDeviceInformation();
 
-    Assert.assertTrue(info.getGpus().size() > 0);
-    Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
+    assertTrue(info.getGpus().size() > 0);
+    assertEquals(plugin.getGpusUsableByYarn().size(),
         info.getGpus().size());
   }
 
   @Test
-  public void getNumberOfUsableGpusFromConfig() throws YarnException {
+  public void testGetNumberOfUsableGpusFromConfigSingleDevice()
+      throws YarnException {
     Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "1:2");
+
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
+    assertEquals(1, usableGpuDevices.size());
 
-    // Illegal format
+    assertEquals(1, usableGpuDevices.get(0).getIndex());
+    assertEquals(2, usableGpuDevices.get(0).getMinorNumber());
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigIllegalFormat()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format");
     GpuDiscoverer plugin = new GpuDiscoverer();
-    try {
-      plugin.initialize(conf);
-      plugin.getGpusUsableByYarn();
-      Assert.fail("Illegal format, should fail.");
-    } catch (YarnException e) {
-      // Expected
-    }
-
-    // Valid format
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
+    Configuration conf = new Configuration(false);
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
-    plugin = new GpuDiscoverer();
+    GpuDiscoverer plugin = new GpuDiscoverer();
     plugin.initialize(conf);
 
     List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
-    Assert.assertEquals(4, usableGpuDevices.size());
+    assertEquals(4, usableGpuDevices.size());
+
+    assertEquals(0, usableGpuDevices.get(0).getIndex());
+    assertEquals(0, usableGpuDevices.get(0).getMinorNumber());
+
+    assertEquals(1, usableGpuDevices.get(1).getIndex());
+    assertEquals(1, usableGpuDevices.get(1).getMinorNumber());
+
+    assertEquals(2, usableGpuDevices.get(2).getIndex());
+    assertEquals(2, usableGpuDevices.get(2).getMinorNumber());
+
+    assertEquals(3, usableGpuDevices.get(3).getIndex());
+    assertEquals(4, usableGpuDevices.get(3).getMinorNumber());
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigDuplicateValues()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,1:1");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("GPU device " + new GpuDevice(1, 1) +
+        " has a duplicate definition!");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigDuplicateValues2()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,1:1,2:2");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("GPU device " + new GpuDevice(1, 1) +
+        " has a duplicate definition!");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigIncludingSpaces()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0 : 0,1 : 1");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format of GPU device: 0 : 0");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigIncludingGibberish()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:@$1,1:1");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format of GPU device: 0:@$1");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
 
-    Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
-    Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
-    Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
-    Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigIncludingLetters()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "x:0, 1:y");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format of GPU device: x:0");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
 
-    Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
-    Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
-    Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
-    Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigWithoutIndexNumber()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, ":0, :1");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format of GPU device: :0");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigEmptyString()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("set to an empty value");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigValueWithoutComma()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0 0:1");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format of GPU device");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigValueWithoutComma2()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0.1 0.2");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format of GPU device");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
+  }
+
+  @Test
+  public void testGetNumberOfUsableGpusFromConfigValueWithoutColonSeparator()
+      throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0.1,0.2");
+
+    exception.expect(YarnException.class);
+    exception.expectMessage("Illegal format of GPU device");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    plugin.getGpusUsableByYarn();
   }
 }