diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..63f47887337 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -19,8 +19,9 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; @@ -58,6 +59,8 @@ // command should not run more than 10 sec. private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; private static final int MAX_REPEATED_ERROR_ALLOWED = 10; + private static final String CORRECT_FORMAT_MESSAGE = "The correct format " + + "should be: index:minor_number"; private static GpuDiscoverer instance; static { @@ -79,6 +82,12 @@ private void validateConfOrThrowException() throws YarnException { } } + private String createIllegalFormatMessage(String device, String allowedDevicesStr) { + return String.format("Illegal format of GPU device: %s, " + + "the config value as a whole was: '%s'! 
" + CORRECT_FORMAT_MESSAGE, + device, allowedDevicesStr); + } + /** * Get GPU device information from system. * This need to be called after initialize. @@ -153,48 +162,96 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() if (allowedDevicesStr.equals( YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) { - // Get gpu device information from system. - if (null == lastDiscoveredGpuInformation) { - String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to " - + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES - + ", however automatically discovering " - + "GPU information failed, please check NodeManager log for more" - + " details, as an alternative, admin can specify " - + YarnConfiguration.NM_GPU_ALLOWED_DEVICES - + " manually to enable GPU isolation."; - LOG.error(msg); - throw new YarnException(msg); + parseGpuDevicesFromAutoDiscoveredGpuInfo(gpuDevices); + } else { + gpuDevices = parseGpuDevicesFromUserDefinedValues(allowedDevicesStr); + } + + return gpuDevices; + } + + private void parseGpuDevicesFromAutoDiscoveredGpuInfo( + List gpuDevices) throws YarnException { + if (lastDiscoveredGpuInformation == null) { + String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to " + + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES + + ", however automatically discovering " + + "GPU information failed, please check NodeManager log for more" + + " details, as an alternative, admin can specify " + + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + + " manually to enable GPU isolation."; + LOG.error(msg); + throw new YarnException(msg); + } + + if (lastDiscoveredGpuInformation.getGpus() != null) { + for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size(); + i++) { + List gpuInfos = + lastDiscoveredGpuInformation.getGpus(); + gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber())); } + } + } - if (lastDiscoveredGpuInformation.getGpus() != null) { - for (int i = 0; i < 
lastDiscoveredGpuInformation.getGpus().size(); - i++) { - List gpuInfos = - lastDiscoveredGpuInformation.getGpus(); - gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber())); + /** + * Parses the GPU devices that were listed manually in the configuration. + * @param allowedDevicesStr comma-separated index:minor_number pairs. + * @return the parsed GPU devices, each of them listed only once. + * @throws YarnException on an empty value or an illegal/duplicate device. + */ + private List parseGpuDevicesFromUserDefinedValues( + String allowedDevicesStr) throws YarnException { + if (allowedDevicesStr.trim().isEmpty()) { + throw new YarnException( + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + + " is set to an empty value! Please specify " + + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES + + " to enable auto-discovery or " + + "please enter the GPU device IDs manually! " + + CORRECT_FORMAT_MESSAGE); + } + List gpuDevices; + Set gpuDevicesSet = Sets.newTreeSet(); + for (String device : allowedDevicesStr.split(",")) { + if (device.trim().length() > 0) { + String[] splitByColon = device.trim().split(":"); + if (splitByColon.length != 2) { + throw new YarnException(createIllegalFormatMessage(device, + allowedDevicesStr)); } - } - } else{ - for (String s : allowedDevicesStr.split(",")) { - if (s.trim().length() > 0) { - String[] kv = s.trim().split(":"); - if (kv.length != 2) { - throw new YarnException( - "Illegal format, it should be index:minor_number format, now it=" - + s); - } - - gpuDevices.add( - new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1]))); + + GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, + allowedDevicesStr); + boolean notYetDefined = gpuDevicesSet.add(gpuDevice); + if (!notYetDefined) { + throw new YarnException("GPU device " + gpuDevice + + " has a duplicate definition! " + + "Please double-check the configuration " + + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + + "! 
Current value of the configuration is: " + allowedDevicesStr); } } - LOG.info("Allowed GPU devices:" + gpuDevices); } + gpuDevices = Lists.newArrayList(gpuDevicesSet); + LOG.info("Allowed GPU devices:" + gpuDevices); return gpuDevices; } - public synchronized void initialize(Configuration conf) throws YarnException { + private GpuDevice parseGpuDevice(String device, String[] splitByColon, + String allowedDevicesStr) throws YarnException { + try { + int index = Integer.parseInt(splitByColon[0]); + int minorNumber = Integer.parseInt(splitByColon[1]); + return new GpuDevice(index, minorNumber); + } catch (NumberFormatException e) { + throw new YarnException(createIllegalFormatMessage(device, + allowedDevicesStr), e); + } + } + + public synchronized void initialize(Configuration conf) { this.conf = conf; numOfErrorExecutionSinceLastSucceed = 0; String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 4abb633a69a..d6c8fbd626a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -26,7 +26,9 @@ import org.junit.Assert; import org.junit.Assume; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.io.File; import 
java.io.FileOutputStream; @@ -34,6 +36,9 @@ import java.util.List; public class TestGpuDiscoverer { + @Rule + public ExpectedException exception = ExpectedException.none(); + private String getTestParentFolder() { File f = new File("target/temp/" + TestGpuDiscoverer.class.getName()); return f.getAbsolutePath(); @@ -106,36 +111,183 @@ public void testGpuDiscover() throws YarnException { } @Test - public void getNumberOfUsableGpusFromConfig() throws YarnException { + public void testGetNumberOfUsableGpusFromConfigSingleDevice() throws YarnException { Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "1:2"); + + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + List usableGpuDevices = plugin.getGpusUsableByYarn(); + Assert.assertEquals(1, usableGpuDevices.size()); + + Assert.assertEquals(1, usableGpuDevices.get(0).getIndex()); + Assert.assertEquals(2, usableGpuDevices.get(0).getMinorNumber()); + } - // Illegal format + @Test + public void testGetNumberOfUsableGpusFromConfigIllegalFormat() throws YarnException { + Configuration conf = new Configuration(false); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3"); + + exception.expect(YarnException.class); + exception.expectMessage("Illegal format"); GpuDiscoverer plugin = new GpuDiscoverer(); - try { - plugin.initialize(conf); - plugin.getGpusUsableByYarn(); - Assert.fail("Illegal format, should fail."); - } catch (YarnException e) { - // Expected - } - - // Valid format + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfig() throws YarnException { + Configuration conf = new Configuration(false); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4"); - plugin = new GpuDiscoverer(); + GpuDiscoverer plugin = new GpuDiscoverer(); plugin.initialize(conf); List usableGpuDevices = plugin.getGpusUsableByYarn(); Assert.assertEquals(4, 
usableGpuDevices.size()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex()); - Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex()); + Assert.assertEquals(0, usableGpuDevices.get(0).getIndex()); + Assert.assertEquals(0, usableGpuDevices.get(0).getMinorNumber()); + + Assert.assertEquals(1, usableGpuDevices.get(1).getIndex()); + Assert.assertEquals(1, usableGpuDevices.get(1).getMinorNumber()); + + Assert.assertEquals(2, usableGpuDevices.get(2).getIndex()); + Assert.assertEquals(2, usableGpuDevices.get(2).getMinorNumber()); + + Assert.assertEquals(3, usableGpuDevices.get(3).getIndex()); + Assert.assertEquals(4, usableGpuDevices.get(3).getMinorNumber()); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigDuplicateValues() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,1:1"); + + exception.expect(YarnException.class); + exception.expectMessage("GPU device " + new GpuDevice(1, 1) + + " has a duplicate definition!"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigDuplicateValues2() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,1:1,2:2"); + + exception.expect(YarnException.class); + exception.expectMessage("GPU device " + new GpuDevice(1, 1) + + " has a duplicate definition!"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigIncludingSpaces() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0 : 0,1 : 1"); + + 
exception.expect(YarnException.class); + exception.expectMessage("Illegal format of GPU device: 0 : 0"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigIncludingGibberish() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:@$1,1:1"); - Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber()); - Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber()); + exception.expect(YarnException.class); + exception.expectMessage("Illegal format of GPU device: 0:@$1"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigIncludingLetters() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "x:0, 1:y"); + + exception.expect(YarnException.class); + exception.expectMessage("Illegal format of GPU device: x:0"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigWithoutIndexNumber() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, ":0, :1"); + + exception.expect(YarnException.class); + exception.expectMessage("Illegal format of GPU device: :0"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigEmptyString() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, ""); + + 
exception.expect(YarnException.class); + exception.expectMessage("set to an empty value"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigValueWithoutComma() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0 0:1"); + + exception.expect(YarnException.class); + exception.expectMessage("Illegal format of GPU device"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigValueWithoutComma2() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0.1 0.2"); + + exception.expect(YarnException.class); + exception.expectMessage("Illegal format of GPU device"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigValueWithoutColonSeparator() + throws YarnException { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0.1,0.2"); + + exception.expect(YarnException.class); + exception.expectMessage("Illegal format of GPU device"); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); } }