diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 04d0fd14bec..f8f262d9947 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1659,9 +1659,6 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_GPU_PATH_TO_EXEC = NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; - @Private - public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = ""; - /** * Settings to control which implementation of docker plugin for GPU will be * used. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..ac92b6cd53b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -19,8 +19,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; @@ -93,12 +93,6 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { validateConfOrThrowException(); - if (null == pathOfGpuBinary) { - throw new YarnException( - "Failed to find GPU discovery executable, please double check " - + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); - } - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { String msg = "Failed to execute GPU device information detection script for " @@ -180,8 +174,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() String[] kv = s.trim().split(":"); if (kv.length != 2) { throw new YarnException( - "Illegal format, it should be index:minor_number format, now it=" - + s); + "Illegal format, it should be index:minor_number format, " + + "now it is: " + s); } gpuDevices.add( @@ -197,49 +191,13 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() public synchronized void initialize(Configuration conf) throws YarnException { this.conf = conf; numOfErrorExecutionSinceLastSucceed = 0; - String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, - YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC); - if (pathToExecutable.isEmpty()) { - pathToExecutable = DEFAULT_BINARY_NAME; - } - - // Validate file existence - File binaryPath = new File(pathToExecutable); - - if (!binaryPath.exists()) { - // When binary not exist, use default setting. - boolean found = false; - for (String dir : DEFAULT_BINARY_SEARCH_DIRS) { - binaryPath = new File(dir, DEFAULT_BINARY_NAME); - if (binaryPath.exists()) { - found = true; - pathOfGpuBinary = binaryPath.getAbsolutePath(); - break; - } - } - - if (!found) { - LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath() - + ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC - + "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME); - } - } else{ - // If path specified by user is a directory, use - if (binaryPath.isDirectory()) { - binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME); - LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME - + " under the directory, updated path-to-executable:" + binaryPath - .getAbsolutePath()); - } - // Validated - pathOfGpuBinary = binaryPath.getAbsolutePath(); - } + lookUpAutoDiscoveryBinary(conf); // Try to discover GPU information once and print try { LOG.info("Trying to discover GPU information ..."); GpuDeviceInformation info = getGpuDeviceInformation(); - LOG.info(info.toString()); + LOG.info("Discovered GPU information: " + info.toString()); } catch (YarnException e) { String msg = "Failed to discover GPU information from system, exception message:" @@ -248,6 +206,71 @@ public synchronized void initialize(Configuration conf) throws YarnException { } } + private void lookUpAutoDiscoveryBinary(Configuration config) + throws YarnException { + String configuredBinaryPath = config.get( + YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME); + if (configuredBinaryPath.isEmpty()) { + configuredBinaryPath = DEFAULT_BINARY_NAME; + } + + File binaryPath; + File configuredBinaryFile = new File(configuredBinaryPath); + if (!configuredBinaryFile.exists()) { + binaryPath = lookupBinaryInDefaultDirs(); + } else if (configuredBinaryFile.isDirectory()) { + binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile); + } else { + binaryPath = configuredBinaryFile; + } + pathOfGpuBinary = binaryPath.getAbsolutePath(); + } + + private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile) + throws YarnException { + File binaryPath = new File(configuredBinaryFile, DEFAULT_BINARY_NAME); + if (!binaryPath.exists()) { + throw new YarnException("Failed to find GPU discovery executable, " + + "please double check "+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + + " setting. The setting points to a directory but " + + "no file found in the directory with name:" + DEFAULT_BINARY_NAME); + } else { + LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME + + " under the directory, updated path-to-executable:" + + binaryPath.getAbsolutePath()); + } + return binaryPath; + } + + private File lookupBinaryInDefaultDirs() throws YarnException { + final File lookedUpBinary = lookupBinaryInDefaultDirsInternal(); + if (lookedUpBinary == null) { + throw new YarnException("Failed to find GPU discovery executable, " + + "please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC + + " setting. Also tried to find the executable " + + "in the default directories: " + DEFAULT_BINARY_SEARCH_DIRS); + } + return lookedUpBinary; + } + + private File lookupBinaryInDefaultDirsInternal() { + Set triedBinaryPaths = Sets.newHashSet(); + for (String dir : DEFAULT_BINARY_SEARCH_DIRS) { + File binaryPath = new File(dir, DEFAULT_BINARY_NAME); + if (binaryPath.exists()) { + return binaryPath; + } else { + triedBinaryPaths.add(binaryPath.getAbsolutePath()); + } + } + LOG.warn("Failed to locate GPU device discovery binary, tried paths: " + + triedBinaryPaths + "! Please double check the value of config " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + + ". Using default binary: " + DEFAULT_BINARY_NAME); + + return null; + } + @VisibleForTesting protected Map getEnvironmentToRunCommand() { return environment; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index fff9068442f..db94e99b929 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; +import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; @@ -35,16 +36,20 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.TestFpgaDiscoverer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider; +import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -71,9 +76,42 @@ private GpuResourceHandlerImpl gpuResourceHandler; private NMStateStoreService mockNMStateStore; private ConcurrentHashMap runningContainersMap; + private File testDataDirectory; + + public void createTestDataDirectory() throws IOException { + String testDirectoryPath = getTestParentDirectory(); + testDataDirectory = new File(testDirectoryPath); + FileUtils.deleteDirectory(testDataDirectory); + testDataDirectory.mkdirs(); + } + + private String getTestParentDirectory() { + File f = new File("target/temp/" + TestFpgaDiscoverer.class.getName()); + return f.getAbsolutePath(); + } + + private void touchFile(File f) throws IOException { + new FileOutputStream(f).close(); + } + + private Configuration createDefaultConfig() throws IOException { + Configuration conf = new YarnConfiguration(); + File fakeBinary = setupFakeGpuDiscoveryBinary(); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, + fakeBinary.getAbsolutePath()); + return conf; + } + + private File setupFakeGpuDiscoveryBinary() throws IOException { + File fakeBinary = new File(getTestParentDirectory() + "/fake-nvidia-smi"); + touchFile(fakeBinary); + return fakeBinary; + } @Before - public void setup() { + public void setup() throws IOException { + createTestDataDirectory(); + CustomResourceTypesConfigurationProvider. initResourceTypes(ResourceInformation.GPU_URI); @@ -93,9 +131,14 @@ public void setup() { mockPrivilegedExecutor); } + @After + public void cleanupTestFiles() throws IOException { + FileUtils.deleteDirectory(testDataDirectory); + } + @Test public void testBootStrap() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); GpuDiscoverer.getInstance().initialize(conf); @@ -160,7 +203,7 @@ private void verifyDeniedDevices(ContainerId containerId, private void commonTestAllocation(boolean dockerContainerEnabled) throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); GpuDiscoverer.getInstance().initialize(conf); @@ -249,7 +292,7 @@ public void testAllocation() throws Exception { @Test public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); GpuDiscoverer.getInstance().initialize(conf); @@ -278,7 +321,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() @Test public void testAllocationWithoutAllowedGpus() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); GpuDiscoverer.getInstance().initialize(conf); @@ -313,7 +356,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception { @Test public void testAllocationStored() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); GpuDiscoverer.getInstance().initialize(conf); @@ -352,7 +395,7 @@ public void testAllocationStored() throws Exception { public void testAllocationStoredWithNULLStateStore() throws Exception { NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class); - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); Context nmnctx = mock(Context.class); @@ -381,7 +424,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { @Test public void testRecoverResourceAllocation() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); GpuDiscoverer.getInstance().initialize(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 4abb633a69a..391fe34031a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -43,6 +43,14 @@ private void touchFile(File f) throws IOException { new FileOutputStream(f).close(); } + private File setupFakeBinary(Configuration conf) throws IOException { + File fakeBinary = new File(getTestParentFolder(), + GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + return fakeBinary; + } + @Before public void before() throws IOException { String folder = getTestParentFolder(); @@ -68,10 +76,7 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); // test case 2, check mandatory set path. - File fakeBinary = new File(getTestParentFolder(), - GpuDiscoverer.DEFAULT_BINARY_NAME); - touchFile(fakeBinary); - conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + File fakeBinary = setupFakeBinary(conf); plugin = new GpuDiscoverer(); plugin.initialize(conf); Assert.assertEquals(fakeBinary.getAbsolutePath(), @@ -106,7 +111,8 @@ public void testGpuDiscover() throws YarnException { } @Test - public void getNumberOfUsableGpusFromConfig() throws YarnException { + public void getNumberOfUsableGpusFromConfig() throws YarnException, + IOException { Configuration conf = new Configuration(false); // Illegal format @@ -121,6 +127,7 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException { } // Valid format + setupFakeBinary(conf); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4"); plugin = new GpuDiscoverer(); plugin.initialize(conf); @@ -138,4 +145,20 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException { Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber()); Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber()); } + + @Test + public void testGpuBinaryIsANotExistingFile() { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla"); + GpuDiscoverer plugin = new GpuDiscoverer(); + try { + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + Assert.fail("Illegal format, should fail."); + } catch (YarnException e) { + String message = e.getMessage(); + Assert.assertTrue(message.startsWith("Failed to find GPU discovery executable, please double check")); + Assert.assertTrue(message.contains("Also tried to find the executable in the default directories:")); + } + } }