diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index de66e7525e1..360ae1baab1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1649,9 +1649,6 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_GPU_PATH_TO_EXEC = NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; - @Private - public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = ""; - /** * Settings to control which implementation of docker plugin for GPU will be * used. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..0f59ca5960d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -19,8 +19,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; @@ -93,12 +93,6 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { validateConfOrThrowException(); - if (null == pathOfGpuBinary) { - throw new YarnException( - "Failed to find GPU discovery executable, please double check " - + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); - } - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { String msg = "Failed to execute GPU device information detection script for " @@ -197,49 +191,13 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() public synchronized void initialize(Configuration conf) throws YarnException { this.conf = conf; numOfErrorExecutionSinceLastSucceed = 0; - String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, - YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC); - if (pathToExecutable.isEmpty()) { - pathToExecutable = DEFAULT_BINARY_NAME; - } - - // Validate file existence - File binaryPath = new File(pathToExecutable); - - if (!binaryPath.exists()) { - // When binary not exist, use default setting. - boolean found = false; - for (String dir : DEFAULT_BINARY_SEARCH_DIRS) { - binaryPath = new File(dir, DEFAULT_BINARY_NAME); - if (binaryPath.exists()) { - found = true; - pathOfGpuBinary = binaryPath.getAbsolutePath(); - break; - } - } - - if (!found) { - LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath() - + ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC - + "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME); - } - } else{ - // If path specified by user is a directory, use - if (binaryPath.isDirectory()) { - binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME); - LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME - + " under the directory, updated path-to-executable:" + binaryPath - .getAbsolutePath()); - } - // Validated - pathOfGpuBinary = binaryPath.getAbsolutePath(); - } + lookUpAutoDiscoveryBinary(conf); // Try to discover GPU information once and print try { LOG.info("Trying to discover GPU information ..."); GpuDeviceInformation info = getGpuDeviceInformation(); - LOG.info(info.toString()); + LOG.info("Discovered GPU information: " + info.toString()); } catch (YarnException e) { String msg = "Failed to discover GPU information from system, exception message:" @@ -248,6 +206,69 @@ public synchronized void initialize(Configuration conf) throws YarnException { } } + private void lookUpAutoDiscoveryBinary(Configuration conf) throws YarnException { + String configuredBinaryPath = conf.get( + YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME); + if (configuredBinaryPath.isEmpty()) { + configuredBinaryPath = DEFAULT_BINARY_NAME; + } + + File binaryPath; + File configuredBinaryFile = new File(configuredBinaryPath); + if (!configuredBinaryFile.exists()) { + binaryPath = lookupBinaryInDefaultDirs(); + } else if (configuredBinaryFile.isDirectory()) { + binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile); + } else { + binaryPath = configuredBinaryFile; + } + pathOfGpuBinary = binaryPath.getAbsolutePath(); + } + + private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile) throws YarnException { + File binaryPath = new File(configuredBinaryFile, DEFAULT_BINARY_NAME); + if (!binaryPath.exists()) { + throw new YarnException("Failed to find GPU discovery executable, please double check " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting. " + + "The setting points to a directory but no file found in the directory with name:" + + DEFAULT_BINARY_NAME); + } else { + LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME + + " under the directory, updated path-to-executable:" + + binaryPath.getAbsolutePath()); + } + return binaryPath; + } + + private File lookupBinaryInDefaultDirs() throws YarnException { + final File lookedUpBinary = lookupBinaryInDefaultDirsInternal(); + if (lookedUpBinary == null) { + throw new YarnException("Failed to find GPU discovery executable, please double check " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting. " + + "Also tried to find the executable in the default directories: " + + DEFAULT_BINARY_SEARCH_DIRS); + } + return lookedUpBinary; + } + + private File lookupBinaryInDefaultDirsInternal() { + Set triedBinaryPaths = Sets.newHashSet(); + for (String dir : DEFAULT_BINARY_SEARCH_DIRS) { + File binaryPath = new File(dir, DEFAULT_BINARY_NAME); + if (binaryPath.exists()) { + return binaryPath; + } else { + triedBinaryPaths.add(binaryPath.getAbsolutePath()); + } + } + LOG.warn("Failed to locate GPU device discovery binary, tried paths: " + + triedBinaryPaths + "! Please double check the value of config " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + + ". Using default binary: " + DEFAULT_BINARY_NAME); + + return null; + } + @VisibleForTesting protected Map getEnvironmentToRunCommand() { return environment; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 4abb633a69a..391fe34031a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -43,6 +43,14 @@ private void touchFile(File f) throws IOException { new FileOutputStream(f).close(); } + private File setupFakeBinary(Configuration conf) throws IOException { + File fakeBinary = new File(getTestParentFolder(), + GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + return fakeBinary; + } + @Before public void before() throws IOException { String folder = getTestParentFolder(); @@ -68,10 +76,7 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); // test case 2, check mandatory set path. - File fakeBinary = new File(getTestParentFolder(), - GpuDiscoverer.DEFAULT_BINARY_NAME); - touchFile(fakeBinary); - conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + File fakeBinary = setupFakeBinary(conf); plugin = new GpuDiscoverer(); plugin.initialize(conf); Assert.assertEquals(fakeBinary.getAbsolutePath(), @@ -106,7 +111,8 @@ public void testGpuDiscover() throws YarnException { } @Test - public void getNumberOfUsableGpusFromConfig() throws YarnException { + public void getNumberOfUsableGpusFromConfig() throws YarnException, + IOException { Configuration conf = new Configuration(false); // Illegal format @@ -121,6 +127,7 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException { } // Valid format + setupFakeBinary(conf); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4"); plugin = new GpuDiscoverer(); plugin.initialize(conf); @@ -138,4 +145,20 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException { Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber()); Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber()); } + + @Test + public void testGpuBinaryIsANotExistingFile() { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla"); + GpuDiscoverer plugin = new GpuDiscoverer(); + try { + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + Assert.fail("Illegal format, should fail."); + } catch (YarnException e) { + String message = e.getMessage(); + Assert.assertTrue(message.startsWith("Failed to find GPU discovery executable, please double check")); + Assert.assertTrue(message.contains("Also tried to find the executable in the default directories:")); + } + } }