diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 6e3cf1315ce..a2467ec72a1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -41,6 +41,8 @@
import java.util.Map;
import java.util.Set;
+import static org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser.GPU_SCRIPT_REFERENCE;
+
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class GpuDiscoverer {
@@ -70,7 +72,7 @@
private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
private int numOfErrorExecutionSinceLastSucceed = 0;
- GpuDeviceInformation lastDiscoveredGpuInformation = null;
+ private GpuDeviceInformation lastDiscoveredGpuInformation = null;
private void validateConfOrThrowException() throws YarnException {
if (conf == null) {
@@ -79,6 +81,27 @@ private void validateConfOrThrowException() throws YarnException {
}
}
+ /**
+  * Builds the message reported when running the GPU device detection script
+  * fails.
+  *
+  * @param msg message of the exception raised by the failed execution
+  * @return the generic execution-failure text followed by {@code msg}
+  */
+ private String getErrorMessageOfScriptExecution(String msg) {
+   final String failurePrefix = getFailedToExecuteScriptMessage();
+   return failurePrefix + "! Exception message: " + msg;
+ }
+
+ /**
+  * Builds the message reported once the number of consecutive failed script
+  * executions equals {@code MAX_REPEATED_ERROR_ALLOWED}.
+  *
+  * @return the generic execution-failure text followed by the failure count
+  *         and a note that further executions are skipped
+  */
+ private String getErrorMessageOfScriptExecutionThresholdReached() {
+   final String failurePrefix = getFailedToExecuteScriptMessage();
+   return failurePrefix + " for " + MAX_REPEATED_ERROR_ALLOWED
+       + " times, skipping following executions!";
+ }
+
+ /**
+  * Builds the common prefix of all script-execution failure messages.
+  *
+  * @return "Failed to execute", the script reference and, in parentheses,
+  *         the configured path of the GPU binary
+  */
+ private String getFailedToExecuteScriptMessage() {
+   final String scriptWithPath =
+       GPU_SCRIPT_REFERENCE + " (" + pathOfGpuBinary + ")";
+   return "Failed to execute " + scriptWithPath;
+ }
+
+ /**
+  * Builds the message reported when the output of the GPU device detection
+  * script cannot be parsed.
+  *
+  * @param msg message of the exception raised by the parser
+  * @return the parse-failure text naming the script and, in parentheses, the
+  *         path of the GPU binary, followed by {@code msg}
+  */
+ private String getFailedToParseErrorMessage(String msg) {
+   // Use the same "<script> (<path>)! Exception message: <cause>" layout as
+   // the execution-failure messages; previously the path was glued to the
+   // script reference and the cause was appended without any separator.
+   return "Failed to parse XML output of " + GPU_SCRIPT_REFERENCE
+       + " (" + pathOfGpuBinary + ")! Exception message: " + msg;
+ }
+
/**
* Get GPU device information from system.
* This need to be called after initialize.
@@ -100,10 +123,7 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
}
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
- String msg =
- "Failed to execute GPU device information detection script for "
- + MAX_REPEATED_ERROR_ALLOWED
- + " times, skip following executions.";
+ final String msg = getErrorMessageOfScriptExecutionThresholdReached();
LOG.error(msg);
throw new YarnException(msg);
}
@@ -118,16 +138,14 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
return info;
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
- String msg =
- "Failed to execute " + pathOfGpuBinary + " exception message:" + e
- .getMessage() + ", continue ...";
+ final String msg = getErrorMessageOfScriptExecution(e.getMessage());
if (LOG.isDebugEnabled()) {
LOG.debug(msg);
}
- throw new YarnException(e);
+ throw new YarnException(msg, e);
} catch (YarnException e) {
numOfErrorExecutionSinceLastSucceed++;
- String msg = "Failed to parse xml output" + e.getMessage();
+ String msg = getFailedToParseErrorMessage(e.getMessage());
if (LOG.isDebugEnabled()) {
LOG.warn(msg, e);
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
index 1bd92f63a88..894cb70ca61 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -43,6 +43,8 @@
public class GpuDeviceInformationParser {
private static final Logger LOG = LoggerFactory.getLogger(
GpuDeviceInformationParser.class);
+ public static final String GPU_SCRIPT_REFERENCE = "GPU device detection " +
+ "script";
private Unmarshaller unmarshaller = null;
private XMLReader xmlReader = null;
@@ -70,7 +72,9 @@ public synchronized GpuDeviceInformation parseXml(String xmlContent)
try {
init();
} catch (SAXException | ParserConfigurationException | JAXBException e) {
- LOG.error("Exception while initialize parser", e);
+ String msg = "Exception while initializing parser for " +
+ GPU_SCRIPT_REFERENCE;
+ LOG.error(msg, e);
throw new YarnException(e);
}
}
@@ -80,8 +84,10 @@ public synchronized GpuDeviceInformation parseXml(String xmlContent)
try {
return (GpuDeviceInformation) unmarshaller.unmarshal(source);
} catch (JAXBException e) {
- LOG.error("Exception while parsing xml", e);
- throw new YarnException(e);
+ String msg = "Failed to parse XML output of " +
+ GPU_SCRIPT_REFERENCE + "!";
+ LOG.error(msg, e);
+ throw new YarnException(msg, e);
}
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index 4abb633a69a..f1ec032db91 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -20,19 +20,27 @@
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
+import org.junit.Rule;
import org.junit.Test;
+import org.junit.rules.ExpectedException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.PrintWriter;
import java.util.List;
+import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.CoreMatchers.not;
+
public class TestGpuDiscoverer {
private String getTestParentFolder() {
File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
@@ -43,14 +51,54 @@ private void touchFile(File f) throws IOException {
new FileOutputStream(f).close();
}
+ @Rule
+ public ExpectedException expected = ExpectedException.none();
+
+ /**
+  * Recreates an empty working directory for the test: the folder returned by
+  * getTestParentFolder() is deleted recursively and then created again.
+  * NOTE(review): assumeNotWindows() presumably skips the test on Windows -
+  * confirm against PlatformAssumptions.
+  */
@Before
public void before() throws IOException {
+ assumeNotWindows();
String folder = getTestParentFolder();
File f = new File(folder);
FileUtils.deleteDirectory(f);
f.mkdirs();
}
+ /**
+  * Writes a stand-in for the GPU detection binary that prints an empty but
+  * well-formed XML document, so that tests expecting a successful query get
+  * parseable output.
+  * NOTE(review): the root element name {@code nvidia_smi_log} is taken from
+  * nvidia-smi's XML output format - confirm it matches the JAXB mapping of
+  * GpuDeviceInformation.
+  *
+  * @param file script file whose content is replaced
+  * @throws IOException if the file cannot be opened for writing
+  */
+ private static void createNvidiaSmiScript(File file) throws IOException {
+   // try-with-resources closes the writer even if a write throws.
+   try (PrintWriter fileWriter = new PrintWriter(file)) {
+     if (Shell.WINDOWS) {
+       // In cmd.exe '^' escapes '<' and '>' so echo prints them literally.
+       fileWriter.println("@echo ^<nvidia_smi_log^>^</nvidia_smi_log^>");
+     } else {
+       fileWriter.write("#!/bin/bash\n\n");
+       fileWriter.write("echo '<nvidia_smi_log></nvidia_smi_log>'");
+     }
+   }
+ }
+
+ /**
+  * Writes a stand-in for the GPU detection binary whose command line is
+  * deliberately malformed; the tests using it expect the script execution
+  * itself to fail.
+  *
+  * @param file script file whose content is replaced
+  * @throws IOException if the file cannot be opened for writing
+  */
+ private static void createwFaultyNvidiaSmiScript(File file)
+     throws IOException {
+   // try-with-resources closes the writer even if a write throws.
+   try (PrintWriter fileWriter = new PrintWriter(file)) {
+     if (Shell.WINDOWS) {
+       fileWriter.println("@echo <<");
+     } else {
+       fileWriter.write("#!/bin/bash\n\n");
+       fileWriter.write("echo <<'");
+     }
+   }
+ }
+
+ /**
+  * Writes a stand-in for the GPU detection binary that runs successfully but
+  * prints malformed XML (the closing tag does not match the opening tag), so
+  * that the failure happens while parsing rather than while executing.
+  * NOTE(review): the opening tag {@code nvidia_smi_log} follows nvidia-smi's
+  * XML output format - confirm against the JAXB mapping in use.
+  *
+  * @param file script file whose content is replaced
+  * @throws IOException if the file cannot be opened for writing
+  */
+ private static void createwNvidiaSmiScriptWithInvalidXml(File file)
+     throws IOException {
+   // try-with-resources closes the writer even if a write throws.
+   try (PrintWriter fileWriter = new PrintWriter(file)) {
+     if (Shell.WINDOWS) {
+       // In cmd.exe '^' escapes '<' and '>' so echo prints them literally.
+       fileWriter.println("@echo ^<nvidia_smi_log^>^</bla^>");
+     } else {
+       fileWriter.write("#!/bin/bash\n\n");
+       fileWriter.write("echo '<nvidia_smi_log></bla>'");
+     }
+   }
+ }
+
+
@Test
public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
// Only run this on demand.
@@ -89,6 +137,157 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
}
+ /**
+  * A single query against an executable stand-in script must return a
+  * non-null device information object.
+  */
+ @Test
+ public void testGetGpuDeviceInformationValidNvidiaSmiScript()
+     throws YarnException, IOException {
+   // Create the stand-in binary and make it executable for the owner.
+   File script = new File(getTestParentFolder(),
+       GpuDiscoverer.DEFAULT_BINARY_NAME);
+   touchFile(script);
+   createNvidiaSmiScript(script);
+   final String scriptPath = script.getAbsolutePath();
+   Shell.execCommand(
+       Shell.getSetPermissionCommand("u+x", false, scriptPath));
+
+   // Point the discoverer at the folder that holds the script.
+   Configuration config = new Configuration(false);
+   config.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+   GpuDiscoverer discoverer = new GpuDiscoverer();
+   discoverer.initialize(config);
+   Assert.assertEquals(scriptPath, discoverer.getPathOfGpuBinary());
+   Assert.assertNull(discoverer.getEnvironmentToRunCommand().get("PATH"));
+
+   GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
+   Assert.assertNotNull(info);
+ }
+
+ /**
+  * Five consecutive queries against an executable stand-in script must each
+  * return a non-null device information object.
+  */
+ @Test
+ public void testGetGpuDeviceInformationFakeNvidiaSmiScriptConsecutiveRun()
+     throws YarnException, IOException {
+   // Create the stand-in binary and make it executable for the owner.
+   File script = new File(getTestParentFolder(),
+       GpuDiscoverer.DEFAULT_BINARY_NAME);
+   touchFile(script);
+   createNvidiaSmiScript(script);
+   final String scriptPath = script.getAbsolutePath();
+   Shell.execCommand(
+       Shell.getSetPermissionCommand("u+x", false, scriptPath));
+
+   // Point the discoverer at the folder that holds the script.
+   Configuration config = new Configuration(false);
+   config.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+   GpuDiscoverer discoverer = new GpuDiscoverer();
+   discoverer.initialize(config);
+   Assert.assertEquals(scriptPath, discoverer.getPathOfGpuBinary());
+   Assert.assertNull(discoverer.getEnvironmentToRunCommand().get("PATH"));
+
+   int remainingRuns = 5;
+   while (remainingRuns-- > 0) {
+     Assert.assertNotNull(discoverer.getGpuDeviceInformation());
+   }
+ }
+
+ /**
+  * Querying through the deliberately broken stand-in script must raise a
+  * YarnException whose message contains the execution-failure text.
+  */
+ @Test
+ public void testGetGpuDeviceInformationFaultyNvidiaSmiScript()
+     throws YarnException, IOException {
+   // Create the broken stand-in binary and make it executable.
+   File script = new File(getTestParentFolder(),
+       GpuDiscoverer.DEFAULT_BINARY_NAME);
+   touchFile(script);
+   createwFaultyNvidiaSmiScript(script);
+   final String scriptPath = script.getAbsolutePath();
+   Shell.execCommand(
+       Shell.getSetPermissionCommand("u+x", false, scriptPath));
+
+   // Point the discoverer at the folder that holds the script.
+   Configuration config = new Configuration(false);
+   config.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+   GpuDiscoverer discoverer = new GpuDiscoverer();
+   discoverer.initialize(config);
+   Assert.assertEquals(scriptPath, discoverer.getPathOfGpuBinary());
+   Assert.assertNull(discoverer.getEnvironmentToRunCommand().get("PATH"));
+
+   // The expectation has to be registered before the call that throws.
+   expected.expect(YarnException.class);
+   expected.expectMessage("Failed to execute GPU device detection script");
+   discoverer.getGpuDeviceInformation();
+ }
+
+ /**
+  * After one successful query the script is replaced by a broken one. The
+  * next 10 queries must fail with the plain execution-failure message, the
+  * query after those must fail with the "for 10 times" message, and the
+  * discoverer must still return a non-null list of usable GPUs.
+  */
+ @Test
+ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
+     throws YarnException, IOException {
+   // Start with a working stand-in binary that is executable.
+   File script = new File(getTestParentFolder(),
+       GpuDiscoverer.DEFAULT_BINARY_NAME);
+   touchFile(script);
+   createNvidiaSmiScript(script);
+   final String scriptPath = script.getAbsolutePath();
+   Shell.execCommand(
+       Shell.getSetPermissionCommand("u+x", false, scriptPath));
+
+   // Point the discoverer at the folder that holds the script.
+   Configuration config = new Configuration(false);
+   config.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+   GpuDiscoverer discoverer = new GpuDiscoverer();
+   discoverer.initialize(config);
+   Assert.assertEquals(scriptPath, discoverer.getPathOfGpuBinary());
+   Assert.assertNull(discoverer.getEnvironmentToRunCommand().get("PATH"));
+
+   // One query against the working script has to succeed first.
+   discoverer.getGpuDeviceInformation();
+
+   // Swap the script content for the broken variant.
+   createwFaultyNvidiaSmiScript(script);
+
+   final String failureMsg = "Failed to execute GPU device detection script";
+   final String thresholdMsg = "Failed to execute GPU device " +
+       "detection script (" + scriptPath + ") for 10 times";
+   final String unexpectedSuccess =
+       "Query of GPU device info via nvidia-smi should fail as " +
+       "script should be faulty: " + script;
+
+   // Each of the 10 failing queries reports the plain failure message.
+   for (int attempt = 0; attempt < 10; attempt++) {
+     try {
+       discoverer.getGpuDeviceInformation();
+       Assert.fail(unexpectedSuccess);
+     } catch (YarnException e) {
+       final String actual = e.getMessage();
+       Assert.assertThat(actual, containsString(failureMsg));
+       Assert.assertThat(actual, not(containsString(thresholdMsg)));
+     }
+   }
+
+   // The following query is expected to report the threshold message.
+   try {
+     discoverer.getGpuDeviceInformation();
+     Assert.fail(unexpectedSuccess);
+   } catch (YarnException e) {
+     Assert.assertThat(e.getMessage(), containsString(thresholdMsg));
+   }
+
+   // The discoverer must still report usable GPUs after the failures.
+   Assert.assertNotNull(discoverer.getGpusUsableByYarn());
+ }
+
+ /**
+  * Querying through the stand-in script created by
+  * createwNvidiaSmiScriptWithInvalidXml must raise a YarnException whose
+  * message contains the XML parse-failure text.
+  */
+ @Test
+ public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
+     throws YarnException, IOException {
+   // Create the stand-in binary and make it executable for the owner.
+   File script = new File(getTestParentFolder(),
+       GpuDiscoverer.DEFAULT_BINARY_NAME);
+   touchFile(script);
+   createwNvidiaSmiScriptWithInvalidXml(script);
+   final String scriptPath = script.getAbsolutePath();
+   Shell.execCommand(
+       Shell.getSetPermissionCommand("u+x", false, scriptPath));
+
+   // Point the discoverer at the folder that holds the script.
+   Configuration config = new Configuration(false);
+   config.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+   GpuDiscoverer discoverer = new GpuDiscoverer();
+   discoverer.initialize(config);
+   Assert.assertEquals(scriptPath, discoverer.getPathOfGpuBinary());
+   Assert.assertNull(discoverer.getEnvironmentToRunCommand().get("PATH"));
+
+   // The expectation has to be registered before the call that throws.
+   expected.expect(YarnException.class);
+   expected.expectMessage("Failed to parse XML output of " +
+       "GPU device detection script");
+   discoverer.getGpuDeviceInformation();
+ }
+
@Test
public void testGpuDiscover() throws YarnException {
// Since this is more of a performance unit test, only run if