diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 6e3cf1315ce..a2467ec72a1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -41,6 +41,8 @@
import java.util.Map;
import java.util.Set;
+import static org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser.GPU_SCRIPT_REFERENCE;
+
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class GpuDiscoverer {
@@ -70,7 +72,7 @@
private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
private int numOfErrorExecutionSinceLastSucceed = 0;
- GpuDeviceInformation lastDiscoveredGpuInformation = null;
+ private GpuDeviceInformation lastDiscoveredGpuInformation = null;
private void validateConfOrThrowException() throws YarnException {
if (conf == null) {
@@ -79,6 +81,27 @@ private void validateConfOrThrowException() throws YarnException {
}
}
+ private String getErrorMessageOfScriptExecution(String msg) {
+ final String prefix = getFailedToExecuteScriptMessage();
+ return prefix + "! Exception message: " + msg; // msg: cause's message
+ }
+
+ private String getErrorMessageOfScriptExecutionThresholdReached() {
+ final String prefix = getFailedToExecuteScriptMessage();
+ return prefix + " for " + MAX_REPEATED_ERROR_ALLOWED + " times, " +
+ "skipping following executions!";
+ }
+
+ private String getFailedToExecuteScriptMessage() {
+ final String location = " (" + pathOfGpuBinary + ")";
+ return "Failed to execute " + GPU_SCRIPT_REFERENCE + location;
+ }
+
+ private String getFailedToParseErrorMessage(String msg) {
+ return "Failed to parse XML output of " + GPU_SCRIPT_REFERENCE
+ + " (" + pathOfGpuBinary + "): " + msg; // "<what> (<path>): <cause>"
+ }
+
/**
* Get GPU device information from system.
* This need to be called after initialize.
@@ -100,10 +123,7 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
}
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
- String msg =
- "Failed to execute GPU device information detection script for "
- + MAX_REPEATED_ERROR_ALLOWED
- + " times, skip following executions.";
+ final String msg = getErrorMessageOfScriptExecutionThresholdReached();
LOG.error(msg);
throw new YarnException(msg);
}
@@ -118,16 +138,14 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
return info;
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
- String msg =
- "Failed to execute " + pathOfGpuBinary + " exception message:" + e
- .getMessage() + ", continue ...";
+ final String msg = getErrorMessageOfScriptExecution(e.getMessage());
if (LOG.isDebugEnabled()) {
LOG.debug(msg);
}
- throw new YarnException(e);
+ throw new YarnException(msg, e);
} catch (YarnException e) {
numOfErrorExecutionSinceLastSucceed++;
- String msg = "Failed to parse xml output" + e.getMessage();
+ String msg = getFailedToParseErrorMessage(e.getMessage());
if (LOG.isDebugEnabled()) {
LOG.warn(msg, e);
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
index 1bd92f63a88..894cb70ca61 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -43,6 +43,8 @@
public class GpuDeviceInformationParser {
private static final Logger LOG = LoggerFactory.getLogger(
GpuDeviceInformationParser.class);
+ public static final String GPU_SCRIPT_REFERENCE = "GPU device detection " +
+ "script";
private Unmarshaller unmarshaller = null;
private XMLReader xmlReader = null;
@@ -70,7 +72,9 @@ public synchronized GpuDeviceInformation parseXml(String xmlContent)
try {
init();
} catch (SAXException | ParserConfigurationException | JAXBException e) {
- LOG.error("Exception while initialize parser", e);
+ String msg = "Exception while initializing parser for " +
+ GPU_SCRIPT_REFERENCE;
+ LOG.error(msg, e);
throw new YarnException(e);
}
}
@@ -80,8 +84,10 @@ public synchronized GpuDeviceInformation parseXml(String xmlContent)
try {
return (GpuDeviceInformation) unmarshaller.unmarshal(source);
} catch (JAXBException e) {
- LOG.error("Exception while parsing xml", e);
- throw new YarnException(e);
+ String msg = "Failed to parse XML output of " +
+ GPU_SCRIPT_REFERENCE + "!";
+ LOG.error(msg, e);
+ throw new YarnException(msg, e);
}
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index 4abb633a69a..285693214c8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -20,20 +20,38 @@
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
-import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
+import org.junit.Rule;
import org.junit.Test;
+import org.junit.rules.ExpectedException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.PrintWriter;
import java.util.List;
+import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.CoreMatchers.not;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
public class TestGpuDiscoverer {
+
+ private static final String PATH = "PATH";
+ private static final String NVIDIA = "nvidia";
+ private static final String EXEC_PERMISSION = "u+x";
+
private String getTestParentFolder() {
File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
return f.getAbsolutePath();
@@ -43,14 +61,48 @@ private void touchFile(File f) throws IOException {
new FileOutputStream(f).close();
}
+ @Rule
+ public ExpectedException expected = ExpectedException.none();
+
@Before
public void before() throws IOException {
+ assumeNotWindows(); // the fake nvidia-smi fixtures are bash scripts
String folder = getTestParentFolder();
File f = new File(folder);
FileUtils.deleteDirectory(f);
f.mkdirs();
}
+ private static void createNvidiaSmiScript(File file) throws IOException {
+ try (PrintWriter fileWriter = new PrintWriter(file)) { // always closed
+ fileWriter.write("#!/bin/bash\n\n");
+ fileWriter.write("echo '<nvidia_smi_log></nvidia_smi_log>'");
+ }
+ }
+
+ private static void createwFaultyNvidiaSmiScript(File file)
+ throws IOException {
+ try (PrintWriter fileWriter = new PrintWriter(file)) { // always closed
+ fileWriter.write("#!/bin/bash\n\n");
+ fileWriter.write("echo <<'"); // malformed shell; execution should fail
+ }
+ }
+
+ private static void createwNvidiaSmiScriptWithInvalidXml(File file)
+ throws IOException {
+ try (PrintWriter fileWriter = new PrintWriter(file)) { // always closed
+ fileWriter.write("#!/bin/bash\n\n");
+ fileWriter.write("echo ''"); // output is expected to fail XML parsing
+ }
+ }
+
+
+ private void assertNvidiaIsOnPath(GpuDiscoverer plugin) {
+ final String pathValue = plugin.getEnvironmentToRunCommand().get(PATH);
+ assertNotNull(pathValue); // PATH must be present in the command env
+ assertTrue(pathValue.contains(NVIDIA));
+ }
+
@Test
public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
// Only run this on demand.
@@ -61,11 +113,9 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
Configuration conf = new Configuration(false);
GpuDiscoverer plugin = new GpuDiscoverer();
plugin.initialize(conf);
- Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+ assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
plugin.getPathOfGpuBinary());
- Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
- Assert.assertTrue(
- plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+ assertNvidiaIsOnPath(plugin);
// test case 2, check mandatory set path.
File fakeBinary = new File(getTestParentFolder(),
@@ -74,19 +124,169 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
plugin = new GpuDiscoverer();
plugin.initialize(conf);
- Assert.assertEquals(fakeBinary.getAbsolutePath(),
+ assertEquals(fakeBinary.getAbsolutePath(),
plugin.getPathOfGpuBinary());
- Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+ assertNull(plugin.getEnvironmentToRunCommand().get(PATH));
- // test case 3, check mandatory set path, but binary doesn't exist so default
- // path will be used.
+ // test case 3, check mandatory set path,
+ // but binary doesn't exist so default path will be used.
fakeBinary.delete();
plugin = new GpuDiscoverer();
plugin.initialize(conf);
- Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+ assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+ plugin.getPathOfGpuBinary());
+ assertNvidiaIsOnPath(plugin);
+ }
+
+ @Test
+ public void testGetGpuDeviceInformationValidNvidiaSmiScript()
+ throws YarnException, IOException {
+ Configuration conf = new Configuration(false);
+
+ File fakeBinary = new File(getTestParentFolder(),
+ GpuDiscoverer.DEFAULT_BINARY_NAME);
+ touchFile(fakeBinary);
+ createNvidiaSmiScript(fakeBinary); // stand-in for the real binary
+ Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false,
+ fakeBinary.getAbsolutePath()));
+
+ conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf); // should pick up the fake binary
+ assertEquals(fakeBinary.getAbsolutePath(),
+ plugin.getPathOfGpuBinary());
+ assertNull(plugin.getEnvironmentToRunCommand().get(PATH));
+
+ GpuDeviceInformation result =
+ plugin.getGpuDeviceInformation();
+ assertNotNull(result); // a single query must succeed
+ }
+
+ @Test
+ public void testGetGpuDeviceInformationFakeNvidiaSmiScriptConsecutiveRun()
+ throws YarnException, IOException {
+ Configuration conf = new Configuration(false);
+
+ File fakeBinary = new File(getTestParentFolder(),
+ GpuDiscoverer.DEFAULT_BINARY_NAME);
+ touchFile(fakeBinary);
+ createNvidiaSmiScript(fakeBinary); // stand-in for the real binary
+ Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false,
+ fakeBinary.getAbsolutePath()));
+
+ conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf); // should pick up the fake binary
+ assertEquals(fakeBinary.getAbsolutePath(),
+ plugin.getPathOfGpuBinary());
+ assertNull(plugin.getEnvironmentToRunCommand().get(PATH));
+
+ for (int i = 0; i < 5; i++) { // repeated queries must keep succeeding
+ GpuDeviceInformation result = plugin.getGpuDeviceInformation();
+ assertNotNull(result);
+ }
+ }
+
+ @Test
+ public void testGetGpuDeviceInformationFaultyNvidiaSmiScript()
+ throws YarnException, IOException {
+ Configuration conf = new Configuration(false);
+
+ File fakeBinary =
+ new File(getTestParentFolder(), GpuDiscoverer.DEFAULT_BINARY_NAME);
+ touchFile(fakeBinary);
+ createwFaultyNvidiaSmiScript(fakeBinary); // script expected to fail
+ Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false,
+ fakeBinary.getAbsolutePath()));
+
+ conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf); // should pick up the fake binary
+ assertEquals(fakeBinary.getAbsolutePath(),
+ plugin.getPathOfGpuBinary());
+ assertNull(plugin.getEnvironmentToRunCommand().get(PATH));
+
+ expected.expect(YarnException.class); // must precede the failing call
+ expected.expectMessage("Failed to execute GPU device detection script");
+ plugin.getGpuDeviceInformation();
+ }
+
+ @Test
+ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
+ throws YarnException, IOException {
+ Configuration conf = new Configuration(false);
+
+ File fakeBinary =
+ new File(getTestParentFolder(), GpuDiscoverer.DEFAULT_BINARY_NAME);
+ touchFile(fakeBinary);
+ createNvidiaSmiScript(fakeBinary);
+ Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false,
+ fakeBinary.getAbsolutePath()));
+
+ conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ assertEquals(fakeBinary.getAbsolutePath(),
plugin.getPathOfGpuBinary());
- Assert.assertTrue(
- plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+ assertNull(plugin.getEnvironmentToRunCommand().get(PATH));
+
+ // Query the working nvidia-smi script once so that one call succeeds.
+ plugin.getGpuDeviceInformation();
+
+ // Overwrite the script in place with the faulty variant.
+ createwFaultyNvidiaSmiScript(fakeBinary);
+
+ final String terminateMsg = "Failed to execute GPU device " +
+ "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times";
+ final String msg = "Failed to execute GPU device detection script";
+ // Run the faulty script 10 times; the threshold message must not appear yet.
+ for (int i = 0; i < 10; i++) {
+ try {
+ plugin.getGpuDeviceInformation();
+ fail("Query of GPU device info via nvidia-smi should fail as " +
+ "script should be faulty: " + fakeBinary);
+ } catch (YarnException e) {
+ assertThat(e.getMessage(), containsString(msg));
+ assertThat(e.getMessage(), not(containsString(terminateMsg)));
+ }
+ }
+
+ // The next call should hit the repeated-error threshold.
+ try {
+ plugin.getGpuDeviceInformation();
+ fail("Query of GPU device info via nvidia-smi should fail as " +
+ "script should be faulty: " + fakeBinary);
+ } catch (YarnException e) {
+ assertThat(e.getMessage(), containsString(terminateMsg));
+ }
+
+ // Verify that usable GPUs can still be queried after the failures.
+ assertNotNull(plugin.getGpusUsableByYarn());
+ }
+
+ @Test
+ public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
+ throws YarnException, IOException {
+ Configuration conf = new Configuration(false);
+
+ File fakeBinary =
+ new File(getTestParentFolder(), GpuDiscoverer.DEFAULT_BINARY_NAME);
+ touchFile(fakeBinary);
+ createwNvidiaSmiScriptWithInvalidXml(fakeBinary);
+ Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false,
+ fakeBinary.getAbsolutePath()));
+
+ conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf); // should pick up the fake binary
+ assertEquals(fakeBinary.getAbsolutePath(),
+ plugin.getPathOfGpuBinary());
+ assertNull(plugin.getEnvironmentToRunCommand().get(PATH));
+
+ expected.expect(YarnException.class); // must precede the failing call
+ expected.expectMessage("Failed to parse XML output of " +
+ "GPU device detection script");
+ plugin.getGpuDeviceInformation();
}
@Test
@@ -100,8 +300,8 @@ public void testGpuDiscover() throws YarnException {
plugin.initialize(conf);
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
- Assert.assertTrue(info.getGpus().size() > 0);
- Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
+ assertTrue(info.getGpus().size() > 0);
+ assertEquals(plugin.getGpusUsableByYarn().size(),
info.getGpus().size());
}
@@ -115,7 +315,7 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException {
try {
plugin.initialize(conf);
plugin.getGpusUsableByYarn();
- Assert.fail("Illegal format, should fail.");
+ fail("Illegal format, should fail.");
} catch (YarnException e) {
// Expected
}
@@ -126,16 +326,16 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException {
plugin.initialize(conf);
List usableGpuDevices = plugin.getGpusUsableByYarn();
- Assert.assertEquals(4, usableGpuDevices.size());
+ assertEquals(4, usableGpuDevices.size());
- Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
- Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
- Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
- Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
+ assertEquals(0, usableGpuDevices.get(0).getIndex());
+ assertEquals(1, usableGpuDevices.get(1).getIndex());
+ assertEquals(2, usableGpuDevices.get(2).getIndex());
+ assertEquals(3, usableGpuDevices.get(3).getIndex());
- Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
- Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
- Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
- Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
+ assertEquals(0, usableGpuDevices.get(0).getMinorNumber());
+ assertEquals(1, usableGpuDevices.get(1).getMinorNumber());
+ assertEquals(2, usableGpuDevices.get(2).getMinorNumber());
+ assertEquals(4, usableGpuDevices.get(3).getMinorNumber());
}
}