diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..a2467ec72a1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -41,6 +41,8 @@ import java.util.Map; import java.util.Set; +import static org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser.GPU_SCRIPT_REFERENCE; + @InterfaceAudience.Private @InterfaceStability.Unstable public class GpuDiscoverer { @@ -70,7 +72,7 @@ private GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); private int numOfErrorExecutionSinceLastSucceed = 0; - GpuDeviceInformation lastDiscoveredGpuInformation = null; + private GpuDeviceInformation lastDiscoveredGpuInformation = null; private void validateConfOrThrowException() throws YarnException { if (conf == null) { @@ -79,6 +81,27 @@ private void validateConfOrThrowException() throws YarnException { } } + private String getErrorMessageOfScriptExecution(String msg) { + return getFailedToExecuteScriptMessage() + + "! Exception message: " + msg; + } + + private String getErrorMessageOfScriptExecutionThresholdReached() { + return getFailedToExecuteScriptMessage() + " for " + + MAX_REPEATED_ERROR_ALLOWED + " times, " + + "skipping following executions!"; + } + + private String getFailedToExecuteScriptMessage() { + return "Failed to execute " + GPU_SCRIPT_REFERENCE + + " (" + pathOfGpuBinary + ")"; + } + + private String getFailedToParseErrorMessage(String msg) { + return "Failed to parse XML output of " + GPU_SCRIPT_REFERENCE + + "( " + pathOfGpuBinary + ")" + msg; + } + /** * Get GPU device information from system. * This need to be called after initialize. @@ -100,10 +123,7 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() } if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { - String msg = - "Failed to execute GPU device information detection script for " - + MAX_REPEATED_ERROR_ALLOWED - + " times, skip following executions."; + final String msg = getErrorMessageOfScriptExecutionThresholdReached(); LOG.error(msg); throw new YarnException(msg); } @@ -118,16 +138,14 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() return info; } catch (IOException e) { numOfErrorExecutionSinceLastSucceed++; - String msg = - "Failed to execute " + pathOfGpuBinary + " exception message:" + e - .getMessage() + ", continue ..."; + final String msg = getErrorMessageOfScriptExecution(e.getMessage()); if (LOG.isDebugEnabled()) { LOG.debug(msg); } - throw new YarnException(e); + throw new YarnException(msg, e); } catch (YarnException e) { numOfErrorExecutionSinceLastSucceed++; - String msg = "Failed to parse xml output" + e.getMessage(); + String msg = getFailedToParseErrorMessage(e.getMessage()); if (LOG.isDebugEnabled()) { LOG.warn(msg, e); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java index 1bd92f63a88..894cb70ca61 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java @@ -43,6 +43,8 @@ public class GpuDeviceInformationParser { private static final Logger LOG = LoggerFactory.getLogger( GpuDeviceInformationParser.class); + public static final String GPU_SCRIPT_REFERENCE = "GPU device detection " + + "script"; private Unmarshaller unmarshaller = null; private XMLReader xmlReader = null; @@ -70,7 +72,9 @@ public synchronized GpuDeviceInformation parseXml(String xmlContent) try { init(); } catch (SAXException | ParserConfigurationException | JAXBException e) { - LOG.error("Exception while initialize parser", e); + String msg = "Exception while initializing parser for " + + GPU_SCRIPT_REFERENCE; + LOG.error(msg, e); throw new YarnException(e); } } @@ -80,8 +84,10 @@ public synchronized GpuDeviceInformation parseXml(String xmlContent) try { return (GpuDeviceInformation) unmarshaller.unmarshal(source); } catch (JAXBException e) { - LOG.error("Exception while parsing xml", e); - throw new YarnException(e); + String msg = "Failed to parse XML output of " + + GPU_SCRIPT_REFERENCE + "!"; + LOG.error(msg, e); + throw new YarnException(msg, e); } } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 4abb633a69a..285693214c8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -20,20 +20,38 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; -import org.junit.Assert; import org.junit.Assume; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.io.PrintWriter; import java.util.List; +import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows; +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.CoreMatchers.not; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + public class TestGpuDiscoverer { + + private static final String PATH = "PATH"; + private static final String NVIDIA = "nvidia"; + private static final String EXEC_PERMISSION = "u+x"; + private String getTestParentFolder() { File f = new File("target/temp/" + TestGpuDiscoverer.class.getName()); return f.getAbsolutePath(); @@ -43,14 +61,48 @@ private void touchFile(File f) throws IOException { new FileOutputStream(f).close(); } + @Rule + public ExpectedException expected = ExpectedException.none(); + @Before public void before() throws IOException { + assumeNotWindows(); String folder = getTestParentFolder(); File f = new File(folder); FileUtils.deleteDirectory(f); f.mkdirs(); } + private static void createNvidiaSmiScript(File file) throws IOException { + PrintWriter fileWriter = new PrintWriter(file); + fileWriter.write("#!/bin/bash\n\n"); + fileWriter.write("echo ''"); + fileWriter.close(); + } + + private static void createwFaultyNvidiaSmiScript(File file) + throws IOException { + PrintWriter fileWriter = new PrintWriter(file); + fileWriter.write("#!/bin/bash\n\n"); + fileWriter.write("echo <<'"); + fileWriter.close(); + } + + private static void createwNvidiaSmiScriptWithInvalidXml(File file) + throws IOException { + PrintWriter fileWriter = new PrintWriter(file); + fileWriter.write("#!/bin/bash\n\n"); + fileWriter.write("echo ''"); + fileWriter.close(); + } + + + private void assertNvidiaIsOnPath(GpuDiscoverer plugin) { + String path = plugin.getEnvironmentToRunCommand().get(PATH); + assertNotNull(path); + assertTrue(path.contains(NVIDIA)); + } + @Test public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { // Only run this on demand. @@ -61,11 +113,9 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { Configuration conf = new Configuration(false); GpuDiscoverer plugin = new GpuDiscoverer(); plugin.initialize(conf); - Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, + assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, plugin.getPathOfGpuBinary()); - Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH")); - Assert.assertTrue( - plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); + assertNvidiaIsOnPath(plugin); // test case 2, check mandatory set path. File fakeBinary = new File(getTestParentFolder(), @@ -74,19 +124,169 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); plugin = new GpuDiscoverer(); plugin.initialize(conf); - Assert.assertEquals(fakeBinary.getAbsolutePath(), + assertEquals(fakeBinary.getAbsolutePath(), plugin.getPathOfGpuBinary()); - Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH")); + assertNull(plugin.getEnvironmentToRunCommand().get(PATH)); - // test case 3, check mandatory set path, but binary doesn't exist so default - // path will be used. + // test case 3, check mandatory set path, + // but binary doesn't exist so default path will be used. fakeBinary.delete(); plugin = new GpuDiscoverer(); plugin.initialize(conf); - Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, + assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, + plugin.getPathOfGpuBinary()); + assertNvidiaIsOnPath(plugin); + } + + @Test + public void testGetGpuDeviceInformationValidNvidiaSmiScript() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + + File fakeBinary = new File(getTestParentFolder(), + GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + createNvidiaSmiScript(fakeBinary); + Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false, + fakeBinary.getAbsolutePath())); + + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + assertEquals(fakeBinary.getAbsolutePath(), + plugin.getPathOfGpuBinary()); + assertNull(plugin.getEnvironmentToRunCommand().get(PATH)); + + GpuDeviceInformation result = + plugin.getGpuDeviceInformation(); + assertNotNull(result); + } + + @Test + public void testGetGpuDeviceInformationFakeNvidiaSmiScriptConsecutiveRun() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + + File fakeBinary = new File(getTestParentFolder(), + GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + createNvidiaSmiScript(fakeBinary); + Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false, + fakeBinary.getAbsolutePath())); + + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + assertEquals(fakeBinary.getAbsolutePath(), + plugin.getPathOfGpuBinary()); + assertNull(plugin.getEnvironmentToRunCommand().get(PATH)); + + for (int i = 0; i < 5; i++) { + GpuDeviceInformation result = plugin.getGpuDeviceInformation(); + assertNotNull(result); + } + } + + @Test + public void testGetGpuDeviceInformationFaultyNvidiaSmiScript() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + + File fakeBinary = + new File(getTestParentFolder(), GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + createwFaultyNvidiaSmiScript(fakeBinary); + Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false, + fakeBinary.getAbsolutePath())); + + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + assertEquals(fakeBinary.getAbsolutePath(), + plugin.getPathOfGpuBinary()); + assertNull(plugin.getEnvironmentToRunCommand().get(PATH)); + + expected.expect(YarnException.class); + expected.expectMessage("Failed to execute GPU device detection script"); + plugin.getGpuDeviceInformation(); + } + + @Test + public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + + File fakeBinary = + new File(getTestParentFolder(), GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + createNvidiaSmiScript(fakeBinary); + Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false, + fakeBinary.getAbsolutePath())); + + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + assertEquals(fakeBinary.getAbsolutePath(), plugin.getPathOfGpuBinary()); - Assert.assertTrue( - plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); + assertNull(plugin.getEnvironmentToRunCommand().get(PATH)); + + //make sure to query nvidia-smi correctly, once + plugin.getGpuDeviceInformation(); + + //replace script with faulty one + createwFaultyNvidiaSmiScript(fakeBinary); + + final String terminateMsg = "Failed to execute GPU device " + + "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; + final String msg = "Failed to execute GPU device detection script"; + //execute faulty nvidia-smi script 10 times + for (int i = 0; i < 10; i++) { + try { + plugin.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage(), containsString(msg)); + assertThat(e.getMessage(), not(containsString(terminateMsg))); + } + } + + //we should reach the error threshold + try { + plugin.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage(), containsString(terminateMsg)); + } + + //verify if GPUs are still hold the value of first successful query + assertNotNull(plugin.getGpusUsableByYarn()); + } + + @Test + public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + + File fakeBinary = + new File(getTestParentFolder(), GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + createwNvidiaSmiScriptWithInvalidXml(fakeBinary); + Shell.execCommand(Shell.getSetPermissionCommand(EXEC_PERMISSION, false, + fakeBinary.getAbsolutePath())); + + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + assertEquals(fakeBinary.getAbsolutePath(), + plugin.getPathOfGpuBinary()); + assertNull(plugin.getEnvironmentToRunCommand().get(PATH)); + + expected.expect(YarnException.class); + expected.expectMessage("Failed to parse XML output of " + + "GPU device detection script"); + plugin.getGpuDeviceInformation(); } @Test @@ -100,8 +300,8 @@ public void testGpuDiscover() throws YarnException { plugin.initialize(conf); GpuDeviceInformation info = plugin.getGpuDeviceInformation(); - Assert.assertTrue(info.getGpus().size() > 0); - Assert.assertEquals(plugin.getGpusUsableByYarn().size(), + assertTrue(info.getGpus().size() > 0); + assertEquals(plugin.getGpusUsableByYarn().size(), info.getGpus().size()); } @@ -115,7 +315,7 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException { try { plugin.initialize(conf); plugin.getGpusUsableByYarn(); - Assert.fail("Illegal format, should fail."); + fail("Illegal format, should fail."); } catch (YarnException e) { // Expected } @@ -126,16 +326,16 @@ public void getNumberOfUsableGpusFromConfig() throws YarnException { plugin.initialize(conf); List usableGpuDevices = plugin.getGpusUsableByYarn(); - Assert.assertEquals(4, usableGpuDevices.size()); + assertEquals(4, usableGpuDevices.size()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex()); - Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex()); + assertEquals(0, usableGpuDevices.get(0).getIndex()); + assertEquals(1, usableGpuDevices.get(1).getIndex()); + assertEquals(2, usableGpuDevices.get(2).getIndex()); + assertEquals(3, usableGpuDevices.get(3).getIndex()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber()); - Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber()); + assertEquals(0, usableGpuDevices.get(0).getMinorNumber()); + assertEquals(1, usableGpuDevices.get(1).getMinorNumber()); + assertEquals(2, usableGpuDevices.get(2).getMinorNumber()); + assertEquals(4, usableGpuDevices.get(3).getMinorNumber()); } }