diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..949fabf551b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -67,7 +67,6 @@ private Configuration conf = null; private String pathOfGpuBinary = null; private Map environment = new HashMap<>(); - private GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); private int numOfErrorExecutionSinceLastSucceed = 0; GpuDeviceInformation lastDiscoveredGpuInformation = null; @@ -112,6 +111,7 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() try { output = Shell.execCommand(environment, new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); GpuDeviceInformation info = parser.parseXml(output); numOfErrorExecutionSinceLastSucceed = 0; lastDiscoveredGpuInformation = info; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java index 837d5cc99cd..c830d432a6b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java @@ -21,6 +21,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import java.util.List; @@ -31,14 +32,10 @@ @InterfaceStability.Unstable @XmlRootElement(name = "nvidia_smi_log") public class GpuDeviceInformation { - List gpus; + private List gpus; + private String driverVersion = "N/A"; - String driverVersion = "N/A"; - - // More fields like topology information could be added when needed. - // ... - - @javax.xml.bind.annotation.XmlElement(name = "gpu") + @XmlElement(name = "gpu") public List getGpus() { return gpus; } @@ -47,7 +44,7 @@ public void setGpus(List gpus) { this.gpus = gpus; } - @javax.xml.bind.annotation.XmlElement(name = "driver_version") + @XmlElement(name = "driver_version") public String getDriverVersion() { return driverVersion; } @@ -59,8 +56,9 @@ public void setDriverVersion(String driverVersion) { @Override public String toString() { StringBuilder sb = new StringBuilder(); - sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append( - getDriverVersion()).append("\n"); + sb.append("=== GPUs in the system ===\n") + .append("\tDriver Version:") + .append(getDriverVersion()).append("\n"); if (gpus != null) { for (PerGpuDeviceInformation gpu : gpus) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java index 1bd92f63a88..7cf98a98f43 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java @@ -24,13 +24,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.InputSource; -import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.Unmarshaller; -import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParserFactory; import javax.xml.transform.sax.SAXSource; import java.io.StringReader; @@ -44,43 +42,44 @@ private static final Logger LOG = LoggerFactory.getLogger( GpuDeviceInformationParser.class); - private Unmarshaller unmarshaller = null; - private XMLReader xmlReader = null; + private final Unmarshaller unmarshaller; + private final XMLReader xmlReader; - private void init() - throws SAXException, ParserConfigurationException, JAXBException { + public GpuDeviceInformationParser() throws YarnException { + try { + final SAXParserFactory parserFactory = initSaxParserFactory(); + final JAXBContext jaxbContext = JAXBContext.newInstance( + GpuDeviceInformation.class); + this.xmlReader = parserFactory.newSAXParser().getXMLReader(); + this.unmarshaller = jaxbContext.createUnmarshaller(); + } catch (Exception e) { + LOG.error("Exception while initialize the " + + "GPU device information parser!", e); + throw new YarnException(e); + } + } + + /** + * Disable external-dtd since by default nvidia-smi output contains + * <!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd"> in header. + */ + private SAXParserFactory initSaxParserFactory() throws Exception { SAXParserFactory spf = SAXParserFactory.newInstance(); - // Disable external-dtd since by default nvidia-smi output contains - // in header spf.setFeature( "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); spf.setFeature("http://xml.org/sax/features/validation", false); - - JAXBContext jaxbContext = JAXBContext.newInstance( - GpuDeviceInformation.class); - - this.xmlReader = spf.newSAXParser().getXMLReader(); - this.unmarshaller = jaxbContext.createUnmarshaller(); + return spf; } public synchronized GpuDeviceInformation parseXml(String xmlContent) throws YarnException { - if (unmarshaller == null) { - try { - init(); - } catch (SAXException | ParserConfigurationException | JAXBException e) { - LOG.error("Exception while initialize parser", e); - throw new YarnException(e); - } - } - InputSource inputSource = new InputSource(new StringReader(xmlContent)); SAXSource source = new SAXSource(xmlReader, inputSource); try { return (GpuDeviceInformation) unmarshaller.unmarshal(source); } catch (JAXBException e) { - LOG.error("Exception while parsing xml", e); + LOG.error("Exception while parsing GPU device information XML!", e); throw new YarnException(e); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java index 25c2e3a1f1d..11ff2a4c49c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java @@ -33,7 +33,6 @@ @InterfaceStability.Unstable @XmlRootElement(name = "gpu") public class PerGpuDeviceInformation { - private String productName = "N/A"; private String uuid = "N/A"; private int minorNumber = -1; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java index afc1a9679b7..1c2c0c49b84 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java @@ -29,8 +29,8 @@ @InterfaceStability.Unstable @XmlRootElement(name = "fb_memory_usage") public class PerGpuMemoryUsage { - long usedMemoryMiB = -1L; - long availMemoryMiB = -1L; + private long usedMemoryMiB = -1L; + private long availMemoryMiB = -1L; @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class) @XmlElement(name = "used") @@ -53,6 +53,9 @@ public void setAvailMemoryMiB(Long availMemoryMiB) { } public long getTotalMemoryMiB() { + if (usedMemoryMiB == -1 && availMemoryMiB == -1) { + return -1; + } return usedMemoryMiB + availMemoryMiB; } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java index dc96746cf5d..062eb7be114 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java @@ -20,31 +20,159 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.yarn.exceptions.YarnException; -import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.io.File; import java.io.IOException; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + public class TestGpuDeviceInformationParser { + private static final double FLOAT_DELTA = 1e-6; + + @Rule + public ExpectedException expected = ExpectedException.none(); + @Test public void testParse() throws IOException, YarnException { - File f = new File("src/test/resources/nvidia-smi-sample-xml-output"); + File f = new File("src/test/resources/nvidia-smi-sample-output.xml"); + String s = FileUtils.readFileToString(f, "UTF-8"); + + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + GpuDeviceInformation info = parser.parseXml(s); + + assertEquals("375.66", info.getDriverVersion()); + assertEquals(2, info.getGpus().size()); + assertFirstGpu(info.getGpus().get(0)); + assertSecondGpu(info.getGpus().get(1)); + } + + @Test + public void testParseExcerpt() throws IOException, YarnException { + File f = new File("src/test/resources/nvidia-smi-output-excerpt.xml"); + String s = FileUtils.readFileToString(f, "UTF-8"); + + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + GpuDeviceInformation info = parser.parseXml(s); + + assertEquals("375.66", info.getDriverVersion()); + assertEquals(2, info.getGpus().size()); + assertFirstGpu(info.getGpus().get(0)); + assertSecondGpu(info.getGpus().get(1)); + } + + @Test + public void testParseConsecutivelyWithSameParser() + throws IOException, YarnException { + File f = new File("src/test/resources/nvidia-smi-sample-output.xml"); + String s = FileUtils.readFileToString(f, "UTF-8"); + + for (int i = 0; i < 3; i++) { + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + GpuDeviceInformation info = parser.parseXml(s); + + assertEquals("375.66", info.getDriverVersion()); + assertEquals(2, info.getGpus().size()); + assertFirstGpu(info.getGpus().get(0)); + assertSecondGpu(info.getGpus().get(1)); + } + } + + @Test + public void testParseEmptyString() throws YarnException { + expected.expect(YarnException.class); + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + parser.parseXml(""); + } + + @Test + public void testParseInvalidRootElement() throws YarnException { + expected.expect(YarnException.class); + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + parser.parseXml(" + + + + + + Wed Sep 6 21:52:51 2017 + 375.66 + 2 + + Tesla P100-PCIE-12GB + Tesla + GPU-28604e81-21ec-cc48-6759-bf2648b22e16 + 0 + + 11567 MiB + 11400 MiB + 167 MiB + + + 33.4 % + 0 % + 0 % + 0 % + + + 31 C + 80 C + 88 C + + + + + Tesla P100-PCIE-12GB_2 + Tesla + GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3 + 1 + + 12290 MiB + 11800 MiB + 490 MiB + + Default + + 10.3 % + 0 % + 0 % + 0 % + + + 34 C + 85 C + 82 C + + + + \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-output-missing-tags.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-output-missing-tags.xml new file mode 100644 index 00000000000..644b95f1fcf --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-output-missing-tags.xml @@ -0,0 +1,28 @@ + + + + + + + Wed Sep 6 21:52:51 2017 + 375.66 + 2 + + Tesla + + + \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-output-missing-tags2.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-output-missing-tags2.xml new file mode 100644 index 00000000000..8b6d756be82 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-output-missing-tags2.xml @@ -0,0 +1,61 @@ + + + + + + + Wed Sep 6 21:52:51 2017 + 375.66 + 2 + + Tesla P100-PCIE-12GB + Tesla + GPU-28604e81-21ec-cc48-6759-bf2648b22e16 + 0 + + + + + + + + + + Tesla P100-PCIE-12GB_2 + Tesla + GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3 + 1 + + 12290 MiB + 11800 MiB + 490 MiB + + Default + + 10.3 % + 0 % + 0 % + 0 % + + + 34 C + 85 C + 82 C + + + + \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-output.xml similarity index 98% rename from hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-output.xml index 5ccb72265b5..14662cf9147 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-output.xml @@ -90,9 +90,9 @@ Not Active - 12193 MiB - 0 MiB - 12193 MiB + 11567 MiB + 11400 MiB + 167 MiB 16384 MiB @@ -101,7 +101,7 @@ Default - 0 % + 33.4 % 0 % 0 % 0 % @@ -172,8 +172,8 @@ 31 C - 85 C - 82 C + 80 C + 88 C P0 @@ -284,7 +284,7 @@ - Tesla P100-PCIE-12GB + Tesla P100-PCIE-12GB_2 Tesla Disabled Disabled @@ -351,9 +351,9 @@ Not Active - 12193 MiB - 0 MiB - 12193 MiB + 12290 MiB + 11800 MiB + 490 MiB 16384 MiB