gpus) {
+ this.gpus = gpus;
+ }
+
+ @javax.xml.bind.annotation.XmlElement(name = "driver_version")
+ public String getDriverVersion() {
+ return driverVersion;
+ }
+
+ public void setDriverVersion(String driverVersion) {
+ this.driverVersion = driverVersion;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append(
+ getDriverVersion()).append("\n");
+
+ if (gpus != null) {
+ for (PerGpuDeviceInformation gpu : gpus) {
+ sb.append("\t").append(gpu.toString()).append("\n");
+ }
+ }
+ return sb.toString();
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
new file mode 100644
index 00000000000..1bd92f63a88
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.sax.SAXSource;
+import java.io.StringReader;
+
+/**
+ * Parse XML and get GPU device information
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDeviceInformationParser {
+ private static final Logger LOG = LoggerFactory.getLogger(
+ GpuDeviceInformationParser.class);
+
+ private Unmarshaller unmarshaller = null;
+ private XMLReader xmlReader = null;
+
+ private void init()
+ throws SAXException, ParserConfigurationException, JAXBException {
+ SAXParserFactory spf = SAXParserFactory.newInstance();
+ // Disable external-dtd since by default nvidia-smi output contains
+ // <!DOCTYPE nvidia_smi_log> in header
+ spf.setFeature(
+ "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+ false);
+ spf.setFeature("http://xml.org/sax/features/validation", false);
+
+ JAXBContext jaxbContext = JAXBContext.newInstance(
+ GpuDeviceInformation.class);
+
+ this.xmlReader = spf.newSAXParser().getXMLReader();
+ this.unmarshaller = jaxbContext.createUnmarshaller();
+ }
+
+ public synchronized GpuDeviceInformation parseXml(String xmlContent)
+ throws YarnException {
+ if (unmarshaller == null) {
+ try {
+ init();
+ } catch (SAXException | ParserConfigurationException | JAXBException e) {
+ LOG.error("Exception while initializing parser", e);
+ throw new YarnException(e);
+ }
+ }
+
+ InputSource inputSource = new InputSource(new StringReader(xmlContent));
+ SAXSource source = new SAXSource(xmlReader, inputSource);
+ try {
+ return (GpuDeviceInformation) unmarshaller.unmarshal(source);
+ } catch (JAXBException e) {
+ LOG.error("Exception while parsing xml", e);
+ throw new YarnException(e);
+ }
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
new file mode 100644
index 00000000000..f3153136b6a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlAdapter;
+
+/**
+ * Capture single GPU device information such as memory size, temperature,
+ * utilization.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "gpu")
+public class PerGpuDeviceInformation {
+
+ private String productName = "N/A";
+ private String uuid = "N/A";
+ private int minorNumber = -1;
+
+ private PerGpuUtilizations gpuUtilizations;
+ private PerGpuMemoryUsage gpuMemoryUsage;
+ private PerGpuTemperature temperature;
+
+ /**
+ * Convert formats like "34 C", "75.6 %" to float.
+ */
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
+ static class StrToFloatBeforeSpaceAdapter extends
+ XmlAdapter<String, Float> {
+ @Override
+ public String marshal(Float v) throws Exception {
+ if (v == null) {
+ return "";
+ }
+ return String.valueOf(v);
+ }
+
+ @Override
+ public Float unmarshal(String v) throws Exception {
+ if (v == null) {
+ return -1f;
+ }
+
+ return Float.valueOf(v.split(" ")[0]);
+ }
+ }
+
+ /**
+ * Convert formats like "725 MiB" to long.
+ */
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
+ static class StrToMemAdapter extends XmlAdapter<String, Long> {
+ @Override
+ public String marshal(Long v) throws Exception {
+ if (v == null) {
+ return "";
+ }
+ return String.valueOf(v) + " MiB";
+ }
+
+ @Override
+ public Long unmarshal(String v) throws Exception {
+ if (v == null) {
+ return -1L;
+ }
+ return Long.valueOf(v.split(" ")[0]);
+ }
+ }
+
+ @XmlElement(name = "temperature")
+ public PerGpuTemperature getTemperature() {
+ return temperature;
+ }
+
+ public void setTemperature(PerGpuTemperature temperature) {
+ this.temperature = temperature;
+ }
+
+ @XmlElement(name = "uuid")
+ public String getUuid() {
+ return uuid;
+ }
+
+ public void setUuid(String uuid) {
+ this.uuid = uuid;
+ }
+
+ @XmlElement(name = "product_name")
+ public String getProductName() {
+ return productName;
+ }
+
+ public void setProductName(String productName) {
+ this.productName = productName;
+ }
+
+ @XmlElement(name = "minor_number")
+ public int getMinorNumber() {
+ return minorNumber;
+ }
+
+ public void setMinorNumber(int minorNumber) {
+ this.minorNumber = minorNumber;
+ }
+
+ @XmlElement(name = "utilization")
+ public PerGpuUtilizations getGpuUtilizations() {
+ return gpuUtilizations;
+ }
+
+ public void setGpuUtilizations(PerGpuUtilizations utilizations) {
+ this.gpuUtilizations = utilizations;
+ }
+
+ @XmlElement(name = "bar1_memory_usage")
+ public PerGpuMemoryUsage getGpuMemoryUsage() {
+ return gpuMemoryUsage;
+ }
+
+ public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
+ this.gpuMemoryUsage = gpuMemoryUsage;
+ }
+
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("ProductName=").append(productName).append(", MinorNumber=")
+ .append(minorNumber);
+
+ if (getGpuMemoryUsage() != null) {
+ sb.append(", TotalMemory=").append(
+ getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
+ }
+
+ if (getGpuUtilizations() != null) {
+ sb.append(", Utilization=").append(
+ getGpuUtilizations().getOverallGpuUtilization()).append("%");
+ }
+ return sb.toString();
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
new file mode 100644
index 00000000000..3964c4e415e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "bar1_memory_usage")
+public class PerGpuMemoryUsage {
+ long usedMemoryMiB = -1L;
+ long availMemoryMiB = -1L;
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "used")
+ public Long getUsedMemoryMiB() {
+ return usedMemoryMiB;
+ }
+
+ public void setUsedMemoryMiB(Long usedMemoryMiB) {
+ this.usedMemoryMiB = usedMemoryMiB;
+ }
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "free")
+ public Long getAvailMemoryMiB() {
+ return availMemoryMiB;
+ }
+
+ public void setAvailMemoryMiB(Long availMemoryMiB) {
+ this.availMemoryMiB = availMemoryMiB;
+ }
+
+ public long getTotalMemoryMiB() {
+ return usedMemoryMiB + availMemoryMiB;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
new file mode 100644
index 00000000000..ccd60cbf5e5
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Temperature of GPU
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "temperature")
+public class PerGpuTemperature {
+ private float currentGpuTemp = Float.MIN_VALUE;
+ private float maxGpuTemp = Float.MIN_VALUE;
+ private float slowThresholdGpuTemp = Float.MIN_VALUE;
+
+ /**
+ * Get current celsius GPU temperature
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp")
+ public Float getCurrentGpuTemp() {
+ return currentGpuTemp;
+ }
+
+ public void setCurrentGpuTemp(Float currentGpuTemp) {
+ this.currentGpuTemp = currentGpuTemp;
+ }
+
+ /**
+ * Get max possible celsius GPU temperature
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_max_threshold")
+ public Float getMaxGpuTemp() {
+ return maxGpuTemp;
+ }
+
+ public void setMaxGpuTemp(Float maxGpuTemp) {
+ this.maxGpuTemp = maxGpuTemp;
+ }
+
+ /**
+ * Get Celsius GPU temperature which could make the GPU run slower
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_slow_threshold")
+ public Float getSlowThresholdGpuTemp() {
+ return slowThresholdGpuTemp;
+ }
+
+ public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) {
+ this.slowThresholdGpuTemp = slowThresholdGpuTemp;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
new file mode 100644
index 00000000000..4ef218ba7ea
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * GPU utilizations
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "utilization")
+public class PerGpuUtilizations {
+ private float overallGpuUtilization;
+
+ /**
+ * Overall percent GPU utilization
+ * @return utilization
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_util")
+ public Float getOverallGpuUtilization() {
+ return overallGpuUtilization;
+ }
+
+ public void setOverallGpuUtilization(Float overallGpuUtilization) {
+ this.overallGpuUtilization = overallGpuUtilization;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
new file mode 100644
index 00000000000..13b3ee91bdc
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.net.ServerSocketUtil;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.api.ResourceTracker;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
+import org.junit.Assert;
+import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+public class NodeManagerTestBase {
+ // temp fix until metrics system can auto-detect itself running in unit test:
+ static {
+ DefaultMetricsSystem.setMiniClusterMode(true);
+ }
+
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(TestNodeStatusUpdater.class);
+ protected static final File basedir =
+ new File("target", TestNodeStatusUpdater.class.getName());
+ protected static final File nmLocalDir = new File(basedir, "nm0");
+ protected static final File tmpDir = new File(basedir, "tmpDir");
+ protected static final File remoteLogsDir = new File(basedir, "remotelogs");
+ protected static final File logsDir = new File(basedir, "logs");
+ protected static final RecordFactory recordFactory = RecordFactoryProvider
+ .getRecordFactory(null);
+ protected Configuration conf;
+
+ protected YarnConfiguration createNMConfig() throws IOException {
+ return createNMConfig(ServerSocketUtil.getPort(49170, 10));
+ }
+
+ protected YarnConfiguration createNMConfig(int port) throws IOException {
+ YarnConfiguration conf = new YarnConfiguration();
+ String localhostAddress = null;
+ try {
+ localhostAddress = InetAddress.getByName("localhost")
+ .getCanonicalHostName();
+ } catch (UnknownHostException e) {
+ Assert.fail("Unable to get localhost address: " + e.getMessage());
+ }
+ conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
+ conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
+ conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
+ + ServerSocketUtil.getPort(49160, 10));
+ conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
+ remoteLogsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+ conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
+ return conf;
+ }
+
+ public static class BaseResourceTrackerForTest implements ResourceTracker {
+ @Override
+ public RegisterNodeManagerResponse registerNodeManager(
+ RegisterNodeManagerRequest request) throws YarnException, IOException {
+ return new RegisterNodeManagerResponsePBImpl();
+ }
+
+ @Override
+ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
+ throws YarnException, IOException {
+ return new NodeHeartbeatResponsePBImpl();
+ }
+
+ @Override
+ public UnRegisterNodeManagerResponse unRegisterNodeManager(
+ UnRegisterNodeManagerRequest request)
+ throws YarnException, IOException {
+ return new UnRegisterNodeManagerResponsePBImpl();
+ }
+ }
+
+ protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl {
+ public ResourceTracker resourceTracker;
+ protected Context context;
+
+ public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher,
+ NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
+ ResourceTracker resourceTracker) {
+ super(context, dispatcher, healthChecker, metrics);
+ this.context = context;
+ this.resourceTracker = resourceTracker;
+ }
+ @Override
+ protected ResourceTracker getRMClient() {
+ return resourceTracker;
+ }
+
+ @Override
+ protected void stopRMProxy() {
+ return;
+ }
+ }
+
+ public class MyContainerManager extends ContainerManagerImpl {
+ public boolean signaled = false;
+
+ public MyContainerManager(Context context, ContainerExecutor exec,
+ DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
+ NodeManagerMetrics metrics,
+ LocalDirsHandlerService dirsHandler) {
+ super(context, exec, deletionContext, nodeStatusUpdater,
+ metrics, dirsHandler);
+ }
+
+ @Override
+ public void handle(ContainerManagerEvent event) {
+ if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
+ signaled = true;
+ }
+ }
+ }
+
+ @Before
+ public void setUp() throws IOException {
+ nmLocalDir.mkdirs();
+ tmpDir.mkdirs();
+ logsDir.mkdirs();
+ remoteLogsDir.mkdirs();
+ conf = createNMConfig();
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
index 2e9eff529cd..9b180c7eff6 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
@@ -178,7 +178,7 @@ public void testDirPermissions() throws Exception {
FileContext lfs = FileContext.getLocalFSFileContext(conf);
DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs);
executor.setConf(conf);
- executor.init();
+ executor.init(null);
try {
executor.createUserLocalDirs(localDirs, user);
@@ -317,7 +317,7 @@ public Object answer(InvocationOnMock invocationOnMock)
Path workDir = localDir;
Path pidFile = new Path(workDir, "pid.txt");
- mockExec.init();
+ mockExec.init(null);
mockExec.activateContainer(cId, pidFile);
int ret = mockExec.launchContainer(new ContainerStartContext.Builder()
.setContainer(container)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
index d4db6b0e20e..dcec4c3b0b4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
@@ -628,7 +628,7 @@ public void testPostExecuteAfterReacquisition() throws Exception {
LinuxContainerExecutor lce = new LinuxContainerExecutor();
lce.setConf(conf);
try {
- lce.init();
+ lce.init(null);
} catch (IOException e) {
// expected if LCE isn't setup right, but not necessary for this test
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
index 6ca96db0fea..5d82c49cd30 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
@@ -404,7 +404,7 @@ public Object answer(InvocationOnMock invocationOnMock)
@Test
public void testInit() throws Exception {
- mockExec.init();
+ mockExec.init(mock(Context.class));
assertEquals(Arrays.asList("--checksetup"), readMockParams());
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
index 92797116075..b31215b0f3d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
@@ -37,7 +37,7 @@
public static final class InvalidContainerExecutor extends
DefaultContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
throw new IOException("dummy executor init called");
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
index 11c3c356f32..8435340164a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
@@ -20,16 +20,14 @@
import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
-import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
@@ -81,8 +79,6 @@
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
-import org.apache.hadoop.yarn.factories.RecordFactory;
-import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
@@ -118,41 +114,14 @@
import org.junit.Test;
@SuppressWarnings("rawtypes")
-public class TestNodeStatusUpdater {
-
- // temp fix until metrics system can auto-detect itself running in unit test:
- static {
- DefaultMetricsSystem.setMiniClusterMode(true);
- }
-
- static final Logger LOG =
- LoggerFactory.getLogger(TestNodeStatusUpdater.class);
- static final File basedir =
- new File("target", TestNodeStatusUpdater.class.getName());
- static final File nmLocalDir = new File(basedir, "nm0");
- static final File tmpDir = new File(basedir, "tmpDir");
- static final File remoteLogsDir = new File(basedir, "remotelogs");
- static final File logsDir = new File(basedir, "logs");
- private static final RecordFactory recordFactory = RecordFactoryProvider
- .getRecordFactory(null);
-
+public class TestNodeStatusUpdater extends NodeManagerTestBase {
volatile int heartBeatID = 0;
volatile Throwable nmStartError = null;
private final List<NodeId> registeredNodes = new ArrayList<NodeId>();
private boolean triggered = false;
- private Configuration conf;
private NodeManager nm;
private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
- @Before
- public void setUp() throws IOException {
- nmLocalDir.mkdirs();
- tmpDir.mkdirs();
- logsDir.mkdirs();
- remoteLogsDir.mkdirs();
- conf = createNMConfig();
- }
-
@After
public void tearDown() {
this.registeredNodes.clear();
@@ -334,29 +303,7 @@ public UnRegisterNodeManagerResponse unRegisterNodeManager(
}
}
- private class MyContainerManager extends ContainerManagerImpl {
- public boolean signaled = false;
-
- public MyContainerManager(Context context, ContainerExecutor exec,
- DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
- NodeManagerMetrics metrics,
- LocalDirsHandlerService dirsHandler) {
- super(context, exec, deletionContext, nodeStatusUpdater,
- metrics, dirsHandler);
- }
-
- @Override
- public void handle(ContainerManagerEvent event) {
- if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
- signaled = true;
- }
- }
- }
-
- private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl {
- public ResourceTracker resourceTracker;
- private Context context;
-
+ private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
this(context, dispatcher, healthChecker, metrics, false);
@@ -365,19 +312,8 @@ public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
boolean signalContainer) {
- super(context, dispatcher, healthChecker, metrics);
- this.context = context;
- resourceTracker = new MyResourceTracker(this.context, signalContainer);
- }
-
- @Override
- protected ResourceTracker getRMClient() {
- return resourceTracker;
- }
-
- @Override
- protected void stopRMProxy() {
- return;
+ super(context, dispatcher, healthChecker, metrics,
+ new MyResourceTracker(context, signalContainer));
}
}
@@ -1820,7 +1756,6 @@ public void run() {
Assert.assertTrue("Test failed with exception(s)" + exceptions,
exceptions.isEmpty());
}
-
// Add new containers info into NM context each time node heart beats.
private class MyNMContext extends NMContext {
@@ -1924,31 +1859,6 @@ private void verifyNodeStartFailure(String errMessage) throws Exception {
this.registeredNodes.size());
}
- private YarnConfiguration createNMConfig(int port) throws IOException {
- YarnConfiguration conf = new YarnConfiguration();
- String localhostAddress = null;
- try {
- localhostAddress = InetAddress.getByName("localhost")
- .getCanonicalHostName();
- } catch (UnknownHostException e) {
- Assert.fail("Unable to get localhost address: " + e.getMessage());
- }
- conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
- conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
- conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
- + ServerSocketUtil.getPort(49160, 10));
- conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
- remoteLogsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
- conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
- return conf;
- }
-
- private YarnConfiguration createNMConfig() throws IOException {
- return createNMConfig(ServerSocketUtil.getPort(49170, 10));
- }
-
private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
return new NodeManager() {
@Override
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
index 0838f1e523a..3c574966be5 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
@@ -18,26 +18,6 @@
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
-import java.io.IOException;
-import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
@@ -64,6 +44,7 @@
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
@@ -72,17 +53,36 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
-import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
+import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.Records;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
/**
* Base class for all the AMRMProxyService test cases. It provides utility
@@ -773,5 +773,9 @@ public ContainerExecutor getContainerExecutor() {
getContainerStateTransitionListener() {
return null;
}
+
+ public ResourcePluginManager getResourcePluginManager() {
+ return null;
+ }
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
index e5414a587f1..0563694f004 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -30,6 +31,8 @@
import java.util.List;
+import static org.mockito.Mockito.mock;
+
public class TestResourceHandlerModule {
private static final Logger LOG =
LoggerFactory.getLogger(TestResourceHandlerModule.class);
@@ -62,7 +65,7 @@ public void testOutboundBandwidthHandler() {
//Ensure that outbound bandwidth resource handler is present in the chain
ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule
- .getConfiguredResourceHandlerChain(networkEnabledConf);
+ .getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class));
List resourceHandlers = resourceHandlerChain
.getResourceHandlerList();
//Exactly one resource handler in chain
@@ -88,7 +91,8 @@ public void testDiskResourceHandler() throws Exception {
Assert.assertNotNull(handler);
ResourceHandlerChain resourceHandlerChain =
- ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf);
+ ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf,
+ mock(Context.class));
List resourceHandlers =
resourceHandlerChain.getResourceHandlerList();
// Exactly one resource handler in chain
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
new file mode 100644
index 00000000000..8b2b1e6fb94
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -0,0 +1,337 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.mockito.Matchers.anyList;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+public class TestGpuResourceHandler {
+ private CGroupsHandler mockCGroupsHandler;
+ private PrivilegedOperationExecutor mockPrivilegedExecutor;
+ private GpuResourceHandlerImpl gpuResourceHandler;
+ private NMStateStoreService mockNMStateStore;
+ private ConcurrentHashMap runningContainersMap;
+
+ @Before
+ public void setup() {
+ TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
+
+ mockCGroupsHandler = mock(CGroupsHandler.class);
+ mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
+ mockNMStateStore = mock(NMStateStoreService.class);
+
+ Context nmctx = mock(Context.class);
+ when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
+ runningContainersMap = new ConcurrentHashMap<>();
+ when(nmctx.getContainers()).thenReturn(runningContainersMap);
+
+ gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
+ mockPrivilegedExecutor);
+ }
+
+ @Test
+ public void testBootStrap() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
+
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ verify(mockCGroupsHandler, times(1)).initializeCGroupController(
+ CGroupsHandler.CGroupController.DEVICES);
+ }
+
+ private static ContainerId getContainerId(int id) {
+ return ContainerId.newContainerId(ApplicationAttemptId
+ .newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
+ }
+
+ private static Container mockContainerWithGpuRequest(int id,
+ int numGpuRequest) {
+ Container c = mock(Container.class);
+ when(c.getContainerId()).thenReturn(getContainerId(id));
+
+ Resource res = Resource.newInstance(1024, 1);
+ res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
+ when(c.getResource()).thenReturn(res);
+ return c;
+ }
+
+ private void verifyDeniedDevices(ContainerId containerId,
+ List deniedDevices)
+ throws ResourceHandlerException, PrivilegedOperationException {
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, containerId.toString());
+
+ if (null != deniedDevices && !deniedDevices.isEmpty()) {
+ verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
+ new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
+ .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
+ containerId.toString(),
+ GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
+ StringUtils.join(",", deniedDevices))), true);
+ }
+ }
+
+ @Test
+ public void testAllocation() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asks 3 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+
+ // Only device=4 will be blocked.
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+ /* Start container 2, asks 2 GPUs. Expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Start container 3, asks 1 GPU, succeeded */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
+
+ // devices = 0/1/3 will be blocked
+ verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
+
+ /* Start container 4, asks 0 GPUs, succeeded */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
+
+ // All devices will be blocked
+ verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
+
+ /* Release container-1, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(3,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Release container-3, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(3));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @Test
+ public void testAllocationWithoutAllowedGpus() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(0,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asks 0 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
+ verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+
+ /* Start container 2, asks 1 GPU. Expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Release container 1, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(0,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @Test
+ public void testAllocationStored() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asks 3 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+
+ verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
+ ResourceInformation.GPU_URI,
+ Arrays.asList("0", "1", "3"));
+
+ // Only device=4 will be blocked.
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+ /* Start container 2, asks 0 GPUs, succeeded */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 0));
+
+ verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
+
+ // storeAssignedResources will not be invoked.
+ verify(mockNMStateStore, never()).storeAssignedResources(
+ eq(getContainerId(2)), eq(ResourceInformation.GPU_URI), anyList());
+ }
+
+ @Test
+ public void testRecoverResourceAllocation() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ Container nmContainer = mock(Container.class);
+ ResourceMappings rmap = new ResourceMappings();
+ ResourceMappings.AssignedResources ar =
+ new ResourceMappings.AssignedResources();
+ ar.updateAssignedResources(Arrays.asList("1", "3"));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(1), nmContainer);
+
+ // TEST CASE
+ // Reacquire container restore state of GPU Resource Allocator.
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+
+ Map deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is not in allowed list.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=5 is not in allowed list.
+ ar.updateAssignedResources(Arrays.asList("4", "5"));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ boolean caughtException = false;
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is already assigned.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=3 is already assigned
+ ar.updateAssignedResources(Arrays.asList("4", "3"));
+ rmap.addAssignedResources("gpu", ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ caughtException = false;
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
index e21eea0cdc7..2cca2774207 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
@@ -73,7 +73,7 @@
private static class MockExecutor extends ContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
}
@Override
public void startLocalizer(LocalizerStartContext ctx)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
new file mode 100644
index 00000000000..bcadf76e4bd
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
@@ -0,0 +1,261 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.service.ServiceOperations;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase;
+import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
+import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+public class TestResourcePluginManager extends NodeManagerTestBase {
+ private NodeManager nm;
+
+ ResourcePluginManager stubResourcePluginmanager() {
+ // Stub ResourcePluginManager
+ final ResourcePluginManager rpm = mock(ResourcePluginManager.class);
+ Map plugins = new HashMap<>();
+
+ // First resource plugin
+ ResourcePlugin resourcePlugin = mock(ResourcePlugin.class);
+ NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = mock(
+ NodeResourceUpdaterPlugin.class);
+ when(resourcePlugin.getNodeResourceHandlerInstance()).thenReturn(
+ nodeResourceUpdaterPlugin);
+ plugins.put("resource1", resourcePlugin);
+
+ // Second resource plugin
+ resourcePlugin = mock(ResourcePlugin.class);
+ when(resourcePlugin.createResourceHandler(any(Context.class), any(
+ CGroupsHandler.class), any(PrivilegedOperationExecutor.class)))
+ .thenReturn(new CustomizedResourceHandler());
+ plugins.put("resource2", resourcePlugin);
+ when(rpm.getNameToPlugins()).thenReturn(plugins);
+ return rpm;
+ }
+
+ @After
+ public void tearDown() {
+ if (nm != null) {
+ try {
+ ServiceOperations.stop(nm);
+ } catch (Throwable t) {
+ // ignore
+ }
+ }
+ }
+
+ private class CustomizedResourceHandler implements ResourceHandler {
+
+ @Override
+ public List bootstrap(Configuration configuration)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List preStart(Container container)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List reacquireContainer(ContainerId containerId)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List postComplete(ContainerId containerId)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List teardown()
+ throws ResourceHandlerException {
+ return null;
+ }
+ }
+
+ private class MyMockNM extends NodeManager {
+ private final ResourcePluginManager rpm;
+
+ public MyMockNM(ResourcePluginManager rpm) {
+ this.rpm = rpm;
+ }
+
+ @Override
+ protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+ Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+ ((NodeManager.NMContext)context).setResourcePluginManager(rpm);
+ return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+ metrics, new BaseResourceTrackerForTest());
+ }
+
+ @Override
+ protected ContainerManagerImpl createContainerManager(Context context,
+ ContainerExecutor exec, DeletionService del,
+ NodeStatusUpdater nodeStatusUpdater,
+ ApplicationACLsManager aclsManager,
+ LocalDirsHandlerService diskhandler) {
+ return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+ metrics, diskhandler);
+ }
+
+ @Override
+ protected ResourcePluginManager createResourcePluginManager() {
+ return rpm;
+ }
+ }
+
+ public class MyLCE extends LinuxContainerExecutor {
+ private PrivilegedOperationExecutor poe = mock(PrivilegedOperationExecutor.class);
+
+ @Override
+ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
+ return poe;
+ }
+ }
+
+ /*
+ * Make sure ResourcePluginManager is initialized during NM start up.
+ */
+ @Test(timeout = 30000)
+ public void testResourcePluginManagerInitialization() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+ nm = new MyMockNM(rpm);
+
+ YarnConfiguration conf = createNMConfig();
+ nm.init(conf);
+ verify(rpm, times(1)).initialize(
+ any(Context.class));
+ }
+
+ /*
+ * Make sure ResourcePluginManager is invoked during NM update.
+ */
+ @Test(timeout = 30000)
+ public void testNodeStatusUpdaterWithResourcePluginsEnabled() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+
+ nm = new MyMockNM(rpm);
+
+ YarnConfiguration conf = createNMConfig();
+ nm.init(conf);
+ nm.start();
+
+ NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin =
+ rpm.getNameToPlugins().get("resource1")
+ .getNodeResourceHandlerInstance();
+
+ verify(nodeResourceUpdaterPlugin, times(1)).updateConfiguredResource(
+ any(Resource.class));
+ }
+
+ /*
+ * Make sure ResourcePluginManager is used to initialize ResourceHandlerChain
+ */
+ @Test(timeout = 30000)
+ public void testLinuxContainerExecutorWithResourcePluginsEnabled() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+ final LinuxContainerExecutor lce = new MyLCE();
+
+ nm = new NodeManager() {
+ @Override
+ protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+ Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+ ((NMContext)context).setResourcePluginManager(rpm);
+ return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+ metrics, new BaseResourceTrackerForTest());
+ }
+
+ @Override
+ protected ContainerManagerImpl createContainerManager(Context context,
+ ContainerExecutor exec, DeletionService del,
+ NodeStatusUpdater nodeStatusUpdater,
+ ApplicationACLsManager aclsManager,
+ LocalDirsHandlerService diskhandler) {
+ return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+ metrics, diskhandler);
+ }
+
+ @Override
+ protected ContainerExecutor createContainerExecutor(Configuration conf) {
+ ((NMContext)this.getNMContext()).setResourcePluginManager(rpm);
+ lce.setConf(conf);
+ return lce;
+ }
+ };
+
+ YarnConfiguration conf = createNMConfig();
+
+ nm.init(conf);
+ nm.start();
+
+ ResourceHandler handler = lce.getResourceHandler();
+ Assert.assertNotNull(handler);
+ Assert.assertTrue(handler instanceof ResourceHandlerChain);
+
+ boolean newHandlerAdded = false;
+ for (ResourceHandler h : ((ResourceHandlerChain) handler)
+ .getResourceHandlerList()) {
+ if (h instanceof CustomizedResourceHandler) {
+ newHandlerAdded = true;
+ break;
+ }
+ }
+ Assert.assertTrue("New ResourceHandler should be added", newHandlerAdded);
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
new file mode 100644
index 00000000000..83bace2c65f
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Tests for {@link GpuDiscoverer}: binary path resolution and parsing of the
+ * allowed-devices configuration. The discovery tests that need real GPU
+ * hardware are gated behind system properties and skipped by default.
+ */
+public class TestGpuDiscoverer {
+  // Per-class scratch folder where fake nvidia-smi binaries are created.
+  private String getTestParentFolder() {
+    File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
+    return f.getAbsolutePath();
+  }
+
+  // Create an empty file; try-with-resources guarantees the stream is closed
+  // even if the write path throws.
+  private void touchFile(File f) throws IOException {
+    try (FileOutputStream out = new FileOutputStream(f)) {
+      // Nothing to write - we only need the file to exist.
+    }
+  }
+
+  @Before
+  public void before() throws IOException {
+    // Start from a clean directory so leftovers from earlier runs cannot
+    // affect the binary-discovery assertions; fail fast if setup fails.
+    String folder = getTestParentFolder();
+    File f = new File(folder);
+    FileUtils.deleteDirectory(f);
+    Assert.assertTrue("Failed to create test folder " + folder, f.mkdirs());
+  }
+
+  @Test
+  public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
+    // Only run this on demand
+    // (-DRunLinuxGpuResourceDiscoverPluginConfigTest=true).
+    Assume.assumeTrue(Boolean.valueOf(
+        System.getProperty("RunLinuxGpuResourceDiscoverPluginConfigTest")));
+
+    // Case 1: default setting - the binary is referenced by name only and
+    // PATH is extended with the standard nvidia install locations.
+    Configuration conf = new Configuration(false);
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+        plugin.getPathOfGpuBinary());
+    Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+    Assert.assertTrue(
+        plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+
+    // Case 2: an explicitly configured path pointing at an existing binary
+    // is used verbatim, and no PATH override is set.
+    File fakeBinary = new File(getTestParentFolder(),
+        GpuDiscoverer.DEFAULT_BINARY_NAME);
+    touchFile(fakeBinary);
+    conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+    plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    Assert.assertEquals(fakeBinary.getAbsolutePath(),
+        plugin.getPathOfGpuBinary());
+    Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+
+    // Case 3: path is configured but the binary does not exist, so the
+    // plugin falls back to the default binary name + PATH discovery.
+    Assert.assertTrue("Failed to delete " + fakeBinary, fakeBinary.delete());
+    plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+        plugin.getPathOfGpuBinary());
+    Assert.assertTrue(
+        plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+  }
+
+  @Test
+  public void testGpuDiscover() throws YarnException {
+    // Requires real GPU hardware, so only run on demand
+    // (-DrunGpuDiscoverUnitTest=true).
+    Assume.assumeTrue(
+        Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
+    Configuration conf = new Configuration(false);
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    GpuDeviceInformation info = plugin.getGpuDeviceInformation();
+
+    // Every discovered GPU should be usable by YARN when no restriction is
+    // configured.
+    Assert.assertTrue(info.getGpus().size() > 0);
+    Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
+        info.getGpus().size());
+  }
+
+  @Test
+  public void getNumberOfUsableGpusFromConfig() throws YarnException {
+    Configuration conf = new Configuration(false);
+    // Allow devices 0,1,2,4 (3 deliberately skipped) and verify the plugin
+    // parses them in order.
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+
+    // Typed List<Integer> (the raw List made the == comparisons below fail
+    // to compile: int vs Object).
+    List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
+    Assert.assertEquals(4, minorNumbers.size());
+
+    Assert.assertTrue(0 == minorNumbers.get(0));
+    Assert.assertTrue(1 == minorNumbers.get(1));
+    Assert.assertTrue(2 == minorNumbers.get(2));
+    Assert.assertTrue(4 == minorNumbers.get(3));
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java
new file mode 100644
index 00000000000..e22597d5881
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Verifies that {@link GpuDeviceInformationParser} extracts driver and
+ * per-GPU fields from a captured nvidia-smi XML dump.
+ */
+public class TestGpuDeviceInformationParser {
+  @Test
+  public void testParse() throws IOException, YarnException {
+    // Sample output recorded from a two-GPU machine, checked in as a
+    // test resource.
+    File sampleFile = new File("src/test/resources/nvidia-smi-sample-xml-output");
+    String xml = FileUtils.readFileToString(sampleFile, "UTF-8");
+
+    GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+    GpuDeviceInformation info = parser.parseXml(xml);
+
+    // Top-level fields.
+    Assert.assertEquals("375.66", info.getDriverVersion());
+    Assert.assertEquals(2, info.getGpus().size());
+
+    // Spot-check the second GPU entry.
+    PerGpuDeviceInformation secondGpu = info.getGpus().get(1);
+    Assert.assertEquals("Tesla P100-PCIE-12GB", secondGpu.getProductName());
+    Assert.assertEquals(16384,
+        secondGpu.getGpuMemoryUsage().getTotalMemoryMiB());
+    Assert.assertEquals(10.3f,
+        secondGpu.getGpuUtilizations().getOverallGpuUtilization(), 1e-6);
+    Assert.assertEquals(34f,
+        secondGpu.getTemperature().getCurrentGpuTemp(), 1e-6);
+    Assert.assertEquals(85f,
+        secondGpu.getTemperature().getMaxGpuTemp(), 1e-6);
+    Assert.assertEquals(82f,
+        secondGpu.getTemperature().getSlowThresholdGpuTemp(), 1e-6);
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
new file mode 100644
index 00000000000..5ccb72265b5
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
@@ -0,0 +1,547 @@
+
+
+
+
+
+
+ Wed Sep 6 21:52:51 2017
+ 375.66
+ 2
+
+ Tesla P100-PCIE-12GB
+ Tesla
+ Disabled
+ Disabled
+ Disabled
+ Disabled
+ 1920
+
+ N/A
+ N/A
+
+ 0320717030197
+ GPU-28604e81-21ec-cc48-6759-bf2648b22e16
+ 0
+ 86.00.3A.00.02
+ No
+ 0x400
+ 900-2H400-0110-030
+
+ H400.0202.00.01
+ 1.1
+ 4.1
+ N/A
+
+
+ N/A
+ N/A
+
+
+ None
+
+
+ 04
+ 00
+ 0000
+ 15F710DE
+ 0000:04:00.0
+ 11DA10DE
+
+
+ 3
+ 3
+
+
+ 16x
+ 16x
+
+
+
+ N/A
+ N/A
+
+ 0
+ 0 KB/s
+ 0 KB/s
+
+ N/A
+ P0
+
+ Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+
+
+ 12193 MiB
+ 0 MiB
+ 12193 MiB
+
+
+ 16384 MiB
+ 2 MiB
+ 16382 MiB
+
+ Default
+
+ 0 %
+ 0 %
+ 0 %
+ 0 %
+
+
+ 0
+ 0
+ 0 ms
+
+
+ Enabled
+ Enabled
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+
+ 0
+
+
+
+
+ 0
+
+
+
+ No
+
+
+ 31 C
+ 85 C
+ 82 C
+
+
+ P0
+ Supported
+ 24.84 W
+ 250.00 W
+ 250.00 W
+ 250.00 W
+ 125.00 W
+ 250.00 W
+
+
+ 405 MHz
+ 405 MHz
+ 715 MHz
+ 835 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1328 MHz
+ 1328 MHz
+ 715 MHz
+ 1328 MHz
+
+
+ N/A
+ N/A
+
+
+
+ 715 MHz
+ 1328 MHz
+ 1316 MHz
+ 1303 MHz
+ 1290 MHz
+ 1278 MHz
+ 1265 MHz
+ 1252 MHz
+ 1240 MHz
+ 1227 MHz
+ 1215 MHz
+ 1202 MHz
+ 1189 MHz
+ 1177 MHz
+ 1164 MHz
+ 1151 MHz
+ 1139 MHz
+ 1126 MHz
+ 1113 MHz
+ 1101 MHz
+ 1088 MHz
+ 1075 MHz
+ 1063 MHz
+ 1050 MHz
+ 1037 MHz
+ 1025 MHz
+ 1012 MHz
+ 999 MHz
+ 987 MHz
+ 974 MHz
+ 961 MHz
+ 949 MHz
+ 936 MHz
+ 923 MHz
+ 911 MHz
+ 898 MHz
+ 885 MHz
+ 873 MHz
+ 860 MHz
+ 847 MHz
+ 835 MHz
+ 822 MHz
+ 810 MHz
+ 797 MHz
+ 784 MHz
+ 772 MHz
+ 759 MHz
+ 746 MHz
+ 734 MHz
+ 721 MHz
+ 708 MHz
+ 696 MHz
+ 683 MHz
+ 670 MHz
+ 658 MHz
+ 645 MHz
+ 632 MHz
+ 620 MHz
+ 607 MHz
+ 594 MHz
+ 582 MHz
+ 569 MHz
+ 556 MHz
+ 544 MHz
+
+
+
+
+
+
+
+
+
+ Tesla P100-PCIE-12GB
+ Tesla
+ Disabled
+ Disabled
+ Disabled
+ Disabled
+ 1920
+
+ N/A
+ N/A
+
+ 0320717031755
+ GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3
+ 1
+ 86.00.3A.00.02
+ No
+ 0x8200
+ 900-2H400-0110-030
+
+ H400.0202.00.01
+ 1.1
+ 4.1
+ N/A
+
+
+ N/A
+ N/A
+
+
+ None
+
+
+ 82
+ 00
+ 0000
+ 15F710DE
+ 0000:82:00.0
+ 11DA10DE
+
+
+ 3
+ 3
+
+
+ 16x
+ 16x
+
+
+
+ N/A
+ N/A
+
+ 0
+ 0 KB/s
+ 0 KB/s
+
+ N/A
+ P0
+
+ Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+
+
+ 12193 MiB
+ 0 MiB
+ 12193 MiB
+
+
+ 16384 MiB
+ 2 MiB
+ 16382 MiB
+
+ Default
+
+ 10.3 %
+ 0 %
+ 0 %
+ 0 %
+
+
+ 0
+ 0
+ 0 ms
+
+
+ Enabled
+ Enabled
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+
+ 0
+
+
+
+
+ 0
+
+
+
+ No
+
+
+ 34 C
+ 85 C
+ 82 C
+
+
+ P0
+ Supported
+ 25.54 W
+ 250.00 W
+ 250.00 W
+ 250.00 W
+ 125.00 W
+ 250.00 W
+
+
+ 405 MHz
+ 405 MHz
+ 715 MHz
+ 835 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1328 MHz
+ 1328 MHz
+ 715 MHz
+ 1328 MHz
+
+
+ N/A
+ N/A
+
+
+
+ 715 MHz
+ 1328 MHz
+ 1316 MHz
+ 1303 MHz
+ 1290 MHz
+ 1278 MHz
+ 1265 MHz
+ 1252 MHz
+ 1240 MHz
+ 1227 MHz
+ 1215 MHz
+ 1202 MHz
+ 1189 MHz
+ 1177 MHz
+ 1164 MHz
+ 1151 MHz
+ 1139 MHz
+ 1126 MHz
+ 1113 MHz
+ 1101 MHz
+ 1088 MHz
+ 1075 MHz
+ 1063 MHz
+ 1050 MHz
+ 1037 MHz
+ 1025 MHz
+ 1012 MHz
+ 999 MHz
+ 987 MHz
+ 974 MHz
+ 961 MHz
+ 949 MHz
+ 936 MHz
+ 923 MHz
+ 911 MHz
+ 898 MHz
+ 885 MHz
+ 873 MHz
+ 860 MHz
+ 847 MHz
+ 835 MHz
+ 822 MHz
+ 810 MHz
+ 797 MHz
+ 784 MHz
+ 772 MHz
+ 759 MHz
+ 746 MHz
+ 734 MHz
+ 721 MHz
+ 708 MHz
+ 696 MHz
+ 683 MHz
+ 670 MHz
+ 658 MHz
+ 645 MHz
+ 632 MHz
+ 620 MHz
+ 607 MHz
+ 594 MHz
+ 582 MHz
+ 569 MHz
+ 556 MHz
+ 544 MHz
+
+
+
+
+
+
+
+
+
\ No newline at end of file