gpus) {
+ this.gpus = gpus;
+ }
+
+ @javax.xml.bind.annotation.XmlElement(name = "driver_version")
+ public String getDriverVersion() {
+ return driverVersion;
+ }
+
+ public void setDriverVersion(String driverVersion) {
+ this.driverVersion = driverVersion;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append(
+ getDriverVersion()).append("\n");
+
+ if (gpus != null) {
+ for (PerGpuDeviceInformation gpu : gpus) {
+ sb.append("\t").append(gpu.toString()).append("\n");
+ }
+ }
+ return sb.toString();
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
new file mode 100644
index 00000000000..1bd92f63a88
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.sax.SAXSource;
+import java.io.StringReader;
+
+/**
+ * Parse XML and get GPU device information
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDeviceInformationParser {
+ private static final Logger LOG = LoggerFactory.getLogger(
+ GpuDeviceInformationParser.class);
+
+ private Unmarshaller unmarshaller = null;
+ private XMLReader xmlReader = null;
+
+ private void init()
+ throws SAXException, ParserConfigurationException, JAXBException {
+ SAXParserFactory spf = SAXParserFactory.newInstance();
+ // Disable external-dtd since by default nvidia-smi output contains
+ // <!DOCTYPE nvidia_smi_log SYSTEM "..."> in header
+ spf.setFeature(
+ "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+ false);
+ spf.setFeature("http://xml.org/sax/features/validation", false);
+
+ JAXBContext jaxbContext = JAXBContext.newInstance(
+ GpuDeviceInformation.class);
+
+ this.xmlReader = spf.newSAXParser().getXMLReader();
+ this.unmarshaller = jaxbContext.createUnmarshaller();
+ }
+
+ public synchronized GpuDeviceInformation parseXml(String xmlContent)
+ throws YarnException {
+ if (unmarshaller == null) {
+ try {
+ init();
+ } catch (SAXException | ParserConfigurationException | JAXBException e) {
+ LOG.error("Exception while initializing parser", e);
+ throw new YarnException(e);
+ }
+ }
+
+ InputSource inputSource = new InputSource(new StringReader(xmlContent));
+ SAXSource source = new SAXSource(xmlReader, inputSource);
+ try {
+ return (GpuDeviceInformation) unmarshaller.unmarshal(source);
+ } catch (JAXBException e) {
+ LOG.error("Exception while parsing xml", e);
+ throw new YarnException(e);
+ }
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java
new file mode 100644
index 00000000000..e5855374314
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlRootElement;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
+
+import java.util.List;
+
+/**
+ * Gpu device information return to client when
+ * {@link org.apache.hadoop.yarn.server.nodemanager.webapp.NMWebServices#getNMResourceInfo(String)}
+ * is invoked.
+ */
+@XmlRootElement
+@XmlAccessorType(XmlAccessType.FIELD)
+public class NMGpuResourceInfo extends NMResourceInfo {
+ GpuDeviceInformation gpuDeviceInformation;
+
+ List<GpuDevice> totalGpuDevices;
+ List<AssignedGpuDevice> assignedGpuDevices;
+
+ public NMGpuResourceInfo() {
+
+ }
+
+ public NMGpuResourceInfo(GpuDeviceInformation gpuDeviceInformation,
+ List<GpuDevice> totalGpuDevices,
+ List<AssignedGpuDevice> assignedGpuDevices) {
+ this.gpuDeviceInformation = gpuDeviceInformation;
+ this.totalGpuDevices = totalGpuDevices;
+ this.assignedGpuDevices = assignedGpuDevices;
+ }
+
+ public GpuDeviceInformation getGpuDeviceInformation() {
+ return gpuDeviceInformation;
+ }
+
+ public void setGpuDeviceInformation(
+ GpuDeviceInformation gpuDeviceInformation) {
+ this.gpuDeviceInformation = gpuDeviceInformation;
+ }
+
+ public List<GpuDevice> getTotalGpuDevices() {
+ return totalGpuDevices;
+ }
+
+ public void setTotalGpuDevices(List<GpuDevice> totalGpuDevices) {
+ this.totalGpuDevices = totalGpuDevices;
+ }
+
+ public List<AssignedGpuDevice> getAssignedGpuDevices() {
+ return assignedGpuDevices;
+ }
+
+ public void setAssignedGpuDevices(
+ List<AssignedGpuDevice> assignedGpuDevices) {
+ this.assignedGpuDevices = assignedGpuDevices;
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
new file mode 100644
index 00000000000..25c2e3a1f1d
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlAdapter;
+
+/**
+ * Capture single GPU device information such as memory size, temperature,
+ * utilization.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "gpu")
+public class PerGpuDeviceInformation {
+
+ private String productName = "N/A";
+ private String uuid = "N/A";
+ private int minorNumber = -1;
+
+ private PerGpuUtilizations gpuUtilizations;
+ private PerGpuMemoryUsage gpuMemoryUsage;
+ private PerGpuTemperature temperature;
+
+ /**
+ * Convert formats like "34 C", "75.6 %" to float.
+ */
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
+ static class StrToFloatBeforeSpaceAdapter extends
+ XmlAdapter<String, Float> {
+ @Override
+ public String marshal(Float v) throws Exception {
+ if (v == null) {
+ return "";
+ }
+ return String.valueOf(v);
+ }
+
+ @Override
+ public Float unmarshal(String v) throws Exception {
+ if (v == null) {
+ return -1f;
+ }
+
+ return Float.valueOf(v.split(" ")[0]);
+ }
+ }
+
+ /**
+ * Convert formats like "725 MiB" to long.
+ */
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
+ static class StrToMemAdapter extends XmlAdapter<String, Long> {
+ @Override
+ public String marshal(Long v) throws Exception {
+ if (v == null) {
+ return "";
+ }
+ return String.valueOf(v) + " MiB";
+ }
+
+ @Override
+ public Long unmarshal(String v) throws Exception {
+ if (v == null) {
+ return -1L;
+ }
+ return Long.valueOf(v.split(" ")[0]);
+ }
+ }
+
+ @XmlElement(name = "temperature")
+ public PerGpuTemperature getTemperature() {
+ return temperature;
+ }
+
+ public void setTemperature(PerGpuTemperature temperature) {
+ this.temperature = temperature;
+ }
+
+ @XmlElement(name = "uuid")
+ public String getUuid() {
+ return uuid;
+ }
+
+ public void setUuid(String uuid) {
+ this.uuid = uuid;
+ }
+
+ @XmlElement(name = "product_name")
+ public String getProductName() {
+ return productName;
+ }
+
+ public void setProductName(String productName) {
+ this.productName = productName;
+ }
+
+ @XmlElement(name = "minor_number")
+ public int getMinorNumber() {
+ return minorNumber;
+ }
+
+ public void setMinorNumber(int minorNumber) {
+ this.minorNumber = minorNumber;
+ }
+
+ @XmlElement(name = "utilization")
+ public PerGpuUtilizations getGpuUtilizations() {
+ return gpuUtilizations;
+ }
+
+ public void setGpuUtilizations(PerGpuUtilizations utilizations) {
+ this.gpuUtilizations = utilizations;
+ }
+
+ @XmlElement(name = "fb_memory_usage")
+ public PerGpuMemoryUsage getGpuMemoryUsage() {
+ return gpuMemoryUsage;
+ }
+
+ public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
+ this.gpuMemoryUsage = gpuMemoryUsage;
+ }
+
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("ProductName=").append(productName).append(", MinorNumber=")
+ .append(minorNumber);
+
+ if (getGpuMemoryUsage() != null) {
+ sb.append(", TotalMemory=").append(
+ getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
+ }
+
+ if (getGpuUtilizations() != null) {
+ sb.append(", Utilization=").append(
+ getGpuUtilizations().getOverallGpuUtilization()).append("%");
+ }
+ return sb.toString();
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
new file mode 100644
index 00000000000..afc1a9679b7
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "fb_memory_usage")
+public class PerGpuMemoryUsage {
+ long usedMemoryMiB = -1L;
+ long availMemoryMiB = -1L;
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "used")
+ public Long getUsedMemoryMiB() {
+ return usedMemoryMiB;
+ }
+
+ public void setUsedMemoryMiB(Long usedMemoryMiB) {
+ this.usedMemoryMiB = usedMemoryMiB;
+ }
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "free")
+ public Long getAvailMemoryMiB() {
+ return availMemoryMiB;
+ }
+
+ public void setAvailMemoryMiB(Long availMemoryMiB) {
+ this.availMemoryMiB = availMemoryMiB;
+ }
+
+ public long getTotalMemoryMiB() {
+ return usedMemoryMiB + availMemoryMiB;
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
new file mode 100644
index 00000000000..ccd60cbf5e5
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Temperature of GPU
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "temperature")
+public class PerGpuTemperature {
+ private float currentGpuTemp = Float.MIN_VALUE;
+ private float maxGpuTemp = Float.MIN_VALUE;
+ private float slowThresholdGpuTemp = Float.MIN_VALUE;
+
+ /**
+ * Get current celsius GPU temperature
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp")
+ public Float getCurrentGpuTemp() {
+ return currentGpuTemp;
+ }
+
+ public void setCurrentGpuTemp(Float currentGpuTemp) {
+ this.currentGpuTemp = currentGpuTemp;
+ }
+
+ /**
+ * Get max possible celsius GPU temperature
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_max_threshold")
+ public Float getMaxGpuTemp() {
+ return maxGpuTemp;
+ }
+
+ public void setMaxGpuTemp(Float maxGpuTemp) {
+ this.maxGpuTemp = maxGpuTemp;
+ }
+
+ /**
+ * Get celsius GPU temperature which could make GPU runs slower
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_slow_threshold")
+ public Float getSlowThresholdGpuTemp() {
+ return slowThresholdGpuTemp;
+ }
+
+ public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) {
+ this.slowThresholdGpuTemp = slowThresholdGpuTemp;
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
new file mode 100644
index 00000000000..4ef218ba7ea
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * GPU utilizations
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "utilization")
+public class PerGpuUtilizations {
+ private float overallGpuUtilization;
+
+ /**
+ * Overall percent GPU utilization
+ * @return utilization
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_util")
+ public Float getOverallGpuUtilization() {
+ return overallGpuUtilization;
+ }
+
+ public void setOverallGpuUtilization(Float overallGpuUtilization) {
+ this.overallGpuUtilization = overallGpuUtilization;
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h
index 956b38c7276..a78b077d9b2 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h
@@ -285,3 +285,5 @@ int execute_regex_match(const char *regex_str, const char *input);
* Return 0 on success.
*/
int validate_docker_image_name(const char *image_name);
+
+struct configuration* get_cfg(void);
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c
index 930dabe5029..9cf34a0c4f4 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c
@@ -22,6 +22,8 @@
#include "util.h"
#include "get_executable.h"
#include "utils/string-utils.h"
+#include "modules/gpu/gpu-module.h"
+#include "modules/cgroups/cgroups-operations.h"
#include
#include
@@ -241,6 +243,14 @@ static int validate_arguments(int argc, char **argv , int *operation) {
return INVALID_ARGUMENT_NUMBER;
}
+ /*
+ * Check if it is a known module, if yes, redirect to module
+ */
+ if (strcmp("--module-gpu", argv[1]) == 0) {
+ return handle_gpu_request(&update_cgroups_parameters, "gpu", argc - 1,
+ &argv[1]);
+ }
+
if (strcmp("--checksetup", argv[1]) == 0) {
*operation = CHECK_SETUP;
return 0;
@@ -325,6 +335,7 @@ static int validate_arguments(int argc, char **argv , int *operation) {
return FEATURE_DISABLED;
}
}
+
/* Now we have to validate 'run as user' operations that don't use
a 'long option' - we should fix this at some point. The validation/argument
parsing here is extensive enough that it done in a separate function */
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.c hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.c
new file mode 100644
index 00000000000..b23410928bf
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.c
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "configuration.h"
+#include "container-executor.h"
+#include "utils/string-utils.h"
+#include "utils/path-utils.h"
+#include "modules/common/module-configs.h"
+#include "modules/common/constants.h"
+#include "modules/cgroups/cgroups-operations.h"
+#include "util.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#define MAX_PATH_LEN 4096
+
+static const struct section* cgroup_cfg_section = NULL;
+
+void reload_cgroups_configuration() {
+ cgroup_cfg_section = get_configuration_section(CGROUPS_SECTION_NAME, get_cfg());
+}
+
+char* get_cgroups_path_to_write(
+ const char* hierarchy_name,
+ const char* param_name,
+ const char* group_id) {
+ int failed = 0;
+ char* buffer = NULL;
+ const char* cgroups_root = get_section_value(CGROUPS_ROOT_KEY,
+ cgroup_cfg_section);
+ const char* yarn_hierarchy_name = get_section_value(
+ CGROUPS_YARN_HIERARCHY_KEY, cgroup_cfg_section);
+
+ // Make sure it is defined.
+ if (!cgroups_root || cgroups_root[0] == 0) {
+ fprintf(ERRORFILE, "%s is not defined in container-executor.cfg\n",
+ CGROUPS_ROOT_KEY);
+ failed = 1;
+ goto cleanup;
+ }
+
+ // Make sure it is defined.
+ if (!yarn_hierarchy_name || yarn_hierarchy_name[0] == 0) {
+ fprintf(ERRORFILE, "%s is not defined in container-executor.cfg\n",
+ CGROUPS_YARN_HIERARCHY_KEY);
+ failed = 1;
+ goto cleanup;
+ }
+
+ buffer = malloc(MAX_PATH_LEN + 1);
+ if (!buffer) {
+ fprintf(ERRORFILE, "Failed to allocate memory for output path.\n");
+ failed = 1;
+ goto cleanup;
+ }
+
+ // Make a path; fail on snprintf error or truncation (path too long).
+ int path_len = snprintf(buffer, MAX_PATH_LEN, "%s/%s/%s/%s/%s.%s",
+ cgroups_root, hierarchy_name, yarn_hierarchy_name,
+ group_id, hierarchy_name, param_name);
+ if (path_len < 0 || path_len >= MAX_PATH_LEN) {
+ fprintf(ERRORFILE, "Failed to print output path.\n");
+ failed = 1;
+ goto cleanup;
+ }
+
+cleanup:
+ if (failed) {
+ if (buffer) {
+ free(buffer);
+ }
+ return NULL;
+ }
+ return buffer;
+}
+
+int update_cgroups_parameters(
+ const char* hierarchy_name,
+ const char* param_name,
+ const char* group_id,
+ const char* value) {
+#ifndef __linux
+ fprintf(ERRORFILE, "Failed to update cgroups parameters, not supported\n");
+ return -1;
+#endif
+ int failure = 0;
+
+ if (!cgroup_cfg_section) {
+ reload_cgroups_configuration();
+ }
+
+ char* full_path = get_cgroups_path_to_write(hierarchy_name, param_name,
+ group_id);
+
+ if (!full_path) {
+ fprintf(ERRORFILE,
+ "Failed to get cgroups path to write, it should be a configuration issue\n");
+ failure = 1;
+ goto cleanup;
+ }
+
+ if (!verify_path_safety(full_path)) {
+ failure = 1;
+ goto cleanup;
+ }
+
+ // Make sure file exists
+ struct stat sb;
+ if (stat(full_path, &sb) != 0) {
+ fprintf(ERRORFILE, "CGroups: Could not find file to write, %s\n", full_path);
+ failure = 1;
+ goto cleanup;
+ }
+
+ fprintf(ERRORFILE, "CGroups: Updating cgroups, path=%s, value=%s\n",
+ full_path, value);
+
+ // Write values to file
+ FILE *f;
+ f = fopen(full_path, "a");
+ if (!f) {
+ fprintf(ERRORFILE, "CGroups: Failed to open cgroups file, %s\n", full_path);
+ failure = 1;
+ goto cleanup;
+ }
+ if (fprintf(f, "%s", value) < 0) {
+ fprintf(ERRORFILE, "CGroups: Failed to write cgroups file, %s\n", full_path);
+ fclose(f);
+ failure = 1;
+ goto cleanup;
+ }
+ if (fclose(f) != 0) {
+ fprintf(ERRORFILE, "CGroups: Failed to close cgroups file, %s\n", full_path);
+ failure = 1;
+ goto cleanup;
+ }
+
+cleanup:
+ if (full_path) {
+ free(full_path);
+ }
+ return -failure;
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.h hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.h
new file mode 100644
index 00000000000..cf80bcf6059
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/cgroups/cgroups-operations.h
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _CGROUPS_OPERATIONS_H_
+#define _CGROUPS_OPERATIONS_H_
+
+#define CGROUPS_SECTION_NAME "cgroups"
+#define CGROUPS_ROOT_KEY "root"
+#define CGROUPS_YARN_HIERARCHY_KEY "yarn-hierarchy"
+
+/**
+ * Handle update CGroups parameter update requests:
+ * - hierarchy_name: e.g. devices / cpu,cpuacct
+ * - param_name: e.g. deny
+ * - group_id: e.g. container_x_y
+ * - value: e.g. "a *:* rwm"
+ *
+ * return 0 if succeeded
+ */
+int update_cgroups_parameters(
+ const char* hierarchy_name,
+ const char* param_name,
+ const char* group_id,
+ const char* value);
+
+ /**
+ * Get CGroups path to update. Visible for testing.
+ * Return 0 if succeeded
+ */
+ char* get_cgroups_path_to_write(
+ const char* hierarchy_name,
+ const char* param_name,
+ const char* group_id);
+
+ /**
+ * Reload config from filesystem, visible for testing.
+ */
+ void reload_cgroups_configuration();
+
+#endif
\ No newline at end of file
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.c hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.c
new file mode 100644
index 00000000000..1a1b164f2ba
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.c
@@ -0,0 +1,229 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "configuration.h"
+#include "container-executor.h"
+#include "utils/string-utils.h"
+#include "modules/gpu/gpu-module.h"
+#include "modules/cgroups/cgroups-operations.h"
+#include "modules/common/module-configs.h"
+#include "modules/common/constants.h"
+#include "util.h"
+
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define EXCLUDED_GPUS_OPTION "excluded_gpus"
+#define CONTAINER_ID_OPTION "container_id"
+#define DEFAULT_NVIDIA_MAJOR_NUMBER 195
+#define MAX_CONTAINER_ID_LEN 128
+
+static const struct section* cfg_section;
+
+static int internal_handle_gpu_request(
+    update_cgroups_parameters_func update_cgroups_parameters_func_p,
+    size_t n_minor_devices_to_block, int minor_devices[],
+    const char* container_id) {
+  char* allowed_minor_numbers_str = NULL;
+  int* allowed_minor_numbers = NULL;
+  size_t n_allowed_minor_numbers = 0;
+  int return_code = 0;
+
+  if (n_minor_devices_to_block == 0) {
+    // no device to block, just return;
+    return 0;
+  }
+
+  // Get major device number from cfg; when unset, fall back to the
+  // default (Nvidia) major number below.
+  int major_device_number;
+  char* major_number_str = get_section_value(GPU_MAJOR_NUMBER_CONFIG_KEY,
+     cfg_section);
+  if (!major_number_str || 0 == major_number_str[0]) {
+    // Default major number of Nvidia devices
+    major_device_number = DEFAULT_NVIDIA_MAJOR_NUMBER;
+  } else {
+    major_device_number = strtol(major_number_str, NULL, 0);
+  }
+
+  // Get allowed minor device numbers from cfg; when unset, all minor
+  // devices can be used by YARN
+  allowed_minor_numbers_str = get_section_value(
+      GPU_ALLOWED_DEVICES_MINOR_NUMBERS,
+      cfg_section);
+  if (!allowed_minor_numbers_str || 0 == allowed_minor_numbers_str[0]) {
+    allowed_minor_numbers = NULL;
+  } else {
+    int rc = get_numbers_split_by_comma(allowed_minor_numbers_str,
+                                        &allowed_minor_numbers,
+                                        &n_allowed_minor_numbers);
+    if (0 != rc) {
+      fprintf(ERRORFILE,
+          "Failed to get allowed minor device numbers from cfg, value=%s\n",
+          allowed_minor_numbers_str);
+      return_code = -1;
+      goto cleanup;
+    }
+
+    // Make sure we're only trying to block devices allowed in config
+    for (size_t i = 0; i < n_minor_devices_to_block; i++) {
+      int found = 0;
+      for (size_t j = 0; j < n_allowed_minor_numbers; j++) {
+        if (minor_devices[i] == allowed_minor_numbers[j]) {
+          found = 1;
+          break;
+        }
+      }
+
+      if (!found) {
+        fprintf(ERRORFILE,
+            "Trying to blacklist device with minor-number=%d which is not on allowed list\n",
+            minor_devices[i]);
+        return_code = -1;
+        goto cleanup;
+      }
+    }
+  }
+
+  // Use cgroup helpers to blacklist devices
+  for (size_t i = 0; i < n_minor_devices_to_block; i++) {
+    char param_value[128];
+    memset(param_value, 0, sizeof(param_value));
+    snprintf(param_value, sizeof(param_value), "c %d:%d rwm",
+             major_device_number, minor_devices[i]);
+
+    int rc = update_cgroups_parameters_func_p("devices", "deny",
+      container_id, param_value);
+
+    if (0 != rc) {
+      fprintf(ERRORFILE, "CGroups: Failed to update cgroups\n");
+      return_code = -1;
+      goto cleanup;
+    }
+  }
+
+cleanup:
+  if (major_number_str) {
+    free(major_number_str);
+  }
+  if (allowed_minor_numbers) {
+    free(allowed_minor_numbers);
+  }
+  if (allowed_minor_numbers_str) {
+    free(allowed_minor_numbers_str);
+  }
+
+  return return_code;
+}
+
+void reload_gpu_configuration() {
+ cfg_section = get_configuration_section(GPU_MODULE_SECTION_NAME, get_cfg());
+}
+
+/*
+ * Format of GPU request commandline:
+ *
+ * c-e gpu --excluded_gpus 0,1,3 --container_id container_x_y
+ */
+int handle_gpu_request(update_cgroups_parameters_func func,
+    const char* module_name, int module_argc, char** module_argv) {
+  if (!cfg_section) {
+    reload_gpu_configuration();
+  }
+
+  if (!module_enabled(cfg_section, GPU_MODULE_SECTION_NAME)) {
+    fprintf(ERRORFILE,
+        "Please make sure gpu module is enabled before using it.\n");
+    return -1;
+  }
+
+  static struct option long_options[] = {
+    {EXCLUDED_GPUS_OPTION, required_argument, 0, 'e' },
+    {CONTAINER_ID_OPTION, required_argument, 0, 'c' },
+    {0, 0, 0, 0}
+  };
+
+  int rc = 0;
+  int c = 0;
+  int option_index = 0;
+
+  int* minor_devices = NULL;
+  char container_id[MAX_CONTAINER_ID_LEN];
+  memset(container_id, 0, sizeof(container_id));
+  size_t n_minor_devices_to_block = 0;
+  int failed = 0;
+
+  optind = 1;
+  while((c = getopt_long(module_argc, module_argv, "e:c:",
+                         long_options, &option_index)) != -1) {
+    switch(c) {
+    case 'e':
+      rc = get_numbers_split_by_comma(optarg, &minor_devices,
+                                      &n_minor_devices_to_block);
+      if (0 != rc) {
+        fprintf(ERRORFILE,
+            "Failed to get minor devices number from command line, value=%s\n",
+            optarg);
+        failed = 1;
+        goto cleanup;
+      }
+      break;
+    case 'c':
+      if (!validate_container_id(optarg)) {
+        fprintf(ERRORFILE,
+            "Specified container_id=%s is invalid\n", optarg);
+        failed = 1;
+        goto cleanup;
+      }
+      strncpy(container_id, optarg, MAX_CONTAINER_ID_LEN - 1); // keep NUL terminator (buffer was zeroed above)
+      break;
+    default:
+      fprintf(ERRORFILE,
+          "Unknown option in gpu command character %d %c, optionindex = %d\n",
+          c, c, optind);
+      failed = 1;
+      goto cleanup;
+    }
+  }
+
+  if (0 == container_id[0]) {
+    fprintf(ERRORFILE,
+        "[%s] --container_id must be specified.\n", __func__);
+    failed = 1;
+    goto cleanup;
+  }
+
+  if (!minor_devices) {
+    // Minor devices is null, skip following call.
+    fprintf(ERRORFILE, "--excluded_gpus is not specified, skip cgroups call.\n");
+    goto cleanup;
+  }
+
+  failed = internal_handle_gpu_request(func, n_minor_devices_to_block,
+                                       minor_devices,
+                                       container_id);
+
+cleanup:
+  if (minor_devices) {
+    free(minor_devices);
+  }
+  return failed;
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.h hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.h
new file mode 100644
index 00000000000..59d4c7e9cb1
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/modules/gpu/gpu-module.h
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __FreeBSD__
+#define _WITH_GETLINE
+#endif
+
+#ifndef _MODULES_GPU_GPU_MODULE_H_
+#define _MODULES_GPU_GPU_MODULE_H_
+
+#define GPU_MAJOR_NUMBER_CONFIG_KEY "gpu.major-device-number"
+#define GPU_ALLOWED_DEVICES_MINOR_NUMBERS "gpu.allowed-device-minor-numbers"
+#define GPU_MODULE_SECTION_NAME "gpu"
+
+// For unit test stubbing
+typedef int (*update_cgroups_parameters_func)(const char*, const char*,
+    const char*, const char*);
+
+/**
+ * Handle gpu requests
+ */
+int handle_gpu_request(update_cgroups_parameters_func func,
+    const char* module_name, int module_argc, char** module_argv);
+
+/**
+ * Reload config from filesystem, visible for testing.
+ */
+void reload_gpu_configuration();
+
+#endif
\ No newline at end of file
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/cgroups/test-cgroups-module.cc hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/cgroups/test-cgroups-module.cc
new file mode 100644
index 00000000000..8ffbe884a64
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/cgroups/test-cgroups-module.cc
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+extern "C" {
+#include "configuration.h"
+#include "container-executor.h"
+#include "modules/cgroups/cgroups-operations.h"
+#include "test/test-container-executor-common.h"
+#include "util.h"
+}
+
+namespace ContainerExecutor {
+
+class TestCGroupsModule : public ::testing::Test {
+protected:
+ virtual void SetUp() {
+ if (mkdirs(TEST_ROOT, 0755) != 0) {
+ fprintf(ERRORFILE, "Failed to mkdir TEST_ROOT: %s\n", TEST_ROOT);
+ exit(1);
+ }
+ LOGFILE = stdout;
+ ERRORFILE = stderr;
+ }
+
+ virtual void TearDown() {}
+};
+
+TEST_F(TestCGroupsModule, test_cgroups_get_path_without_define_root) {
+ // Write config file.
+ const char *filename = TEST_ROOT "/test_cgroups_get_path_without_root.cfg";
+ FILE *file = fopen(filename, "w");
+ if (file == NULL) {
+ printf("FAIL: Could not open configuration file: %s\n", filename);
+ exit(1);
+ }
+ fprintf(file, "[cgroups]\n");
+ fprintf(file, "yarn-hierarchy=yarn\n");
+ fclose(file);
+
+ // Read config file
+ read_executor_config(filename);
+ reload_cgroups_configuration();
+
+ char* path = get_cgroups_path_to_write("devices", "deny", "container_1");
+
+ ASSERT_TRUE(NULL == path) << "Should fail.\n";
+}
+
+TEST_F(TestCGroupsModule, test_cgroups_get_path_without_define_yarn_hierarchy) {
+ // Write config file.
+ const char *filename = TEST_ROOT "/test_cgroups_get_path_without_root.cfg";
+ FILE *file = fopen(filename, "w");
+
+ ASSERT_TRUE(file) << "FAIL: Could not open configuration file: " << filename
+ << "\n";
+ fprintf(file, "[cgroups]\n");
+ fprintf(file, "root=/sys/fs/cgroups\n");
+ fclose(file);
+
+ // Read config file
+ read_executor_config(filename);
+ reload_cgroups_configuration();
+ char* path = get_cgroups_path_to_write("devices", "deny", "container_1");
+
+ ASSERT_TRUE(NULL == path) << "Should fail.\n";
+}
+
+TEST_F(TestCGroupsModule, test_cgroups_get_path_succeeded) {
+ // Write config file.
+ const char *filename = TEST_ROOT "/test_cgroups_get_path.cfg";
+ FILE *file = fopen(filename, "w");
+
+ ASSERT_TRUE(file) << "FAIL: Could not open configuration file\n";
+ fprintf(file, "[cgroups]\n");
+ fprintf(file, "root=/sys/fs/cgroups \n");
+ fprintf(file, "yarn-hierarchy=yarn \n");
+ fclose(file);
+
+ // Read config file
+ read_executor_config(filename);
+ reload_cgroups_configuration();
+
+ char* path = get_cgroups_path_to_write("devices", "deny", "container_1");
+ ASSERT_TRUE(NULL != path) << "Should success.\n";
+
+ const char *EXPECTED =
+ "/sys/fs/cgroups/devices/yarn/container_1/devices.deny";
+
+ ASSERT_STREQ(EXPECTED, path)
+ << "Return cgroup-path-to-write is not expected\n";
+}
+} // namespace ContainerExecutor
\ No newline at end of file
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/gpu/test-gpu-module.cc hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/gpu/test-gpu-module.cc
new file mode 100644
index 00000000000..b3d93dcecf3
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/modules/gpu/test-gpu-module.cc
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+extern "C" {
+#include "configuration.h"
+#include "container-executor.h"
+#include "modules/cgroups/cgroups-operations.h"
+#include "modules/gpu/gpu-module.h"
+#include "test/test-container-executor-common.h"
+#include "util.h"
+}
+
+namespace ContainerExecutor {
+
+class TestGpuModule : public ::testing::Test {
+protected:
+ virtual void SetUp() {
+ if (mkdirs(TEST_ROOT, 0755) != 0) {
+ fprintf(ERRORFILE, "Failed to mkdir TEST_ROOT: %s\n", TEST_ROOT);
+ exit(1);
+ }
+ LOGFILE = stdout;
+ ERRORFILE = stderr;
+ }
+
+ virtual void TearDown() {
+
+ }
+};
+
+static std::vector cgroups_parameters_invoked;
+
+static int mock_update_cgroups_parameters(
+ const char* controller_name,
+ const char* param_name,
+ const char* group_id,
+ const char* value) {
+ char* buf = (char*) malloc(128);
+ strcpy(buf, controller_name);
+ cgroups_parameters_invoked.push_back(buf);
+
+ buf = (char*) malloc(128);
+ strcpy(buf, param_name);
+ cgroups_parameters_invoked.push_back(buf);
+
+ buf = (char*) malloc(128);
+ strcpy(buf, group_id);
+ cgroups_parameters_invoked.push_back(buf);
+
+ buf = (char*) malloc(128);
+ strcpy(buf, value);
+ cgroups_parameters_invoked.push_back(buf);
+ return 0;
+}
+
+static void verify_param_updated_to_cgroups(
+ int argc, const char** argv) {
+ ASSERT_EQ(argc, cgroups_parameters_invoked.size());
+
+ int offset = 0;
+ while (offset < argc) {
+ ASSERT_STREQ(argv[offset], cgroups_parameters_invoked[offset]);
+ offset++;
+ }
+}
+
+static void write_and_load_gpu_module_to_cfg(const char* cfg_filepath, int enabled) {
+ FILE *file = fopen(cfg_filepath, "w");
+ if (file == NULL) {
+ printf("FAIL: Could not open configuration file: %s\n", cfg_filepath);
+ exit(1);
+ }
+ fprintf(file, "[gpu]\n");
+ if (enabled) {
+ fprintf(file, "module.enabled=true\n");
+ } else {
+ fprintf(file, "module.enabled=false\n");
+ }
+ fclose(file);
+
+ // Read config file
+ read_executor_config(cfg_filepath);
+ reload_gpu_configuration();
+}
+
+static void test_gpu_module_enabled_disabled(int enabled) {
+ // Write config file.
+ const char *filename = TEST_ROOT "/test_cgroups_module_enabled_disabled.cfg";
+ write_and_load_gpu_module_to_cfg(filename, enabled);
+
+ char* argv[] = { (char*) "--module-gpu", (char*) "--excluded_gpus", (char*) "0,1",
+ (char*) "--container_id",
+ (char*) "container_1498064906505_0001_01_000001" };
+
+ int rc = handle_gpu_request(&mock_update_cgroups_parameters,
+ "gpu", 5, argv);
+
+ int EXPECTED_RC;
+ if (enabled) {
+ EXPECTED_RC = 0;
+ } else {
+ EXPECTED_RC = -1;
+ }
+ ASSERT_EQ(EXPECTED_RC, rc);
+}
+
+TEST_F(TestGpuModule, test_verify_gpu_module_calls_cgroup_parameter) {
+ // Write config file.
+ const char *filename = TEST_ROOT "/test_verify_gpu_module_calls_cgroup_parameter.cfg";
+ write_and_load_gpu_module_to_cfg(filename, 1);
+
+ char* container_id = (char*) "container_1498064906505_0001_01_000001";
+ char* argv[] = { (char*) "--module-gpu", (char*) "--excluded_gpus", (char*) "0,1",
+ (char*) "--container_id",
+ container_id };
+
+ /* Test case 1: block 2 devices */
+ cgroups_parameters_invoked.clear();
+ int rc = handle_gpu_request(&mock_update_cgroups_parameters,
+ "gpu", 5, argv);
+ ASSERT_EQ(0, rc) << "Should success.\n";
+
+ // Verify cgroups parameters
+ const char* expected_cgroups_argv[] = { "devices", "deny", container_id, "c 195:0 rwm",
+ "devices", "deny", container_id, "c 195:1 rwm"};
+ verify_param_updated_to_cgroups(8, expected_cgroups_argv);
+
+ /* Test case 2: block 0 devices */
+ cgroups_parameters_invoked.clear();
+ char* argv_1[] = { (char*) "--module-gpu", (char*) "--container_id", container_id };
+ rc = handle_gpu_request(&mock_update_cgroups_parameters,
+ "gpu", 3, argv_1);
+ ASSERT_EQ(0, rc) << "Should success.\n";
+
+ // Verify cgroups parameters
+ verify_param_updated_to_cgroups(0, NULL);
+
+ /* Test case 3: block 2 non-sequential devices */
+ cgroups_parameters_invoked.clear();
+ char* argv_2[] = { (char*) "--module-gpu", (char*) "--excluded_gpus", (char*) "1,3",
+ (char*) "--container_id", container_id };
+ rc = handle_gpu_request(&mock_update_cgroups_parameters,
+ "gpu", 5, argv_2);
+ ASSERT_EQ(0, rc) << "Should success.\n";
+
+ // Verify cgroups parameters
+ const char* expected_cgroups_argv_2[] = { "devices", "deny", container_id, "c 195:1 rwm",
+ "devices", "deny", container_id, "c 195:3 rwm"};
+ verify_param_updated_to_cgroups(8, expected_cgroups_argv_2);
+}
+
+TEST_F(TestGpuModule, test_illegal_cli_parameters) {
+ // Write config file.
+ const char *filename = TEST_ROOT "/test_illegal_cli_parameters.cfg";
+ write_and_load_gpu_module_to_cfg(filename, 1);
+
+ // Illegal container id - 1
+ char* argv[] = { (char*) "--module-gpu", (char*) "--excluded_gpus", (char*) "0,1",
+ (char*) "--container_id", (char*) "xxxx" };
+ int rc = handle_gpu_request(&mock_update_cgroups_parameters,
+ "gpu", 5, argv);
+ ASSERT_NE(0, rc) << "Should fail.\n";
+
+ // Illegal container id - 2
+ char* argv_1[] = { (char*) "--module-gpu", (char*) "--excluded_gpus", (char*) "0,1",
+ (char*) "--container_id", (char*) "container_1" };
+ rc = handle_gpu_request(&mock_update_cgroups_parameters,
+ "gpu", 5, argv_1);
+ ASSERT_NE(0, rc) << "Should fail.\n";
+
+ // Illegal container id - 3
+ char* argv_2[] = { (char*) "--module-gpu", (char*) "--excluded_gpus", (char*) "0,1" };
+ rc = handle_gpu_request(&mock_update_cgroups_parameters,
+ "gpu", 3, argv_2);
+ ASSERT_NE(0, rc) << "Should fail.\n";
+}
+
+TEST_F(TestGpuModule, test_gpu_module_disabled) {
+ test_gpu_module_enabled_disabled(0);
+}
+
+TEST_F(TestGpuModule, test_gpu_module_enabled) {
+ test_gpu_module_enabled_disabled(1);
+}
+} // namespace ContainerExecutor
\ No newline at end of file
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c
index 9e85b3fbf52..235ea77a270 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c
@@ -1392,7 +1392,6 @@ int main(int argc, char **argv) {
#endif
test_trim_function();
- run("rm -fr " TEST_ROOT);
printf("\nFinished tests\n");
free(current_username);
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
new file mode 100644
index 00000000000..13b3ee91bdc
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.net.ServerSocketUtil;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.api.ResourceTracker;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
+import org.junit.Assert;
+import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+public class NodeManagerTestBase {
+ // temp fix until metrics system can auto-detect itself running in unit test:
+ static {
+ DefaultMetricsSystem.setMiniClusterMode(true);
+ }
+
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(TestNodeStatusUpdater.class);
+ protected static final File basedir =
+ new File("target", TestNodeStatusUpdater.class.getName());
+ protected static final File nmLocalDir = new File(basedir, "nm0");
+ protected static final File tmpDir = new File(basedir, "tmpDir");
+ protected static final File remoteLogsDir = new File(basedir, "remotelogs");
+ protected static final File logsDir = new File(basedir, "logs");
+ protected static final RecordFactory recordFactory = RecordFactoryProvider
+ .getRecordFactory(null);
+ protected Configuration conf;
+
+ protected YarnConfiguration createNMConfig() throws IOException {
+ return createNMConfig(ServerSocketUtil.getPort(49170, 10));
+ }
+
+ protected YarnConfiguration createNMConfig(int port) throws IOException {
+ YarnConfiguration conf = new YarnConfiguration();
+ String localhostAddress = null;
+ try {
+ localhostAddress = InetAddress.getByName("localhost")
+ .getCanonicalHostName();
+ } catch (UnknownHostException e) {
+ Assert.fail("Unable to get localhost address: " + e.getMessage());
+ }
+ conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
+ conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
+ conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
+ + ServerSocketUtil.getPort(49160, 10));
+ conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
+ remoteLogsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+ conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
+ return conf;
+ }
+
+ public static class BaseResourceTrackerForTest implements ResourceTracker {
+ @Override
+ public RegisterNodeManagerResponse registerNodeManager(
+ RegisterNodeManagerRequest request) throws YarnException, IOException {
+ return new RegisterNodeManagerResponsePBImpl();
+ }
+
+ @Override
+ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
+ throws YarnException, IOException {
+ return new NodeHeartbeatResponsePBImpl();
+ }
+
+ @Override
+ public UnRegisterNodeManagerResponse unRegisterNodeManager(
+ UnRegisterNodeManagerRequest request)
+ throws YarnException, IOException {
+ return new UnRegisterNodeManagerResponsePBImpl();
+ }
+ }
+
+ protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl {
+ public ResourceTracker resourceTracker;
+ protected Context context;
+
+ public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher,
+ NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
+ ResourceTracker resourceTracker) {
+ super(context, dispatcher, healthChecker, metrics);
+ this.context = context;
+ this.resourceTracker = resourceTracker;
+ }
+ @Override
+ protected ResourceTracker getRMClient() {
+ return resourceTracker;
+ }
+
+ @Override
+ protected void stopRMProxy() {
+ return;
+ }
+ }
+
+ public class MyContainerManager extends ContainerManagerImpl {
+ public boolean signaled = false;
+
+ public MyContainerManager(Context context, ContainerExecutor exec,
+ DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
+ NodeManagerMetrics metrics,
+ LocalDirsHandlerService dirsHandler) {
+ super(context, exec, deletionContext, nodeStatusUpdater,
+ metrics, dirsHandler);
+ }
+
+ @Override
+ public void handle(ContainerManagerEvent event) {
+ if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
+ signaled = true;
+ }
+ }
+ }
+
+ @Before
+ public void setUp() throws IOException {
+ nmLocalDir.mkdirs();
+ tmpDir.mkdirs();
+ logsDir.mkdirs();
+ remoteLogsDir.mkdirs();
+ conf = createNMConfig();
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
index 2e9eff529cd..9b180c7eff6 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
@@ -178,7 +178,7 @@ public void testDirPermissions() throws Exception {
FileContext lfs = FileContext.getLocalFSFileContext(conf);
DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs);
executor.setConf(conf);
- executor.init();
+ executor.init(null);
try {
executor.createUserLocalDirs(localDirs, user);
@@ -317,7 +317,7 @@ public Object answer(InvocationOnMock invocationOnMock)
Path workDir = localDir;
Path pidFile = new Path(workDir, "pid.txt");
- mockExec.init();
+ mockExec.init(null);
mockExec.activateContainer(cId, pidFile);
int ret = mockExec.launchContainer(new ContainerStartContext.Builder()
.setContainer(container)
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java
index f1194c9c095..7e1752b737b 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java
@@ -116,7 +116,7 @@ public void tearDown() {
public void testContainerInitSecure() throws IOException {
dockerContainerExecutor.getConf().set(
CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
- dockerContainerExecutor.init();
+ dockerContainerExecutor.init(mock(Context.class));
}
@Test(expected = IllegalArgumentException.class)
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
index cf8d977c2bf..95c8f5e685c 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
@@ -628,7 +628,7 @@ public void testPostExecuteAfterReacquisition() throws Exception {
LinuxContainerExecutor lce = new LinuxContainerExecutor();
lce.setConf(conf);
try {
- lce.init();
+ lce.init(null);
} catch (IOException e) {
// expected if LCE isn't setup right, but not necessary for this test
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
index 79b88cf5eed..249e017dc43 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
@@ -426,7 +426,7 @@ public Object answer(InvocationOnMock invocationOnMock)
@Test
public void testInit() throws Exception {
- mockExec.init();
+ mockExec.init(mock(Context.class));
assertEquals(Arrays.asList("--checksetup"), readMockParams());
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
index 92797116075..b31215b0f3d 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
@@ -37,7 +37,7 @@
public static final class InvalidContainerExecutor extends
DefaultContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
throw new IOException("dummy executor init called");
}
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
index 055dab44897..533cf2a6c7b 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
@@ -20,16 +20,14 @@
import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
-import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
@@ -80,8 +78,6 @@
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
-import org.apache.hadoop.yarn.factories.RecordFactory;
-import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
@@ -117,41 +113,14 @@
import org.junit.Test;
@SuppressWarnings("rawtypes")
-public class TestNodeStatusUpdater {
-
- // temp fix until metrics system can auto-detect itself running in unit test:
- static {
- DefaultMetricsSystem.setMiniClusterMode(true);
- }
-
- static final Logger LOG =
- LoggerFactory.getLogger(TestNodeStatusUpdater.class);
- static final File basedir =
- new File("target", TestNodeStatusUpdater.class.getName());
- static final File nmLocalDir = new File(basedir, "nm0");
- static final File tmpDir = new File(basedir, "tmpDir");
- static final File remoteLogsDir = new File(basedir, "remotelogs");
- static final File logsDir = new File(basedir, "logs");
- private static final RecordFactory recordFactory = RecordFactoryProvider
- .getRecordFactory(null);
-
+public class TestNodeStatusUpdater extends NodeManagerTestBase {
volatile int heartBeatID = 0;
volatile Throwable nmStartError = null;
private final List registeredNodes = new ArrayList();
private boolean triggered = false;
- private Configuration conf;
private NodeManager nm;
private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
- @Before
- public void setUp() throws IOException {
- nmLocalDir.mkdirs();
- tmpDir.mkdirs();
- logsDir.mkdirs();
- remoteLogsDir.mkdirs();
- conf = createNMConfig();
- }
-
@After
public void tearDown() {
this.registeredNodes.clear();
@@ -332,29 +301,7 @@ public UnRegisterNodeManagerResponse unRegisterNodeManager(
}
}
- private class MyContainerManager extends ContainerManagerImpl {
- public boolean signaled = false;
-
- public MyContainerManager(Context context, ContainerExecutor exec,
- DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
- NodeManagerMetrics metrics,
- LocalDirsHandlerService dirsHandler) {
- super(context, exec, deletionContext, nodeStatusUpdater,
- metrics, dirsHandler);
- }
-
- @Override
- public void handle(ContainerManagerEvent event) {
- if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
- signaled = true;
- }
- }
- }
-
- private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl {
- public ResourceTracker resourceTracker;
- private Context context;
-
+ private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
this(context, dispatcher, healthChecker, metrics, false);
@@ -363,19 +310,8 @@ public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
boolean signalContainer) {
- super(context, dispatcher, healthChecker, metrics);
- this.context = context;
- resourceTracker = new MyResourceTracker(this.context, signalContainer);
- }
-
- @Override
- protected ResourceTracker getRMClient() {
- return resourceTracker;
- }
-
- @Override
- protected void stopRMProxy() {
- return;
+ super(context, dispatcher, healthChecker, metrics,
+ new MyResourceTracker(context, signalContainer));
}
}
@@ -1818,7 +1754,6 @@ public void run() {
Assert.assertTrue("Test failed with exception(s)" + exceptions,
exceptions.isEmpty());
}
-
// Add new containers info into NM context each time node heart beats.
private class MyNMContext extends NMContext {
@@ -1922,31 +1857,6 @@ private void verifyNodeStartFailure(String errMessage) throws Exception {
this.registeredNodes.size());
}
- private YarnConfiguration createNMConfig(int port) throws IOException {
- YarnConfiguration conf = new YarnConfiguration();
- String localhostAddress = null;
- try {
- localhostAddress = InetAddress.getByName("localhost")
- .getCanonicalHostName();
- } catch (UnknownHostException e) {
- Assert.fail("Unable to get localhost address: " + e.getMessage());
- }
- conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
- conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
- conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
- + ServerSocketUtil.getPort(49160, 10));
- conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
- remoteLogsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
- conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
- return conf;
- }
-
- private YarnConfiguration createNMConfig() throws IOException {
- return createNMConfig(ServerSocketUtil.getPort(49170, 10));
- }
-
private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
return new NodeManager() {
@Override
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
index 3c432d30338..4b4f3566f31 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
@@ -18,26 +18,6 @@
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
-import java.io.IOException;
-import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
@@ -66,6 +46,7 @@
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
@@ -74,18 +55,37 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
-import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
+import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.Records;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
/**
* Base class for all the AMRMProxyService test cases. It provides utility
@@ -805,5 +805,9 @@ public void setNMTimelinePublisher(NMTimelinePublisher nmMetricsPublisher) {
public NMTimelinePublisher getNMTimelinePublisher() {
return null;
}
+
+ public ResourcePluginManager getResourcePluginManager() {
+ return null;
+ }
}
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index 8980a49d51b..52fa9f3eda1 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -31,6 +31,7 @@
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
+import java.io.Serializable;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
@@ -91,6 +92,7 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
@@ -110,6 +112,7 @@
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.timeline.TimelineUtils;
+import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -457,7 +460,7 @@ public void testContainerResizeRecovery() throws Exception {
NMStateStoreService stateStore = new NMMemoryStateStoreService();
stateStore.init(conf);
stateStore.start();
- Context context = createContext(conf, stateStore);
+ context = createContext(conf, stateStore);
ContainerManagerImpl cm = createContainerManager(context, delSrvc);
((NMContext) context).setContainerManager(cm);
cm.init(conf);
@@ -467,55 +470,12 @@ public void testContainerResizeRecovery() throws Exception {
ApplicationAttemptId attemptId =
ApplicationAttemptId.newInstance(appId, 1);
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
- Map containerEnv = new HashMap<>();
- setFlowContext(containerEnv, "app_name1", appId);
- Map serviceData = Collections.emptyMap();
- Credentials containerCreds = new Credentials();
- DataOutputBuffer dob = new DataOutputBuffer();
- containerCreds.writeTokenStorageToStream(dob);
- ByteBuffer containerTokens = ByteBuffer.wrap(dob.getData(), 0,
- dob.getLength());
- Map acls = Collections.emptyMap();
- File tmpDir = new File("target",
- this.getClass().getSimpleName() + "-tmpDir");
- File scriptFile = Shell.appendScriptExtension(tmpDir, "scriptFile");
- PrintWriter fileWriter = new PrintWriter(scriptFile);
- if (Shell.WINDOWS) {
- fileWriter.println("@ping -n 100 127.0.0.1 >nul");
- } else {
- fileWriter.write("\numask 0");
- fileWriter.write("\nexec sleep 100");
- }
- fileWriter.close();
- FileContext localFS = FileContext.getLocalFSFileContext();
- URL resource_alpha =
- URL.fromPath(localFS
- .makeQualified(new Path(scriptFile.getAbsolutePath())));
- LocalResource rsrc_alpha = RecordFactoryProvider
- .getRecordFactory(null).newRecordInstance(LocalResource.class);
- rsrc_alpha.setResource(resource_alpha);
- rsrc_alpha.setSize(-1);
- rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
- rsrc_alpha.setType(LocalResourceType.FILE);
- rsrc_alpha.setTimestamp(scriptFile.lastModified());
- String destinationFile = "dest_file";
- Map localResources = new HashMap<>();
- localResources.put(destinationFile, rsrc_alpha);
- List commands =
- Arrays.asList(Shell.getRunScriptCommand(scriptFile));
- ContainerLaunchContext clc = ContainerLaunchContext.newInstance(
- localResources, containerEnv, commands, serviceData,
- containerTokens, acls);
- StartContainersResponse startResponse = startContainer(
- context, cm, cid, clc, null);
- assertTrue(startResponse.getFailedRequests().isEmpty());
- assertEquals(1, context.getApplications().size());
+
+ commonLaunchContainer(appId, cid, cm);
+
Application app = context.getApplications().get(appId);
assertNotNull(app);
- // make sure the container reaches RUNNING state
- waitForNMContainerState(cm, cid,
- org.apache.hadoop.yarn.server.nodemanager
- .containermanager.container.ContainerState.RUNNING);
+
Resource targetResource = Resource.newInstance(2048, 2);
ContainerUpdateResponse updateResponse =
updateContainers(context, cm, cid, targetResource);
@@ -538,6 +498,63 @@ public void testContainerResizeRecovery() throws Exception {
assertEquals(targetResource, containerStatus.getCapability());
}
+ @Test
+ public void testResourceMappingRecoveryForContainer() throws Exception {
+ conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
+ conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
+ NMStateStoreService stateStore = new NMMemoryStateStoreService();
+ stateStore.init(conf);
+ stateStore.start();
+ context = createContext(conf, stateStore);
+ ContainerManagerImpl cm = createContainerManager(context, delSrvc);
+ ((NMContext) context).setContainerManager(cm);
+ cm.init(conf);
+ cm.start();
+
+ // add an application by starting a container
+ ApplicationId appId = ApplicationId.newInstance(0, 1);
+ ApplicationAttemptId attemptId =
+ ApplicationAttemptId.newInstance(appId, 1);
+ ContainerId cid = ContainerId.newContainerId(attemptId, 1);
+
+ commonLaunchContainer(appId, cid, cm);
+
+ Container nmContainer = context.getContainers().get(cid);
+
+ Application app = context.getApplications().get(appId);
+ assertNotNull(app);
+
+ // store resource mapping of the container
+ List gpuResources =
+ Arrays.asList("1", "2", "3");
+ stateStore.storeAssignedResources(nmContainer, "gpu", gpuResources);
+ List numaResources = Arrays.asList("numa1");
+ stateStore.storeAssignedResources(nmContainer, "numa", numaResources);
+ List fpgaResources =
+ Arrays.asList("fpga1", "fpga2");
+ stateStore.storeAssignedResources(nmContainer, "fpga", fpgaResources);
+
+ cm.stop();
+ context = createContext(conf, stateStore);
+ cm = createContainerManager(context);
+ ((NMContext) context).setContainerManager(cm);
+ cm.init(conf);
+ cm.start();
+ assertEquals(1, context.getApplications().size());
+ app = context.getApplications().get(appId);
+ assertNotNull(app);
+
+ Assert.assertNotNull(nmContainer);
+ ResourceMappings resourceMappings = nmContainer.getResourceMappings();
+ List assignedResource = resourceMappings
+ .getAssignedResources("gpu");
+ Assert.assertTrue(assignedResource.equals(gpuResources));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("numa").equals(numaResources));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("fpga").equals(fpgaResources));
+ }
+
@Test
public void testContainerCleanupOnShutdown() throws Exception {
ApplicationId appId = ApplicationId.newInstance(0, 1);
@@ -610,6 +627,57 @@ public void testContainerCleanupOnShutdown() throws Exception {
verify(cm, never()).handle(isA(CMgrCompletedAppsEvent.class));
}
+ private void commonLaunchContainer(ApplicationId appId, ContainerId cid,
+ ContainerManagerImpl cm) throws Exception {
+ Map containerEnv = new HashMap<>();
+ setFlowContext(containerEnv, "app_name1", appId);
+ Map serviceData = Collections.emptyMap();
+ Credentials containerCreds = new Credentials();
+ DataOutputBuffer dob = new DataOutputBuffer();
+ containerCreds.writeTokenStorageToStream(dob);
+ ByteBuffer containerTokens = ByteBuffer.wrap(dob.getData(), 0,
+ dob.getLength());
+ Map acls = Collections.emptyMap();
+ File tmpDir = new File("target",
+ this.getClass().getSimpleName() + "-tmpDir");
+ File scriptFile = Shell.appendScriptExtension(tmpDir, "scriptFile");
+ PrintWriter fileWriter = new PrintWriter(scriptFile);
+ if (Shell.WINDOWS) {
+ fileWriter.println("@ping -n 100 127.0.0.1 >nul");
+ } else {
+ fileWriter.write("\numask 0");
+ fileWriter.write("\nexec sleep 100");
+ }
+ fileWriter.close();
+ FileContext localFS = FileContext.getLocalFSFileContext();
+ URL resource_alpha =
+ URL.fromPath(localFS
+ .makeQualified(new Path(scriptFile.getAbsolutePath())));
+ LocalResource rsrc_alpha = RecordFactoryProvider
+ .getRecordFactory(null).newRecordInstance(LocalResource.class);
+ rsrc_alpha.setResource(resource_alpha);
+ rsrc_alpha.setSize(-1);
+ rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
+ rsrc_alpha.setType(LocalResourceType.FILE);
+ rsrc_alpha.setTimestamp(scriptFile.lastModified());
+ String destinationFile = "dest_file";
+ Map localResources = new HashMap<>();
+ localResources.put(destinationFile, rsrc_alpha);
+ List commands =
+ Arrays.asList(Shell.getRunScriptCommand(scriptFile));
+ ContainerLaunchContext clc = ContainerLaunchContext.newInstance(
+ localResources, containerEnv, commands, serviceData,
+ containerTokens, acls);
+ StartContainersResponse startResponse = startContainer(
+ context, cm, cid, clc, null);
+ assertTrue(startResponse.getFailedRequests().isEmpty());
+ assertEquals(1, context.getApplications().size());
+ // make sure the container reaches RUNNING state
+ waitForNMContainerState(cm, cid,
+ org.apache.hadoop.yarn.server.nodemanager
+ .containermanager.container.ContainerState.RUNNING);
+ }
+
private ContainerManagerImpl createContainerManager(Context context,
DeletionService delSrvc) {
return new ContainerManagerImpl(context, exec, delSrvc,
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
index e5414a587f1..0563694f004 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -30,6 +31,8 @@
import java.util.List;
+import static org.mockito.Mockito.mock;
+
public class TestResourceHandlerModule {
private static final Logger LOG =
LoggerFactory.getLogger(TestResourceHandlerModule.class);
@@ -62,7 +65,7 @@ public void testOutboundBandwidthHandler() {
//Ensure that outbound bandwidth resource handler is present in the chain
ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule
- .getConfiguredResourceHandlerChain(networkEnabledConf);
+ .getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class));
List resourceHandlers = resourceHandlerChain
.getResourceHandlerList();
//Exactly one resource handler in chain
@@ -88,7 +91,8 @@ public void testDiskResourceHandler() throws Exception {
Assert.assertNotNull(handler);
ResourceHandlerChain resourceHandlerChain =
- ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf);
+ ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf,
+ mock(Context.class));
List resourceHandlers =
resourceHandlerChain.getResourceHandlerList();
// Exactly one resource handler in chain
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
new file mode 100644
index 00000000000..7a3bd028994
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -0,0 +1,480 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
+import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
+import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
+import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyList;
+import static org.mockito.Matchers.anyListOf;
+import static org.mockito.Matchers.anyString;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+public class TestGpuResourceHandler {
+ private CGroupsHandler mockCGroupsHandler;
+ private PrivilegedOperationExecutor mockPrivilegedExecutor;
+ private GpuResourceHandlerImpl gpuResourceHandler;
+ private NMStateStoreService mockNMStateStore;
+ private ConcurrentHashMap runningContainersMap;
+
+ @Before
+ public void setup() {
+ TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
+
+ mockCGroupsHandler = mock(CGroupsHandler.class);
+ mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
+ mockNMStateStore = mock(NMStateStoreService.class);
+
+ Context nmctx = mock(Context.class);
+ when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
+ runningContainersMap = new ConcurrentHashMap<>();
+ when(nmctx.getContainers()).thenReturn(runningContainersMap);
+
+ gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
+ mockPrivilegedExecutor);
+ }
+
+ @Test
+ public void testBootStrap() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
+
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ verify(mockCGroupsHandler, times(1)).initializeCGroupController(
+ CGroupsHandler.CGroupController.DEVICES);
+ }
+
+ private static ContainerId getContainerId(int id) {
+ return ContainerId.newContainerId(ApplicationAttemptId
+ .newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
+ }
+
+ private static Container mockContainerWithGpuRequest(int id, int numGpuRequest,
+ boolean dockerContainerEnabled) {
+ Container c = mock(Container.class);
+ when(c.getContainerId()).thenReturn(getContainerId(id));
+
+ Resource res = Resource.newInstance(1024, 1);
+ ResourceMappings resMapping = new ResourceMappings();
+
+ res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
+ when(c.getResource()).thenReturn(res);
+ when(c.getResourceMappings()).thenReturn(resMapping);
+
+ ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
+ Map env = new HashMap<>();
+ if (dockerContainerEnabled) {
+ env.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE, "docker");
+ }
+ when(clc.getEnvironment()).thenReturn(env);
+ when(c.getLaunchContext()).thenReturn(clc);
+ return c;
+ }
+
+ private static Container mockContainerWithGpuRequest(int id,
+ int numGpuRequest) {
+ return mockContainerWithGpuRequest(id, numGpuRequest, false);
+ }
+
+ private void verifyDeniedDevices(ContainerId containerId,
+ List deniedDevices)
+ throws ResourceHandlerException, PrivilegedOperationException {
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, containerId.toString());
+
+ if (null != deniedDevices && !deniedDevices.isEmpty()) {
+ List deniedDevicesMinorNumber = new ArrayList<>();
+ for (GpuDevice deniedDevice : deniedDevices) {
+ deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber());
+ }
+ verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
+ new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
+ .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
+ containerId.toString(),
+ GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
+ StringUtils.join(",", deniedDevicesMinorNumber))), true);
+ }
+ }
+
+ private void commonTestAllocation(boolean dockerContainerEnabled)
+ throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asks for 3 GPUs */
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(1, 3, dockerContainerEnabled));
+
+ // Only device=4 will be blocked.
+ if (dockerContainerEnabled) {
+ verifyDeniedDevices(getContainerId(1),
+ Collections.emptyList());
+ } else{
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3,4)));
+ }
+
+ /* Start container 2, asks for 2 GPUs. Expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(2, 2, dockerContainerEnabled));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Start container 3, asks for 1 GPU, succeeded */
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(3, 1, dockerContainerEnabled));
+
+ // devices = 0/1/3 will be blocked
+ if (dockerContainerEnabled) {
+ verifyDeniedDevices(getContainerId(3),
+ Collections.emptyList());
+ } else {
+ verifyDeniedDevices(getContainerId(3), Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+ new GpuDevice(2, 3)));
+ }
+
+
+ /* Start container 4, asks for 0 GPUs, succeeded */
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(4, 0, dockerContainerEnabled));
+
+ if (dockerContainerEnabled) {
+ verifyDeniedDevices(getContainerId(4),
+ Collections.emptyList());
+ } else{
+ // All devices will be blocked
+ verifyDeniedDevices(getContainerId(4), Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+ new GpuDevice(3, 4)));
+ }
+
+ /* Release container-1, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(3,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Release container-3, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(3));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @Test
+ public void testAllocationWhenDockerContainerEnabled() throws Exception {
+ // When docker container is enabled, no devices should be written to
+ // devices.deny.
+ commonTestAllocation(true);
+ }
+
+ @Test
+ public void testAllocation() throws Exception {
+ commonTestAllocation(false);
+ }
+
+ @SuppressWarnings("unchecked")
+ @Test
+ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
+ throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ doThrow(new IOException("Exception ...")).when(mockNMStateStore)
+ .storeAssignedResources(
+ any(Container.class), anyString(), anyList());
+
+ boolean exception = false;
+ /* Start container 1, asks for 3 GPUs */
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+ } catch (ResourceHandlerException e) {
+ exception = true;
+ }
+
+ Assert.assertTrue("preStart should throw exception", exception);
+
+ // After preStart, we still have 4 available GPUs since the store op fails.
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @Test
+ public void testAllocationWithoutAllowedGpus() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ try {
+ gpuResourceHandler.bootstrap(conf);
+ Assert.fail("Should fail because no GPU available");
+ } catch (ResourceHandlerException e) {
+ // Expected because of no resource available
+ }
+
+ /* Start container 1, asks for 0 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
+ verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+
+ /* Start container 2, asks for 1 GPU. Expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Release container 1, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(0,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @Test
+ public void testAllocationStored() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asks for 3 GPUs */
+ Container container = mockContainerWithGpuRequest(1, 3);
+ gpuResourceHandler.preStart(container);
+
+ verify(mockNMStateStore).storeAssignedResources(container,
+ ResourceInformation.GPU_URI, Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+ new GpuDevice(2, 3)));
+
+ // Only device=4 will be blocked.
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3, 4)));
+
+ /* Start container 2, asks for 0 GPUs, succeeded */
+ container = mockContainerWithGpuRequest(2, 0);
+ gpuResourceHandler.preStart(container);
+
+ verifyDeniedDevices(getContainerId(2), Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+ new GpuDevice(3, 4)));
+ Assert.assertEquals(0, container.getResourceMappings()
+ .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+ // Store assigned resource will not be invoked.
+ verify(mockNMStateStore, never()).storeAssignedResources(
+ eq(container), eq(ResourceInformation.GPU_URI),
+ anyListOf(Serializable.class));
+ }
+
+ @Test
+ public void testAllocationStoredWithNULLStateStore() throws Exception {
+ NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class);
+
+ Context nmnctx = mock(Context.class);
+ when(nmnctx.getNMStateStore()).thenReturn(mockNMNULLStateStore);
+
+ GpuResourceHandlerImpl gpuNULLStateResourceHandler =
+ new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
+ mockPrivilegedExecutor);
+
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuNULLStateResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuNULLStateResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asks for 3 GPUs */
+ Container container = mockContainerWithGpuRequest(1, 3);
+ gpuNULLStateResourceHandler.preStart(container);
+
+ verify(nmnctx.getNMStateStore()).storeAssignedResources(container,
+ ResourceInformation.GPU_URI, Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+ new GpuDevice(2, 3)));
+ }
+
+ @Test
+ public void testRecoverResourceAllocation() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ Container nmContainer = mock(Container.class);
+ ResourceMappings rmap = new ResourceMappings();
+ ResourceMappings.AssignedResources ar =
+ new ResourceMappings.AssignedResources();
+ ar.updateAssignedResources(
+ Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3)));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(1), nmContainer);
+
+ // TEST CASE
+ // Reacquire container restore state of GPU Resource Allocator.
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+
+ Map deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1)));
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+ getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is not in allowed list.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=5 is not in allowed list.
+ ar.updateAssignedResources(
+ Arrays.asList(new GpuDevice(3, 4), new GpuDevice(4, 5)));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ boolean caughtException = false;
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(deviceAllocationMapping.keySet()
+ .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
+ Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+ getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is already assigned.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=3 is already assigned
+ ar.updateAssignedResources(
+ Arrays.asList(new GpuDevice(3, 4), new GpuDevice(2, 3)));
+ rmap.addAssignedResources("gpu", ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ caughtException = false;
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(deviceAllocationMapping.keySet()
+ .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
+ Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+ getContainerId(1));
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
index 318ae6bb73a..a147afb881c 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
@@ -70,7 +70,7 @@
private static class MockExecutor extends ContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
}
@Override
public void startLocalizer(LocalizerStartContext ctx)
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
new file mode 100644
index 00000000000..bcadf76e4bd
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
@@ -0,0 +1,261 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.service.ServiceOperations;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase;
+import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
+import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+public class TestResourcePluginManager extends NodeManagerTestBase {
+ private NodeManager nm;
+
+ ResourcePluginManager stubResourcePluginmanager() {
+ // Stub ResourcePluginManager
+ final ResourcePluginManager rpm = mock(ResourcePluginManager.class);
+ Map plugins = new HashMap<>();
+
+ // First resource plugin
+ ResourcePlugin resourcePlugin = mock(ResourcePlugin.class);
+ NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = mock(
+ NodeResourceUpdaterPlugin.class);
+ when(resourcePlugin.getNodeResourceHandlerInstance()).thenReturn(
+ nodeResourceUpdaterPlugin);
+ plugins.put("resource1", resourcePlugin);
+
+ // Second resource plugin
+ resourcePlugin = mock(ResourcePlugin.class);
+ when(resourcePlugin.createResourceHandler(any(Context.class), any(
+ CGroupsHandler.class), any(PrivilegedOperationExecutor.class)))
+ .thenReturn(new CustomizedResourceHandler());
+ plugins.put("resource2", resourcePlugin);
+ when(rpm.getNameToPlugins()).thenReturn(plugins);
+ return rpm;
+ }
+
+ @After
+ public void tearDown() {
+ if (nm != null) {
+ try {
+ ServiceOperations.stop(nm);
+ } catch (Throwable t) {
+ // ignore
+ }
+ }
+ }
+
+ private class CustomizedResourceHandler implements ResourceHandler {
+
+ @Override
+ public List bootstrap(Configuration configuration)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List preStart(Container container)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List reacquireContainer(ContainerId containerId)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List postComplete(ContainerId containerId)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List teardown()
+ throws ResourceHandlerException {
+ return null;
+ }
+ }
+
+ private class MyMockNM extends NodeManager {
+ private final ResourcePluginManager rpm;
+
+ public MyMockNM(ResourcePluginManager rpm) {
+ this.rpm = rpm;
+ }
+
+ @Override
+ protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+ Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+ ((NodeManager.NMContext)context).setResourcePluginManager(rpm);
+ return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+ metrics, new BaseResourceTrackerForTest());
+ }
+
+ @Override
+ protected ContainerManagerImpl createContainerManager(Context context,
+ ContainerExecutor exec, DeletionService del,
+ NodeStatusUpdater nodeStatusUpdater,
+ ApplicationACLsManager aclsManager,
+ LocalDirsHandlerService diskhandler) {
+ return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+ metrics, diskhandler);
+ }
+
+ @Override
+ protected ResourcePluginManager createResourcePluginManager() {
+ return rpm;
+ }
+ }
+
+ public class MyLCE extends LinuxContainerExecutor {
+ private PrivilegedOperationExecutor poe = mock(PrivilegedOperationExecutor.class);
+
+ @Override
+ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
+ return poe;
+ }
+ }
+
+ /*
+ * Make sure ResourcePluginManager is initialized during NM start up.
+ */
+ @Test(timeout = 30000)
+ public void testResourcePluginManagerInitialization() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+ nm = new MyMockNM(rpm);
+
+ YarnConfiguration conf = createNMConfig();
+ nm.init(conf);
+ verify(rpm, times(1)).initialize(
+ any(Context.class));
+ }
+
+ /*
+ * Make sure ResourcePluginManager is invoked during NM update.
+ */
+ @Test(timeout = 30000)
+ public void testNodeStatusUpdaterWithResourcePluginsEnabled() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+
+ nm = new MyMockNM(rpm);
+
+ YarnConfiguration conf = createNMConfig();
+ nm.init(conf);
+ nm.start();
+
+ NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin =
+ rpm.getNameToPlugins().get("resource1")
+ .getNodeResourceHandlerInstance();
+
+ verify(nodeResourceUpdaterPlugin, times(1)).updateConfiguredResource(
+ any(Resource.class));
+ }
+
+ /*
+ * Make sure ResourcePluginManager is used to initialize ResourceHandlerChain
+ */
+ @Test(timeout = 30000)
+ public void testLinuxContainerExecutorWithResourcePluginsEnabled() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+ final LinuxContainerExecutor lce = new MyLCE();
+
+ nm = new NodeManager() {
+ @Override
+ protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+ Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+ ((NMContext)context).setResourcePluginManager(rpm);
+ return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+ metrics, new BaseResourceTrackerForTest());
+ }
+
+ @Override
+ protected ContainerManagerImpl createContainerManager(Context context,
+ ContainerExecutor exec, DeletionService del,
+ NodeStatusUpdater nodeStatusUpdater,
+ ApplicationACLsManager aclsManager,
+ LocalDirsHandlerService diskhandler) {
+ return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+ metrics, diskhandler);
+ }
+
+ @Override
+ protected ContainerExecutor createContainerExecutor(Configuration conf) {
+ ((NMContext)this.getNMContext()).setResourcePluginManager(rpm);
+ lce.setConf(conf);
+ return lce;
+ }
+ };
+
+ YarnConfiguration conf = createNMConfig();
+
+ nm.init(conf);
+ nm.start();
+
+ ResourceHandler handler = lce.getResourceHandler();
+ Assert.assertNotNull(handler);
+ Assert.assertTrue(handler instanceof ResourceHandlerChain);
+
+ boolean newHandlerAdded = false;
+ for (ResourceHandler h : ((ResourceHandlerChain) handler)
+ .getResourceHandlerList()) {
+ if (h instanceof CustomizedResourceHandler) {
+ newHandlerAdded = true;
+ break;
+ }
+ }
+ Assert.assertTrue("New ResourceHandler should be added", newHandlerAdded);
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
new file mode 100644
index 00000000000..4abb633a69a
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+public class TestGpuDiscoverer {
+ private String getTestParentFolder() {
+ File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
+ return f.getAbsolutePath();
+ }
+
+ private void touchFile(File f) throws IOException {
+ new FileOutputStream(f).close();
+ }
+
+ @Before
+ public void before() throws IOException {
+ String folder = getTestParentFolder();
+ File f = new File(folder);
+ FileUtils.deleteDirectory(f);
+ f.mkdirs();
+ }
+
+ @Test
+ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
+ // Only run this on demand.
+ Assume.assumeTrue(Boolean.valueOf(
+ System.getProperty("RunLinuxGpuResourceDiscoverPluginConfigTest")));
+
+ // test case 1, check default setting.
+ Configuration conf = new Configuration(false);
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+ plugin.getPathOfGpuBinary());
+ Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+ Assert.assertTrue(
+ plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+
+ // test case 2, check mandatory set path.
+ File fakeBinary = new File(getTestParentFolder(),
+ GpuDiscoverer.DEFAULT_BINARY_NAME);
+ touchFile(fakeBinary);
+ conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+ plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ Assert.assertEquals(fakeBinary.getAbsolutePath(),
+ plugin.getPathOfGpuBinary());
+ Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+
+ // test case 3, the configured path is set but the binary does not exist,
+ // so the discoverer falls back to the default binary name.
+ fakeBinary.delete();
+ plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+ plugin.getPathOfGpuBinary());
+ Assert.assertTrue(
+ plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+ }
+
+ @Test
+ public void testGpuDiscover() throws YarnException {
+ // This test probes real GPU hardware, so only run it on demand when
+ // the runGpuDiscoverUnitTest property is set (-DrunGpuDiscoverUnitTest=true)
+ Assume.assumeTrue(
+ Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
+ Configuration conf = new Configuration(false);
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ GpuDeviceInformation info = plugin.getGpuDeviceInformation();
+
+ Assert.assertTrue(info.getGpus().size() > 0);
+ Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
+ info.getGpus().size());
+ }
+
+ @Test
+ public void getNumberOfUsableGpusFromConfig() throws YarnException {
+ Configuration conf = new Configuration(false);
+
+ // Illegal format
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ try {
+ plugin.initialize(conf);
+ plugin.getGpusUsableByYarn();
+ Assert.fail("Illegal format, should fail.");
+ } catch (YarnException e) {
+ // Expected
+ }
+
+ // Valid format
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
+ plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+
+ List usableGpuDevices = plugin.getGpusUsableByYarn();
+ Assert.assertEquals(4, usableGpuDevices.size());
+
+ Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
+ Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
+ Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
+ Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
+
+ Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
+ Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
+ Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
+ Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
index 0e46234a91f..4364709b56f 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.nodemanager.recovery;
import java.io.IOException;
+import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -42,6 +43,8 @@
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
@@ -124,6 +127,7 @@ public synchronized void removeApplication(ApplicationId appId)
rcsCopy.setRemainingRetryAttempts(rcs.getRemainingRetryAttempts());
rcsCopy.setWorkDir(rcs.getWorkDir());
rcsCopy.setLogDir(rcs.getLogDir());
+ rcsCopy.setResourceMappings(rcs.getResourceMappings());
result.add(rcsCopy);
}
return result;
@@ -511,6 +515,20 @@ public synchronized void removeAMRMProxyAppContext(
amrmProxyState.getAppContexts().remove(attempt);
}
+ @Override
+ public void storeAssignedResources(Container container,
+ String resourceType, List assignedResources)
+ throws IOException {
+ ResourceMappings.AssignedResources ar =
+ new ResourceMappings.AssignedResources();
+ ar.updateAssignedResources(assignedResources);
+ containerStates.get(container.getContainerId()).getResourceMappings()
+ .addAssignedResources(resourceType, ar);
+
+ // update container resource mapping.
+ updateContainerResourceMapping(container, resourceType, assignedResources);
+ }
+
private static class TrackerState {
Map inProgressMap =
new HashMap();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index a5079382c1d..20c5240c30b 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -29,9 +29,11 @@
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.timeout;
import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
import java.io.File;
import java.io.IOException;
+import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
@@ -68,6 +70,8 @@
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.nodemanager.amrmproxy.AMRMProxyTokenSecretManager;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.LocalResourceTrackerState;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState;
@@ -1003,46 +1007,12 @@ public void testUnexpectedKeyDoesntThrowException() throws IOException {
.loadContainersState();
assertTrue(recoveredContainers.isEmpty());
- // create a container request
ApplicationId appId = ApplicationId.newInstance(1234, 3);
ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId,
4);
ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
- LocalResource lrsrc = LocalResource.newInstance(
- URL.newInstance("hdfs", "somehost", 12345, "/some/path/to/rsrc"),
- LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, 123L,
- 1234567890L);
- Map localResources =
- new HashMap();
- localResources.put("rsrc", lrsrc);
- Map env = new HashMap();
- env.put("somevar", "someval");
- List containerCmds = new ArrayList();
- containerCmds.add("somecmd");
- containerCmds.add("somearg");
- Map serviceData = new HashMap();
- serviceData.put("someservice",
- ByteBuffer.wrap(new byte[] { 0x1, 0x2, 0x3 }));
- ByteBuffer containerTokens = ByteBuffer
- .wrap(new byte[] { 0x7, 0x8, 0x9, 0xa });
- Map acls =
- new HashMap();
- acls.put(ApplicationAccessType.VIEW_APP, "viewuser");
- acls.put(ApplicationAccessType.MODIFY_APP, "moduser");
- ContainerLaunchContext clc = ContainerLaunchContext.newInstance(
- localResources, env, containerCmds,
- serviceData, containerTokens, acls);
- Resource containerRsrc = Resource.newInstance(1357, 3);
- ContainerTokenIdentifier containerTokenId = new ContainerTokenIdentifier(
- containerId, "host", "user", containerRsrc, 9876543210L, 42, 2468,
- Priority.newInstance(7), 13579);
- Token containerToken = Token.newInstance(containerTokenId.getBytes(),
- ContainerTokenIdentifier.KIND.toString(), "password".getBytes(),
- "tokenservice");
- StartContainerRequest containerReq = StartContainerRequest.newInstance(clc,
- containerToken);
-
- stateStore.storeContainer(containerId, 0, 0, containerReq);
+ StartContainerRequest startContainerRequest = storeMockContainer(
+ containerId);
// add a invalid key
byte[] invalidKey = ("ContainerManager/containers/"
@@ -1055,7 +1025,7 @@ public void testUnexpectedKeyDoesntThrowException() throws IOException {
assertEquals(RecoveredContainerStatus.REQUESTED, rcs.getStatus());
assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
assertEquals(false, rcs.getKilled());
- assertEquals(containerReq, rcs.getStartRequest());
+ assertEquals(startContainerRequest, rcs.getStartRequest());
assertTrue(rcs.getDiagnostics().isEmpty());
assertEquals(RecoveredContainerType.KILL, rcs.getRecoveryType());
// assert unknown keys are cleaned up finally
@@ -1163,6 +1133,98 @@ public void testAMRMProxyStorage() throws IOException {
}
}
+ @Test
+ public void testStateStoreForResourceMapping() throws IOException {
+ // test empty when no state
+ List recoveredContainers = stateStore
+ .loadContainersState();
+ assertTrue(recoveredContainers.isEmpty());
+
+ ApplicationId appId = ApplicationId.newInstance(1234, 3);
+ ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId,
+ 4);
+ ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
+ storeMockContainer(containerId);
+
+ Container container = mock(Container.class);
+ when(container.getContainerId()).thenReturn(containerId);
+ ResourceMappings resourceMappings = new ResourceMappings();
+ when(container.getResourceMappings()).thenReturn(resourceMappings);
+
+ // Store ResourceMapping
+ stateStore.storeAssignedResources(container, "gpu",
+ Arrays.asList("1", "2", "3"));
+ // This second store call overwrites the "gpu" resources stored above
+ List gpuRes1 = Arrays.asList("1", "2", "4");
+ stateStore.storeAssignedResources(container, "gpu", gpuRes1);
+ List fpgaRes =
+ Arrays.asList("3", "4", "5", "6");
+ stateStore.storeAssignedResources(container, "fpga", fpgaRes);
+ List numaRes = Arrays.asList("numa1");
+ stateStore.storeAssignedResources(container, "numa", numaRes);
+
+ // restart the state store and verify the stored mappings are recovered
+ restartStateStore();
+ recoveredContainers = stateStore.loadContainersState();
+ assertEquals(1, recoveredContainers.size());
+ RecoveredContainerState rcs = recoveredContainers.get(0);
+ List res = rcs.getResourceMappings()
+ .getAssignedResources("gpu");
+ Assert.assertTrue(res.equals(gpuRes1));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("gpu").equals(gpuRes1));
+
+ res = rcs.getResourceMappings().getAssignedResources("fpga");
+ Assert.assertTrue(res.equals(fpgaRes));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("fpga").equals(fpgaRes));
+
+ res = rcs.getResourceMappings().getAssignedResources("numa");
+ Assert.assertTrue(res.equals(numaRes));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("numa").equals(numaRes));
+ }
+
+ private StartContainerRequest storeMockContainer(ContainerId containerId)
+ throws IOException {
+ // create a container request
+ LocalResource lrsrc = LocalResource.newInstance(
+ URL.newInstance("hdfs", "somehost", 12345, "/some/path/to/rsrc"),
+ LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, 123L,
+ 1234567890L);
+ Map localResources =
+ new HashMap();
+ localResources.put("rsrc", lrsrc);
+ Map env = new HashMap();
+ env.put("somevar", "someval");
+ List containerCmds = new ArrayList();
+ containerCmds.add("somecmd");
+ containerCmds.add("somearg");
+ Map serviceData = new HashMap();
+ serviceData.put("someservice",
+ ByteBuffer.wrap(new byte[] { 0x1, 0x2, 0x3 }));
+ ByteBuffer containerTokens = ByteBuffer
+ .wrap(new byte[] { 0x7, 0x8, 0x9, 0xa });
+ Map acls =
+ new HashMap();
+ acls.put(ApplicationAccessType.VIEW_APP, "viewuser");
+ acls.put(ApplicationAccessType.MODIFY_APP, "moduser");
+ ContainerLaunchContext clc = ContainerLaunchContext.newInstance(
+ localResources, env, containerCmds,
+ serviceData, containerTokens, acls);
+ Resource containerRsrc = Resource.newInstance(1357, 3);
+ ContainerTokenIdentifier containerTokenId = new ContainerTokenIdentifier(
+ containerId, "host", "user", containerRsrc, 9876543210L, 42, 2468,
+ Priority.newInstance(7), 13579);
+ Token containerToken = Token.newInstance(containerTokenId.getBytes(),
+ ContainerTokenIdentifier.KIND.toString(), "password".getBytes(),
+ "tokenservice");
+ StartContainerRequest containerReq = StartContainerRequest.newInstance(clc,
+ containerToken);
+ stateStore.storeContainer(containerId, 0, 0, containerReq);
+ return containerReq;
+ }
+
private static class NMTokenSecretManagerForTest extends
BaseNMTokenSecretManager {
public MasterKey generateKey() {
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestNodeManagerHardwareUtils.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestNodeManagerHardwareUtils.java
index 4add586bbf1..767c308aeb6 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestNodeManagerHardwareUtils.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestNodeManagerHardwareUtils.java
@@ -172,7 +172,7 @@ public void testGetContainerMemoryMB() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.NM_ENABLE_HARDWARE_CAPABILITY_DETECTION,
true);
- int mem = NodeManagerHardwareUtils.getContainerMemoryMB(null, conf);
+ long mem = NodeManagerHardwareUtils.getContainerMemoryMB(null, conf);
Assert.assertEquals(YarnConfiguration.DEFAULT_NM_PMEM_MB, mem);
mem = NodeManagerHardwareUtils.getContainerMemoryMB(plugin, conf);
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
index b9c6fffd6b2..29c20382ea8 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
@@ -37,6 +37,7 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceSet;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
@@ -242,4 +243,9 @@ public void sendPauseEvent(String description) {
public long getContainerStartTime() {
return 0;
}
+
+ @Override
+ public ResourceMappings getResourceMappings() {
+ return null;
+ }
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java
index 4586a7b88c4..980eae95841 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java
@@ -18,25 +18,20 @@
package org.apache.hadoop.yarn.server.nodemanager.webapp;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.StringReader;
-import java.net.HttpURLConnection;
-import java.net.URI;
-import java.net.URL;
-import java.util.List;
-import javax.servlet.http.HttpServletResponse;
-import javax.ws.rs.core.MediaType;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-
-import org.junit.Assert;
+import com.google.inject.Guice;
+import com.google.inject.Injector;
+import com.google.inject.servlet.GuiceServletContextListener;
+import com.google.inject.servlet.ServletModule;
+import com.sun.jersey.api.client.ClientResponse;
+import com.sun.jersey.api.client.ClientResponse.Status;
+import com.sun.jersey.api.client.GenericType;
+import com.sun.jersey.api.client.UniformInterfaceException;
+import com.sun.jersey.api.client.WebResource;
+import com.sun.jersey.guice.spi.container.servlet.GuiceContainer;
+import com.sun.jersey.test.framework.WebAppDescriptor;
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlRootElement;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -48,6 +43,7 @@
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
+import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.logaggregation.ContainerLogAggregationType;
import org.apache.hadoop.yarn.logaggregation.ContainerLogFileInfo;
import org.apache.hadoop.yarn.logaggregation.TestContainerLogsUtils;
@@ -59,7 +55,15 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.server.webapp.YarnWebServiceParams;
@@ -73,6 +77,7 @@
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import org.junit.AfterClass;
+import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.w3c.dom.Document;
@@ -80,24 +85,35 @@
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
-import com.google.inject.Guice;
-import com.google.inject.Injector;
-import com.google.inject.servlet.GuiceServletContextListener;
-import com.google.inject.servlet.ServletModule;
-import com.sun.jersey.api.client.ClientResponse;
-import com.sun.jersey.api.client.ClientResponse.Status;
-import com.sun.jersey.api.client.GenericType;
-import com.sun.jersey.api.client.UniformInterfaceException;
-import com.sun.jersey.api.client.WebResource;
-import com.sun.jersey.guice.spi.container.servlet.GuiceContainer;
-import com.sun.jersey.test.framework.WebAppDescriptor;
+import javax.servlet.http.HttpServletResponse;
+import javax.ws.rs.core.MediaType;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringReader;
+import java.net.HttpURLConnection;
+import java.net.URI;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
/**
* Test the nodemanager node info web services api's
*/
public class TestNMWebServices extends JerseyTestBase {
- private static Context nmContext;
+ private static NodeManager.NMContext nmContext;
private static ResourceView resourceView;
private static ApplicationACLsManager aclsManager;
private static LocalDirsHandlerService dirsHandler;
@@ -411,6 +427,112 @@ public void testNMRedirect() {
assertFalse(redirectURL.contains(YarnWebServiceParams.NM_ID));
}
+ @XmlRootElement
+ @XmlAccessorType(XmlAccessType.FIELD)
+ private static class MockNMResourceInfo extends NMResourceInfo {
+ public long a = 1000L;
+ public MockNMResourceInfo() { }
+ }
+
+ @Test
+ public void testGetNMResourceInfo()
+ throws YarnException, InterruptedException, JSONException {
+ ResourcePluginManager rpm = mock(ResourcePluginManager.class);
+ Map namesToPlugins = new HashMap<>();
+ ResourcePlugin mockPlugin1 = mock(ResourcePlugin.class);
+ NMResourceInfo nmResourceInfo1 = new MockNMResourceInfo();
+ when(mockPlugin1.getNMResourceInfo()).thenReturn(nmResourceInfo1);
+ namesToPlugins.put("resource-1", mockPlugin1);
+ namesToPlugins.put("yarn.io/resource-1", mockPlugin1);
+ ResourcePlugin mockPlugin2 = mock(ResourcePlugin.class);
+ namesToPlugins.put("resource-2", mockPlugin2);
+ when(rpm.getNameToPlugins()).thenReturn(namesToPlugins);
+
+ nmContext.setResourcePluginManager(rpm);
+
+ WebResource r = resource();
+ ClientResponse response = r.path("ws").path("v1").path("node").path(
+ "resources").path("resource-2").accept(MediaType.APPLICATION_JSON).get(
+ ClientResponse.class);
+ assertEquals(MediaType.APPLICATION_JSON, response.getType().toString());
+
+ // Access resource-2 should fail (its plugin returns no NMResourceInfo, so the body is "null").
+ String resp = response.getEntity(String.class);
+ assertEquals("null", resp);
+
+ // Access resource-3 should fail (unknown plugin)
+ response = r.path("ws").path("v1").path("node").path(
+ "resources").path("resource-3").accept(MediaType.APPLICATION_JSON).get(
+ ClientResponse.class);
+ assertEquals(MediaType.APPLICATION_JSON, response.getType().toString());
+ resp = response.getEntity(String.class);
+ assertEquals("null", resp);
+
+ // Access resource-1 should succeed
+ response = r.path("ws").path("v1").path("node").path(
+ "resources").path("resource-1").accept(MediaType.APPLICATION_JSON).get(
+ ClientResponse.class);
+ assertEquals(MediaType.APPLICATION_JSON, response.getType().toString());
+ JSONObject json = response.getEntity(JSONObject.class);
+ assertEquals(1000, Long.parseLong(json.get("a").toString()));
+
+ // Access resource-1 should succeed (URL-encoded path "yarn.io%2Fresource-1").
+ response = r.path("ws").path("v1").path("node").path("resources").path(
+ "yarn.io%2Fresource-1").accept(MediaType.APPLICATION_JSON).get(
+ ClientResponse.class);
+ assertEquals(MediaType.APPLICATION_JSON, response.getType().toString());
+ json = response.getEntity(JSONObject.class);
+ assertEquals(1000, Long.parseLong(json.get("a").toString()));
+ }
+
+ private ContainerId createContainerId(int id) {
+ ApplicationId appId = ApplicationId.newInstance(0, 0);
+ ApplicationAttemptId appAttemptId =
+ ApplicationAttemptId.newInstance(appId, 1);
+ ContainerId containerId = ContainerId.newContainerId(appAttemptId, id);
+ return containerId;
+ }
+
+ @Test
+ public void testGetYarnGpuResourceInfo()
+ throws YarnException, InterruptedException, JSONException {
+ ResourcePluginManager rpm = mock(ResourcePluginManager.class);
+ Map namesToPlugins = new HashMap<>();
+ ResourcePlugin mockPlugin1 = mock(ResourcePlugin.class);
+ GpuDeviceInformation gpuDeviceInformation = new GpuDeviceInformation();
+ gpuDeviceInformation.setDriverVersion("1.2.3");
+ gpuDeviceInformation.setGpus(Arrays.asList(new PerGpuDeviceInformation()));
+ NMResourceInfo nmResourceInfo1 = new NMGpuResourceInfo(gpuDeviceInformation,
+ Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 2),
+ new GpuDevice(3, 3)), Arrays
+ .asList(new AssignedGpuDevice(2, 2, createContainerId(1)),
+ new AssignedGpuDevice(3, 3, createContainerId(2))));
+ when(mockPlugin1.getNMResourceInfo()).thenReturn(nmResourceInfo1);
+ namesToPlugins.put("resource-1", mockPlugin1);
+ namesToPlugins.put("yarn.io/resource-1", mockPlugin1);
+ ResourcePlugin mockPlugin2 = mock(ResourcePlugin.class);
+ namesToPlugins.put("resource-2", mockPlugin2);
+ when(rpm.getNameToPlugins()).thenReturn(namesToPlugins);
+
+ nmContext.setResourcePluginManager(rpm);
+
+ WebResource r = resource();
+ ClientResponse response;
+ JSONObject json;
+
+ // Access resource-1 should succeed
+ response = r.path("ws").path("v1").path("node").path(
+ "resources").path("resource-1").accept(MediaType.APPLICATION_JSON).get(
+ ClientResponse.class);
+ assertEquals(MediaType.APPLICATION_JSON, response.getType().toString());
+ json = response.getEntity(JSONObject.class);
+ assertEquals("1.2.3",
+ json.getJSONObject("gpuDeviceInformation").get("driver_version"));
+ assertEquals(3, json.getJSONArray("totalGpuDevices").length());
+ assertEquals(2, json.getJSONArray("assignedGpuDevices").length());
+ assertEquals(2, json.getJSONArray("assignedGpuDevices").length());
+ }
+
private void testContainerLogs(WebResource r, ContainerId containerId)
throws IOException {
final String containerIdStr = containerId.toString();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java
new file mode 100644
index 00000000000..dc96746cf5d
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+
+public class TestGpuDeviceInformationParser {
+ @Test
+ public void testParse() throws IOException, YarnException {
+ File f = new File("src/test/resources/nvidia-smi-sample-xml-output");
+ String s = FileUtils.readFileToString(f, "UTF-8");
+
+ GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+
+ GpuDeviceInformation info = parser.parseXml(s);
+ Assert.assertEquals("375.66", info.getDriverVersion());
+ Assert.assertEquals(2, info.getGpus().size());
+ PerGpuDeviceInformation gpu1 = info.getGpus().get(1);
+ Assert.assertEquals("Tesla P100-PCIE-12GB", gpu1.getProductName());
+ Assert.assertEquals(12193, gpu1.getGpuMemoryUsage().getTotalMemoryMiB());
+ Assert.assertEquals(10.3f,
+ gpu1.getGpuUtilizations().getOverallGpuUtilization(), 1e-6);
+ Assert.assertEquals(34f, gpu1.getTemperature().getCurrentGpuTemp(), 1e-6);
+ Assert.assertEquals(85f, gpu1.getTemperature().getMaxGpuTemp(), 1e-6);
+ Assert.assertEquals(82f, gpu1.getTemperature().getSlowThresholdGpuTemp(),
+ 1e-6);
+ }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
new file mode 100644
index 00000000000..5ccb72265b5
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
@@ -0,0 +1,547 @@
+
+
+
+
+
+
+ Wed Sep 6 21:52:51 2017
+ 375.66
+ 2
+
+ Tesla P100-PCIE-12GB
+ Tesla
+ Disabled
+ Disabled
+ Disabled
+ Disabled
+ 1920
+
+ N/A
+ N/A
+
+ 0320717030197
+ GPU-28604e81-21ec-cc48-6759-bf2648b22e16
+ 0
+ 86.00.3A.00.02
+ No
+ 0x400
+ 900-2H400-0110-030
+
+ H400.0202.00.01
+ 1.1
+ 4.1
+ N/A
+
+
+ N/A
+ N/A
+
+
+ None
+
+
+ 04
+ 00
+ 0000
+ 15F710DE
+ 0000:04:00.0
+ 11DA10DE
+
+
+ 3
+ 3
+
+
+ 16x
+ 16x
+
+
+
+ N/A
+ N/A
+
+ 0
+ 0 KB/s
+ 0 KB/s
+
+ N/A
+ P0
+
+ Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+
+
+ 12193 MiB
+ 0 MiB
+ 12193 MiB
+
+
+ 16384 MiB
+ 2 MiB
+ 16382 MiB
+
+ Default
+
+ 0 %
+ 0 %
+ 0 %
+ 0 %
+
+
+ 0
+ 0
+ 0 ms
+
+
+ Enabled
+ Enabled
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+
+ 0
+
+
+
+
+ 0
+
+
+
+ No
+
+
+ 31 C
+ 85 C
+ 82 C
+
+
+ P0
+ Supported
+ 24.84 W
+ 250.00 W
+ 250.00 W
+ 250.00 W
+ 125.00 W
+ 250.00 W
+
+
+ 405 MHz
+ 405 MHz
+ 715 MHz
+ 835 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1328 MHz
+ 1328 MHz
+ 715 MHz
+ 1328 MHz
+
+
+ N/A
+ N/A
+
+
+
+ 715 MHz
+ 1328 MHz
+ 1316 MHz
+ 1303 MHz
+ 1290 MHz
+ 1278 MHz
+ 1265 MHz
+ 1252 MHz
+ 1240 MHz
+ 1227 MHz
+ 1215 MHz
+ 1202 MHz
+ 1189 MHz
+ 1177 MHz
+ 1164 MHz
+ 1151 MHz
+ 1139 MHz
+ 1126 MHz
+ 1113 MHz
+ 1101 MHz
+ 1088 MHz
+ 1075 MHz
+ 1063 MHz
+ 1050 MHz
+ 1037 MHz
+ 1025 MHz
+ 1012 MHz
+ 999 MHz
+ 987 MHz
+ 974 MHz
+ 961 MHz
+ 949 MHz
+ 936 MHz
+ 923 MHz
+ 911 MHz
+ 898 MHz
+ 885 MHz
+ 873 MHz
+ 860 MHz
+ 847 MHz
+ 835 MHz
+ 822 MHz
+ 810 MHz
+ 797 MHz
+ 784 MHz
+ 772 MHz
+ 759 MHz
+ 746 MHz
+ 734 MHz
+ 721 MHz
+ 708 MHz
+ 696 MHz
+ 683 MHz
+ 670 MHz
+ 658 MHz
+ 645 MHz
+ 632 MHz
+ 620 MHz
+ 607 MHz
+ 594 MHz
+ 582 MHz
+ 569 MHz
+ 556 MHz
+ 544 MHz
+
+
+
+
+
+
+
+
+
+ Tesla P100-PCIE-12GB
+ Tesla
+ Disabled
+ Disabled
+ Disabled
+ Disabled
+ 1920
+
+ N/A
+ N/A
+
+ 0320717031755
+ GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3
+ 1
+ 86.00.3A.00.02
+ No
+ 0x8200
+ 900-2H400-0110-030
+
+ H400.0202.00.01
+ 1.1
+ 4.1
+ N/A
+
+
+ N/A
+ N/A
+
+
+ None
+
+
+ 82
+ 00
+ 0000
+ 15F710DE
+ 0000:82:00.0
+ 11DA10DE
+
+
+ 3
+ 3
+
+
+ 16x
+ 16x
+
+
+
+ N/A
+ N/A
+
+ 0
+ 0 KB/s
+ 0 KB/s
+
+ N/A
+ P0
+
+ Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+
+
+ 12193 MiB
+ 0 MiB
+ 12193 MiB
+
+
+ 16384 MiB
+ 2 MiB
+ 16382 MiB
+
+ Default
+
+ 10.3 %
+ 0 %
+ 0 %
+ 0 %
+
+
+ 0
+ 0
+ 0 ms
+
+
+ Enabled
+ Enabled
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ N/A
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+
+ 0
+
+
+
+
+ 0
+
+
+
+ No
+
+
+ 34 C
+ 85 C
+ 82 C
+
+
+ P0
+ Supported
+ 25.54 W
+ 250.00 W
+ 250.00 W
+ 250.00 W
+ 125.00 W
+ 250.00 W
+
+
+ 405 MHz
+ 405 MHz
+ 715 MHz
+ 835 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1189 MHz
+ 715 MHz
+
+
+ 1328 MHz
+ 1328 MHz
+ 715 MHz
+ 1328 MHz
+
+
+ N/A
+ N/A
+
+
+
+ 715 MHz
+ 1328 MHz
+ 1316 MHz
+ 1303 MHz
+ 1290 MHz
+ 1278 MHz
+ 1265 MHz
+ 1252 MHz
+ 1240 MHz
+ 1227 MHz
+ 1215 MHz
+ 1202 MHz
+ 1189 MHz
+ 1177 MHz
+ 1164 MHz
+ 1151 MHz
+ 1139 MHz
+ 1126 MHz
+ 1113 MHz
+ 1101 MHz
+ 1088 MHz
+ 1075 MHz
+ 1063 MHz
+ 1050 MHz
+ 1037 MHz
+ 1025 MHz
+ 1012 MHz
+ 999 MHz
+ 987 MHz
+ 974 MHz
+ 961 MHz
+ 949 MHz
+ 936 MHz
+ 923 MHz
+ 911 MHz
+ 898 MHz
+ 885 MHz
+ 873 MHz
+ 860 MHz
+ 847 MHz
+ 835 MHz
+ 822 MHz
+ 810 MHz
+ 797 MHz
+ 784 MHz
+ 772 MHz
+ 759 MHz
+ 746 MHz
+ 734 MHz
+ 721 MHz
+ 708 MHz
+ 696 MHz
+ 683 MHz
+ 670 MHz
+ 658 MHz
+ 645 MHz
+ 632 MHz
+ 620 MHz
+ 607 MHz
+ 594 MHz
+ 582 MHz
+ 569 MHz
+ 556 MHz
+ 544 MHz
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AMSProcessingChain.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AMSProcessingChain.java
index 931b1c8b7d5..7ae23e7bb63 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AMSProcessingChain.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AMSProcessingChain.java
@@ -82,7 +82,7 @@ public synchronized void addProcessor(
public void registerApplicationMaster(
ApplicationAttemptId applicationAttemptId,
RegisterApplicationMasterRequest request,
- RegisterApplicationMasterResponse resp) throws IOException {
+ RegisterApplicationMasterResponse resp) throws IOException, YarnException {
this.head.registerApplicationMaster(applicationAttemptId, request, resp);
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java
index 6c0a8541223..3c117bc4b07 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java
@@ -400,14 +400,32 @@ public RefreshQueuesResponse refreshQueues(RefreshQueuesRequest request)
}
}
+ protected Configuration loadNewConfiguration()
+ throws IOException, YarnException {
+ // Retrieve yarn-site.xml in order to refresh scheduling monitor properties.
+ Configuration conf = getConfiguration(new Configuration(false),
+ YarnConfiguration.YARN_SITE_CONFIGURATION_FILE,
+ YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE);
+ // The reason we call Configuration#size() is that when getConfiguration
+ // is called, it invokes Configuration#addResource, which invokes
+ // Configuration#reloadConfiguration, which triggers the reload process in a
+ // lazy way: the properties will only be reloaded when needed rather than
+ // right after getConfiguration is called. So here we call
+ // Configuration#size() to force Configuration#getProps to be called and
+ // reload all the properties.
+ conf.size();
+ return conf;
+ }
+
@Private
public void refreshQueues() throws IOException, YarnException {
- rm.getRMContext().getScheduler().reinitialize(getConfig(),
+ Configuration conf = loadNewConfiguration();
+ rm.getRMContext().getScheduler().reinitialize(conf,
this.rm.getRMContext());
// refresh the reservation system
ReservationSystem rSystem = rm.getRMContext().getReservationSystem();
if (rSystem != null) {
- rSystem.reinitialize(getConfig(), rm.getRMContext());
+ rSystem.reinitialize(conf, rm.getRMContext());
}
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java
index a10413ba143..16d6416f89f 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java
@@ -112,6 +112,8 @@
import org.apache.hadoop.yarn.api.protocolrecords.UpdateApplicationPriorityResponse;
import org.apache.hadoop.yarn.api.protocolrecords.UpdateApplicationTimeoutsRequest;
import org.apache.hadoop.yarn.api.protocolrecords.UpdateApplicationTimeoutsResponse;
+import org.apache.hadoop.yarn.api.protocolrecords.GetAllResourceTypeInfoRequest;
+import org.apache.hadoop.yarn.api.protocolrecords.GetAllResourceTypeInfoResponse;
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptReport;
@@ -174,6 +176,7 @@
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.Records;
import org.apache.hadoop.yarn.util.UTCClock;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.timeline.TimelineUtils;
import com.google.common.annotations.VisibleForTesting;
@@ -1783,4 +1786,12 @@ public void setDisplayPerUserApps(boolean displayPerUserApps) {
this.displayPerUserApps = displayPerUserApps;
}
+ @Override
+ public GetAllResourceTypeInfoResponse getResourceTypeInfo(
+ GetAllResourceTypeInfoRequest request) throws YarnException, IOException {
+ GetAllResourceTypeInfoResponse response =
+ GetAllResourceTypeInfoResponse.newInstance();
+ response.setResourceTypeInfo(ResourceUtils.getResourcesTypeInfo());
+ return response;
+ }
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DefaultAMSProcessor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DefaultAMSProcessor.java
index 273e0cda8b7..0baf17aa952 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DefaultAMSProcessor.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/DefaultAMSProcessor.java
@@ -111,7 +111,8 @@ public void init(ApplicationMasterServiceContext amsContext,
public void registerApplicationMaster(
ApplicationAttemptId applicationAttemptId,
RegisterApplicationMasterRequest request,
- RegisterApplicationMasterResponse response) throws IOException {
+ RegisterApplicationMasterResponse response)
+ throws IOException, YarnException {
RMApp app = getRmContext().getRMApps().get(
applicationAttemptId.getApplicationId());
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/OpportunisticContainerAllocatorAMService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/OpportunisticContainerAllocatorAMService.java
index 208300c789d..a9136d65cde 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/OpportunisticContainerAllocatorAMService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/OpportunisticContainerAllocatorAMService.java
@@ -127,7 +127,8 @@ public void init(ApplicationMasterServiceContext amsContext,
public void registerApplicationMaster(
ApplicationAttemptId applicationAttemptId,
RegisterApplicationMasterRequest request,
- RegisterApplicationMasterResponse response) throws IOException {
+ RegisterApplicationMasterResponse response)
+ throws IOException, YarnException {
final SchedulerApplicationAttempt appAttempt = ((AbstractYarnScheduler)
getScheduler()).getApplicationAttempt(applicationAttemptId);
if (appAttempt.getOpportunisticContainerContext() == null) {
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
index d6a4d2f782d..61e8a6d499a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
@@ -69,6 +69,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.SettableFuture;
+import org.apache.hadoop.yarn.util.StringHelper;
/**
* This class manages the list of applications for the resource manager.
@@ -189,7 +190,12 @@ public static SummaryBuilder createAppSummary(RMApp app) {
.add("preemptedAMContainers", metrics.getNumAMContainersPreempted())
.add("preemptedNonAMContainers", metrics.getNumNonAMContainersPreempted())
.add("preemptedResources", metrics.getResourcePreempted())
- .add("applicationType", app.getApplicationType());
+ .add("applicationType", app.getApplicationType())
+ .add("resourceSeconds", StringHelper
+ .getResourceSecondsString(metrics.getResourceSecondsMap()))
+ .add("preemptedResourceSeconds", StringHelper
+ .getResourceSecondsString(
+ metrics.getPreemptedResourceSecondsMap()));
return summary;
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java
index 5b074488931..4a853682902 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMServerUtils.java
@@ -486,7 +486,7 @@ public static YarnApplicationAttemptState createApplicationAttemptState(
DUMMY_APPLICATION_RESOURCE_USAGE_REPORT =
BuilderUtils.newApplicationResourceUsageReport(-1, -1,
Resources.createResource(-1, -1), Resources.createResource(-1, -1),
- Resources.createResource(-1, -1), 0, 0, 0, 0);
+ Resources.createResource(-1, -1), new HashMap<>(), new HashMap<>());
/**
@@ -630,4 +630,12 @@ public static int getApplicableNodeCountForAM(RMContext rmContext,
return labelsToNodes.get(label);
}
}
+
+ public static Long getOrDefault(Map<String, Long> map, String key,
+ Long defaultValue) {
+ if (map.containsKey(key)) {
+ return map.get(key);
+ }
+ return defaultValue;
+ }
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
index de8386d195e..42a7e014907 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
@@ -378,10 +378,11 @@ public RegisterNodeManagerResponse registerNodeManager(
// Check if this node has minimum allocations
if (capability.getMemorySize() < minAllocMb
|| capability.getVirtualCores() < minAllocVcores) {
- String message =
- "NodeManager from " + host
- + " doesn't satisfy minimum allocations, Sending SHUTDOWN"
- + " signal to the NodeManager.";
+ String message = "NodeManager from " + host
+ + " doesn't satisfy minimum allocations, Sending SHUTDOWN"
+ + " signal to the NodeManager. Node capabilities are " + capability
+ + "; minimums are " + minAllocMb + "mb and " + minAllocVcores
+ + " vcores";
LOG.info(message);
response.setDiagnosticsMessage(message);
response.setNodeAction(NodeAction.SHUTDOWN);
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
index e8ed0b7ee65..f0ab324ace8 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
@@ -853,11 +853,8 @@ public void storeNewApplicationAttempt(RMAppAttempt appAttempt) {
appAttempt.getAppAttemptId(),
appAttempt.getMasterContainer(),
credentials, appAttempt.getStartTime(),
- resUsage.getMemorySeconds(),
- resUsage.getVcoreSeconds(),
- attempMetrics.getPreemptedMemory(),
- attempMetrics.getPreemptedVcore()
- );
+ resUsage.getResourceUsageSecondsMap(),
+ attempMetrics.getPreemptedResourceSecondsMap());
getRMStateStoreEventHandler().handle(
new RMStateStoreAppAttemptEvent(attemptState));
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java
index 67aaf947127..2de071ad2ec 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java
@@ -25,23 +25,28 @@
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.proto.YarnServerResourceManagerRecoveryProtos.ApplicationAttemptStateDataProto;
+import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.util.Records;
+import java.util.Map;
+
/*
* Contains the state data that needs to be persisted for an ApplicationAttempt
*/
@Public
@Unstable
public abstract class ApplicationAttemptStateData {
+
public static ApplicationAttemptStateData newInstance(
ApplicationAttemptId attemptId, Container container,
Credentials attemptTokens, long startTime, RMAppAttemptState finalState,
String finalTrackingUrl, String diagnostics,
FinalApplicationStatus amUnregisteredFinalStatus, int exitStatus,
- long finishTime, long memorySeconds, long vcoreSeconds,
- long preemptedMemorySeconds, long preemptedVcoreSeconds) {
+ long finishTime, Map<String, Long> resourceSecondsMap,
+ Map<String, Long> preemptedResourceSecondsMap) {
ApplicationAttemptStateData attemptStateData =
Records.newRecord(ApplicationAttemptStateData.class);
attemptStateData.setAttemptId(attemptId);
@@ -54,23 +59,33 @@ public static ApplicationAttemptStateData newInstance(
attemptStateData.setFinalApplicationStatus(amUnregisteredFinalStatus);
attemptStateData.setAMContainerExitStatus(exitStatus);
attemptStateData.setFinishTime(finishTime);
- attemptStateData.setMemorySeconds(memorySeconds);
- attemptStateData.setVcoreSeconds(vcoreSeconds);
- attemptStateData.setPreemptedMemorySeconds(preemptedMemorySeconds);
- attemptStateData.setPreemptedVcoreSeconds(preemptedVcoreSeconds);
+ attemptStateData.setMemorySeconds(RMServerUtils
+ .getOrDefault(resourceSecondsMap,
+ ResourceInformation.MEMORY_MB.getName(), 0L));
+ attemptStateData.setVcoreSeconds(RMServerUtils
+ .getOrDefault(resourceSecondsMap, ResourceInformation.VCORES.getName(),
+ 0L));
+ attemptStateData.setPreemptedMemorySeconds(RMServerUtils
+ .getOrDefault(preemptedResourceSecondsMap,
+ ResourceInformation.MEMORY_MB.getName(), 0L));
+ attemptStateData.setPreemptedVcoreSeconds(RMServerUtils
+ .getOrDefault(preemptedResourceSecondsMap,
+ ResourceInformation.VCORES.getName(), 0L));
+ attemptStateData.setResourceSecondsMap(resourceSecondsMap);
+ attemptStateData
+ .setPreemptedResourceSecondsMap(preemptedResourceSecondsMap);
return attemptStateData;
}
public static ApplicationAttemptStateData newInstance(
ApplicationAttemptId attemptId, Container masterContainer,
- Credentials attemptTokens, long startTime, long memorySeconds,
- long vcoreSeconds, long preemptedMemorySeconds,
- long preemptedVcoreSeconds) {
- return newInstance(attemptId, masterContainer, attemptTokens,
- startTime, null, "N/A", "", null, ContainerExitStatus.INVALID, 0,
- memorySeconds, vcoreSeconds,
- preemptedMemorySeconds, preemptedVcoreSeconds);
- }
+ Credentials attemptTokens, long startTime,
+ Map<String, Long> resourceSecondsMap,
+ Map<String, Long> preemptedResourceSecondsMap) {
+ return newInstance(attemptId, masterContainer, attemptTokens, startTime,
+ null, "N/A", "", null, ContainerExitStatus.INVALID, 0,
+ resourceSecondsMap, preemptedResourceSecondsMap);
+ }
public abstract ApplicationAttemptStateDataProto getProto();
@@ -215,4 +230,50 @@ public abstract void setFinalApplicationStatus(
@Public
@Unstable
public abstract void setPreemptedVcoreSeconds(long vcoreSeconds);
+
+ /**
+ * Get the aggregated number of resources that the application has
+ * allocated times the number of seconds the application has been running.
+ *
+ * @return map containing the resource name and aggregated
+ * resource-seconds
+ */
+ @Public
+ @Unstable
+ public abstract Map<String, Long> getResourceSecondsMap();
+
+ /**
+ * Set the aggregated number of resources that the application has
+ * allocated times the number of seconds the application has been running.
+ *
+ * @param resourceSecondsMap map containing the resource name and aggregated
+ * resource-seconds
+ */
+ @Public
+ @Unstable
+ public abstract void setResourceSecondsMap(
+ Map