diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java
index c39e52ede73..0f26106f231 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java
@@ -20,8 +20,11 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.AbstractService;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
import org.slf4j.Logger;
@@ -46,6 +49,10 @@
/** Resource calculator. */
private ResourceCalculatorPlugin resourceCalculatorPlugin;
+ /** GPU resource plugin; may be null, in which case the update handler below is never assigned. */
+ private GpuResourcePlugin gpuResourcePlugin;
+ private GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler;
+
/** Current resource utilization of the node. */
private ResourceUtilization nodeUtilization =
ResourceUtilization.newInstance(0, 0, 0f);
@@ -72,6 +79,15 @@ protected void serviceInit(Configuration conf) throws Exception {
this.resourceCalculatorPlugin =
ResourceCalculatorPlugin.getNodeResourceMonitorPlugin(conf);
+ this.gpuResourcePlugin = (GpuResourcePlugin)nmContext.getResourcePluginManager().
+ getNameToPlugins().get(ResourceInformation.GPU_URI);
+
+ if (gpuResourcePlugin != null) {
+ this.gpuNodeResourceUpdateHandler =
+ (GpuNodeResourceUpdateHandler)gpuResourcePlugin.
+ getNodeResourceHandlerInstance();
+ }
+
LOG.info(" Using ResourceCalculatorPlugin : "
+ this.resourceCalculatorPlugin);
}
@@ -152,6 +168,14 @@ public void run() {
(int) (vmem >> 20), // B -> MB
vcores); // Used Virtual Cores
+ float nodeGpuUtilization = 0F;
+ try {
+ nodeGpuUtilization =
+ gpuNodeResourceUpdateHandler.getNodeGPUUtilization();
+ } catch (Exception e) {
+ LOG.error("Get Node GPU Utilization error: " + e);
+ }
+
// Publish the node utilization metrics to node manager
// metrics system.
NodeManagerMetrics nmMetrics = nmContext.getNodeManagerMetrics();
@@ -159,6 +183,7 @@ public void run() {
nmMetrics.setNodeUsedMemGB(nodeUtilization.getPhysicalMemory());
nmMetrics.setNodeUsedVMemGB(nodeUtilization.getVirtualMemory());
nmMetrics.setNodeCpuUtilization(nodeUtilization.getCPU());
+ nmMetrics.setNodeGpuUtilization(nodeGpuUtilization);
}
try {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
index afb0d7eda23..0cc46b73d41 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -26,6 +26,7 @@
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -76,4 +77,16 @@ public void updateConfiguredResource(Resource res) throws YarnException {
res.setResourceValue(GPU_URI, nUsableGpus);
}
+
+ /**
+  * Computes the average overall GPU utilization across the GPUs reported
+  * by the GPU discoverer.
+  *
+  * @return the arithmetic mean of every reported GPU's overall utilization,
+  *         or {@code 0} when no GPU is reported
+  * @throws Exception propagated from the GPU device information lookup
+  */
+ public float getNodeGPUUtilization() throws Exception {
+   List<PerGpuDeviceInformation> gpuList =
+       gpuDiscoverer.getGpuDeviceInformation().getGpus();
+   // Guard the division below: with no devices the float division 0/0
+   // would publish NaN, and a null list would fail in the loop.
+   if (gpuList == null || gpuList.isEmpty()) {
+     return 0F;
+   }
+   // A primitive accumulator avoids re-boxing a Float on every iteration.
+   float totalGpuUtilization = 0F;
+   for (PerGpuDeviceInformation gpu : gpuList) {
+     totalGpuUtilization +=
+         gpu.getGpuUtilizations().getOverallGpuUtilization();
+   }
+   return totalGpuUtilization / gpuList.size();
+ }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
index abe45298168..848b9445289 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
@@ -98,6 +98,8 @@
MutableGaugeInt nodeUsedVMemGB;
@Metric("Current CPU utilization")
MutableGaugeFloat nodeCpuUtilization;
+ @Metric("Current GPU utilization")
+ MutableGaugeFloat nodeGpuUtilization;
@Metric("Missed localization requests in bytes")
MutableCounterLong localizedCacheMissBytes;
@@ -428,6 +430,14 @@ public void setNodeCpuUtilization(float cpuUtilization) {
this.nodeCpuUtilization.set(cpuUtilization);
}
+ /**
+  * Stores the given value in the {@code nodeGpuUtilization} gauge, which is
+  * registered as the "Current GPU utilization" metric.
+  *
+  * @param nodeGpuUtilization value to publish, passed to the gauge unchanged
+  */
+ public void setNodeGpuUtilization(float nodeGpuUtilization) {
+   MutableGaugeFloat gpuGauge = this.nodeGpuUtilization;
+   gpuGauge.set(nodeGpuUtilization);
+ }
+
+ /**
+  * Reads the current value of the {@code nodeGpuUtilization} gauge.
+  *
+  * @return the value most recently held by the gauge
+  */
+ public float getNodeGpuUtilization() {
+   final float currentValue = this.nodeGpuUtilization.value();
+   return currentValue;
+ }
+
private void updateLocalizationHitRatios() {
updateLocalizationHitRatio(localizedCacheHitBytes, localizedCacheMissBytes,
localizedCacheHitBytesRatio);