diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java index c39e52ede73..0f26106f231 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java @@ -20,8 +20,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.api.records.ResourceUtilization; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin; import org.slf4j.Logger; @@ -46,6 +49,10 @@ /** Resource calculator. */ private ResourceCalculatorPlugin resourceCalculatorPlugin; + /** Gpu related plugin. */ + private GpuResourcePlugin gpuResourcePlugin; + private GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler; + /** Current resource utilization of the node. */ private ResourceUtilization nodeUtilization = ResourceUtilization.newInstance(0, 0, 0f); @@ -72,6 +79,15 @@ protected void serviceInit(Configuration conf) throws Exception { this.resourceCalculatorPlugin = ResourceCalculatorPlugin.getNodeResourceMonitorPlugin(conf); + this.gpuResourcePlugin = (GpuResourcePlugin)nmContext.getResourcePluginManager(). + getNameToPlugins().get(ResourceInformation.GPU_URI); + + if (gpuResourcePlugin != null) { + this.gpuNodeResourceUpdateHandler = + (GpuNodeResourceUpdateHandler)gpuResourcePlugin. + getNodeResourceHandlerInstance(); + } + LOG.info(" Using ResourceCalculatorPlugin : " + this.resourceCalculatorPlugin); } @@ -152,6 +168,14 @@ public void run() { (int) (vmem >> 20), // B -> MB vcores); // Used Virtual Cores + float nodeGpuUtilization = 0F; + try { + nodeGpuUtilization = + gpuNodeResourceUpdateHandler.getNodeGPUUtilization(); + } catch (Exception e) { + LOG.error("Get Node GPU Utilization error: " + e); + } + // Publish the node utilization metrics to node manager // metrics system. NodeManagerMetrics nmMetrics = nmContext.getNodeManagerMetrics(); @@ -159,6 +183,7 @@ public void run() { nmMetrics.setNodeUsedMemGB(nodeUtilization.getPhysicalMemory()); nmMetrics.setNodeUsedVMemGB(nodeUtilization.getVirtualMemory()); nmMetrics.setNodeCpuUtilization(nodeUtilization.getCPU()); + nmMetrics.setNodeGpuUtilization(nodeGpuUtilization); } try { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index afb0d7eda23..0cc46b73d41 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -26,6 +26,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -76,4 +77,16 @@ public void updateConfiguredResource(Resource res) throws YarnException { res.setResourceValue(GPU_URI, nUsableGpus); } + + public float getNodeGPUUtilization() throws Exception{ + List gpuList = + gpuDiscoverer.getGpuDeviceInformation().getGpus(); + Float totalGpuUtilization = 0F; + for (PerGpuDeviceInformation gpu : gpuList) { + totalGpuUtilization += + gpu.getGpuUtilizations().getOverallGpuUtilization(); + } + totalGpuUtilization = totalGpuUtilization / gpuList.size(); + return totalGpuUtilization; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java index abe45298168..848b9445289 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java @@ -98,6 +98,8 @@ MutableGaugeInt nodeUsedVMemGB; @Metric("Current CPU utilization") MutableGaugeFloat nodeCpuUtilization; + @Metric("Current GPU utilization") + MutableGaugeFloat nodeGpuUtilization; @Metric("Missed localization requests in bytes") MutableCounterLong localizedCacheMissBytes; @@ -428,6 +430,14 @@ public void setNodeCpuUtilization(float cpuUtilization) { this.nodeCpuUtilization.set(cpuUtilization); } + public void setNodeGpuUtilization(float nodeGpuUtilization) { + this.nodeGpuUtilization.set(nodeGpuUtilization); + } + + public float getNodeGpuUtilization() { + return nodeGpuUtilization.value(); + } + private void updateLocalizationHitRatios() { updateLocalizationHitRatio(localizedCacheHitBytes, localizedCacheMissBytes, localizedCacheHitBytesRatio);