diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java index f51bae63dc6..11cdf150ddb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java @@ -154,4 +154,11 @@ public void setMetrics(EventTypeMetrics metrics) { this.metrics = metrics; } + protected long getEventProcessorId() { + return this.eventProcessor.getId(); + } + + protected boolean isStopped() { + return this.stopped; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java index 7fe5cc9703b..d46785cdf1c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java @@ -59,6 +59,10 @@ @Metric("Memory Capability") MutableGaugeLong capabilityMB; @Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores; @Metric("GPU Capability") MutableGaugeLong capabilityGPUs; + @Metric("RM Event Processor CPU Usage 60 second Avg") MutableGaugeLong + rmEventProcCPUAvg; + @Metric("RM Event Processor CPU Usage 60 second Max") MutableGaugeLong + rmEventProcCPUMax; private static final MetricsInfo RECORD_INFO = info("ClusterMetrics", "Metrics for the Yarn Cluster"); @@ -94,6 +98,19 @@ public synchronized static void destroy() { INSTANCE = null; } + //RM Event Processor CPU Usage + public long getRmEventProcCPUAvg() { + return rmEventProcCPUAvg.value(); + } + public void setRmEventProcCPUAvg(long value) { + rmEventProcCPUAvg.set(value); + } + public long getRmEventProcCPUMax() { + return rmEventProcCPUMax.value(); + } + public void setRmEventProcCPUMax(long value) { + rmEventProcCPUMax.set(value); + } //Active Nodemanagers public int getNumActiveNMs() { return numActiveNMs.value(); @@ -274,4 +291,4 @@ public long getUtilizedVirtualCores() { public void incrUtilizedVirtualCores(long delta) { utilizedVirtualCores.incr(delta); } -} \ No newline at end of file +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 6d2a9fed08b..f41cd270995 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -49,8 +49,9 @@ import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.ShutdownHookManager; import org.apache.hadoop.util.StringUtils; -import org.apache.hadoop.util.VersionInfo; +import org.apache.hadoop.util.Time; import org.apache.hadoop.util.curator.ZKCuratorManager; +import org.apache.hadoop.util.VersionInfo; import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; @@ -138,6 +139,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadMXBean; import java.net.InetSocketAddress; import java.net.URI; import java.net.URL; @@ -451,8 +454,8 @@ protected void setRMStateStore(RMStateStore rmStore) { } protected EventHandler createSchedulerEventDispatcher() { - EventDispatcher dispatcher = new - EventDispatcher(this.scheduler, "SchedulerEventDispatcher"); + EventDispatcher dispatcher = + new SchedulerEventDispatcher("SchedulerEventDispatcher"); dispatcher. setMetrics(GenericEventTypeMetricsManager. create(dispatcher.getName(), SchedulerEventType.class)); @@ -1018,7 +1021,90 @@ public void handle(RMFatalEvent event) { } } - /** + @Private + private class SchedulerEventDispatcher extends + EventDispatcher { + + private final Thread eventProcessorMonitor; + + SchedulerEventDispatcher(String name) { + super(scheduler, name); + this.eventProcessorMonitor = + new Thread(new EventProcessorMonitor(getEventProcessorId())); + this.eventProcessorMonitor + .setName("ResourceManager Event Processor Monitor"); + } + // EventProcessorMonitor keeps track of how much CPU the EventProcessor + // thread is using. It takes 1 second samples and then reports the Avg + // and Max of previous 60 seconds as cluster metrics. Units are + // usecs per second of CPU used. + // Avg is not accurate until 60 samples have been received. + private final class EventProcessorMonitor implements Runnable { + private final long tid; + private final boolean run; + private final ThreadMXBean tmxb; + private final ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics(); + EventProcessorMonitor(long id) { + this.tid = id; + this.tmxb = ManagementFactory.getThreadMXBean(); + if (clusterMetrics != null && + tmxb != null && tmxb.isThreadCpuTimeSupported()) { + this.run = true; + } else { + this.run = false; + } + } + public void run() { + final int samples = 60; + int index = 0; + long[] values = new long[samples]; + + while (run && !isStopped() && !Thread.currentThread().isInterrupted()) { + try { + long cpuBefore = tmxb.getThreadCpuTime(tid); + long wallClockBefore = Time.monotonicNow(); + Thread.sleep(1000); + long wallClockDelta = Time.monotonicNow() - wallClockBefore; + long cpuDelta = tmxb.getThreadCpuTime(tid) - cpuBefore; + + // Nanoseconds / Milliseconds = usec per second + values[index] = cpuDelta / wallClockDelta; + + index = (index + 1) % samples; + long max = 0; + long sum = 0; + for (int i = 0; i < samples; i++) { + sum += values[i]; + max = Math.max(max, values[i]); + } + clusterMetrics.setRmEventProcCPUAvg(sum / samples); + clusterMetrics.setRmEventProcCPUMax(max); + } catch (InterruptedException e) { + LOG.error("Returning, interrupted : " + e); + return; + } + } + } + } + @Override + protected void serviceStart() throws Exception { + super.serviceStart(); + this.eventProcessorMonitor.start(); + } + + @Override + protected void serviceStop() throws Exception { + super.serviceStop(); + this.eventProcessorMonitor.interrupt(); + try { + this.eventProcessorMonitor.join(); + } catch (InterruptedException e) { + throw new YarnRuntimeException(e); + } + } + } + + /** * Transition to standby state in a new thread. The transition operation is * asynchronous to avoid deadlock caused by cyclic dependency. */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java index 97e43e636ca..fec9347a053 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java @@ -215,8 +215,9 @@ protected void render(Block html) { th().$class("ui-state-default").__("Maximum Allocation").__(). th().$class("ui-state-default") .__("Maximum Cluster Application Priority").__(). - __(). - __(). + th().$class("ui-state-default").__("Scheduler Busy %").__(). + __(). + __(). tbody().$class("ui-widget-content"). tr(). td(String.valueOf(schedulerInfo.getSchedulerType())). @@ -225,8 +226,10 @@ protected void render(Block html) { td(schedulerInfo.getMinAllocation().toString()). td(schedulerInfo.getMaxAllocation().toString()). td(String.valueOf(schedulerInfo.getMaxClusterLevelAppPriority())). + td(String.valueOf(clusterMetrics.getRmSchedulerBusyPercent())). __(). - __().__(); + __(). + __(); div.__(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java index b66c4d997a7..d635792a07c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java @@ -57,6 +57,7 @@ private long totalVirtualCores; private int utilizedMBPercent; private int utilizedVirtualCoresPercent; + private int rmSchedulerBusyPercent; private int totalNodes; private int lostNodes; private int unhealthyNodes; @@ -143,7 +144,9 @@ public ClusterMetricsInfo(final ResourceScheduler rs) { this.utilizedVirtualCoresPercent = baseCores <= 0 ? 0 : (int) (clusterMetrics.getUtilizedVirtualCores() * 100 / baseCores); - + // Scheduler Busy is in usec per sec, so to get percent divide by 10^4 + this.rmSchedulerBusyPercent = + (int)(clusterMetrics.getRmEventProcCPUAvg() / 10000L); this.activeNodes = clusterMetrics.getNumActiveNMs(); this.lostNodes = clusterMetrics.getNumLostNMs(); this.unhealthyNodes = clusterMetrics.getUnhealthyNMs(); @@ -271,6 +274,10 @@ public int getUtilizedVirtualCoresPercent() { return utilizedVirtualCoresPercent; } + public int getRmSchedulerBusyPercent() { + return rmSchedulerBusyPercent; + } + public void setContainersReserved(int containersReserved) { this.containersReserved = containersReserved; } @@ -383,6 +390,10 @@ public void setUtilizedVirtualCoresPercent(int utilizedVirtualCoresPercent) { this.utilizedVirtualCoresPercent = utilizedVirtualCoresPercent; } + public void setRmSchedulerBusyPercent(int rmSchedulerBusyPercent) { + this.rmSchedulerBusyPercent = rmSchedulerBusyPercent; + } + public ResourceInfo getTotalClusterResourcesAcrossPartition() { return totalClusterResourcesAcrossPartition; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java index 9b79938d372..9ab6583b06c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java @@ -52,7 +52,7 @@ // Number of Actual Table Headers for NodesPage.NodesBlock might change in // future. In that case this value should be adjusted to the new value. - private final int numberOfThInMetricsTable = 22; + private final int numberOfThInMetricsTable = 23; private final int numberOfActualTableHeaders = 18; private final int numberOfThForOpportunisticContainers = 4; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java index 5785b141144..02094327f82 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java @@ -474,7 +474,7 @@ public void verifyClusterMetricsJSON(JSONObject json) throws JSONException, Exception { assertEquals("incorrect number of elements", 1, json.length()); JSONObject clusterinfo = json.getJSONObject("clusterMetrics"); - assertEquals("incorrect number of elements", 31, clusterinfo.length()); + assertEquals("incorrect number of elements", 32, clusterinfo.length()); verifyClusterMetrics( clusterinfo.getInt("appsSubmitted"), clusterinfo.getInt("appsCompleted"), clusterinfo.getInt("reservedMB"), clusterinfo.getInt("availableMB"),