diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index fe7379a5abe..581bc4c37d7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -347,6 +347,11 @@ public SchedulerHealth getSchedulerHealth() { return this.schedulerHealth; } + @VisibleForTesting + public void setSchedulerHealth(SchedulerHealth schedulerHealth) { + this.schedulerHealth = schedulerHealth; + } + protected void setLastNodeUpdateTime(long time) { this.lastNodeUpdateTime = time; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java index d6fb544ee6b..bc52ad00a07 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.Set; +import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; @@ -172,6 +173,12 @@ void containerCompleted(RMContainer rmContainer, this.attemptResourceUsage.decUsed(containerResource); getQueue().decUsedResource(containerResource); + scheduler.getSchedulerHealth().updateRelease( + scheduler.getClock().getTime(), rmContainer.getNodeId(), + rmContainer.getContainerId(), rmContainer.getQueueName() + ); + scheduler.getSchedulerHealth().updateSchedulerReleaseCounts(1); + // Clear resource utilization metrics cache. lastMemoryAggregateAllocationUpdateTime = -1; } finally { @@ -463,6 +470,12 @@ public RMContainer allocate(NodeType type, FSSchedulerNode node, // Update resource requests related to "request" and store in RMContainer ((RMContainerImpl) rmContainer).setContainerRequest(containerRequest); + // Update scheduler health metrics for container allocation + scheduler.getSchedulerHealth().updateAllocation( + scheduler.getClock().getTime(), rmContainer.getNodeId(), + rmContainer.getContainerId(), queue.getQueueName()); + scheduler.getSchedulerHealth().updateSchedulerAllocationCounts(1); + // Inform the container rmContainer.handle( new RMContainerEvent(container.getId(), RMContainerEventType.START)); @@ -687,8 +700,9 @@ public synchronized void recoverContainer(SchedulerNode node, * in {@link FSSchedulerNode}.. * return whether reservation was possible with the current threshold limits */ - private boolean reserve(Resource perAllocationResource, FSSchedulerNode node, - Container reservedContainer, NodeType type, + @VisibleForTesting + protected boolean reserve(Resource perAllocationResource, + FSSchedulerNode node, Container reservedContainer, NodeType type, SchedulerRequestKey schedulerKey) { RMContainer nodeReservedContainer = node.getReservedContainer(); @@ -708,11 +722,22 @@ private boolean reserve(Resource perAllocationResource, FSSchedulerNode node, super.reserve(node, schedulerKey, null, reservedContainer); node.reserveResource(this, schedulerKey, rmContainer); setReservation(node); + scheduler.getSchedulerHealth().updateReservation( + scheduler.getClock().getTime(), node.getNodeID(), + reservedContainer.getId(), getQueue().getName() + ); + scheduler.getSchedulerHealth().updateSchedulerReservationCounts(1); + } else { RMContainer rmContainer = node.getReservedContainer(); super.reserve(node, schedulerKey, rmContainer, reservedContainer); node.reserveResource(this, schedulerKey, rmContainer); setReservation(node); + scheduler.getSchedulerHealth().updateReservation( + scheduler.getClock().getTime(), node.getNodeID(), + reservedContainer.getId(), getQueue().getName() + ); + scheduler.getSchedulerHealth().updateSchedulerReservationCounts(1); } return true; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java index e664725af31..c86b973f03a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java @@ -273,6 +273,12 @@ public void run() { ContainerStatus status = SchedulerUtils.createPreemptedContainerStatus( container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER); + // Updates scheduler health metrcis for preemption + scheduler.getSchedulerHealth().updatePreemption( + scheduler.getClock().getTime(), container.getNodeId(), + container.getContainerId(), container.getQueueName()); + scheduler.getSchedulerHealth().updateSchedulerPreemptionCounts(1); + LOG.info("Killing container " + container); scheduler.completedContainer( container, status, RMContainerEventType.KILL); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 151a7ab0867..b1f5c4c440e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -730,6 +730,8 @@ protected void completedContainerInternal( + " released container " + container.getId() + " on node: " + (node == null ? nodeID : node) + " with event: " + event); } + schedulerHealth.updateRelease(getClock().getTime(), nodeID, + container.getId(), application.getQueue().getName()); } finally { writeLock.unlock(); } @@ -1112,6 +1114,7 @@ void attemptScheduling(FSSchedulerNode node) { boolean validReservation = false; if (reservedAppSchedulable != null) { validReservation = reservedAppSchedulable.assignReservedContainer(node); + schedulerHealth.updateSchedulerFulfilledReservationCounts(1); } if (!validReservation) { // No reservation, schedule at queue which is farthest below fair share @@ -1135,6 +1138,10 @@ void attemptScheduling(FSSchedulerNode node) { break; } } + + // TODO: Reserved resource + schedulerHealth.updateSchedulerRunDetails(getClock().getTime(), + assignedResource, Resources.none()); } updateRootQueueMetrics(); } finally { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/FairSchedulerPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/FairSchedulerPage.java index 7f31defa066..e4de6c8c555 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/FairSchedulerPage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/FairSchedulerPage.java @@ -21,14 +21,18 @@ import static org.apache.hadoop.yarn.util.StringHelper.join; import java.util.Collection; +import java.util.HashMap; +import java.util.Map; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerLeafQueueInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerQueueInfo; import org.apache.hadoop.yarn.server.webapp.WebPageUtils; +import org.apache.hadoop.yarn.util.Times; import org.apache.hadoop.yarn.webapp.ResponseInfo; import org.apache.hadoop.yarn.webapp.SubView; import org.apache.hadoop.yarn.webapp.hamlet2.Hamlet; @@ -171,6 +175,117 @@ public void render(Block html) { ul.__(); } } + + /** + * Scheduler health block + */ + public static class HealthBlock extends HtmlBlock { + + private final FairScheduler fs; + + @Inject + HealthBlock(ResourceManager rm) { + fs = (FairScheduler) rm.getResourceScheduler(); + } + + @Override + public void render(HtmlBlock.Block html) { + SchedulerHealth healthInfo = fs.getSchedulerHealth(); + UL>> ul = html. + div("#fs-health-wrapper.ui-widget"). + div(".ui-widget-header.ui-corner-top"). + __("Fair scheduler health metrics ▼").__(). + div("#fs-health.ui-widget-content.ui-corner-bottom"). + ul(); + + DIV div = html.div("#health"); + div.h4("Aggregate scheduler counts"); + Hamlet.TBODY>> tbody = + div.table("#lastrun").thead().$class("ui-widget-header") + .tr().th() + .$class("ui-state-default") + .__("Total Container Allocations" + + "(count)") + .__().th().$class("ui-state-default") + .__("Total Container Releases(count)").__().th() + .$class("ui-state-default") + .__("Total Fulfilled Reservations(count)").__().th() + .$class("ui-state-default") + .__("Total Container Preemptions(count)") + .__().__().__().tbody(); + tbody + .$class("ui-widget-content") + .tr() + .td( + String.valueOf(fs.getRootQueueMetrics() + .getAggregateAllocatedContainers())) + .td( + String.valueOf(fs.getRootQueueMetrics() + .getAggegatedReleasedContainers())) + .td(healthInfo.getAggregateFulFilledReservationsCount().toString()) + .td(healthInfo.getAggregatePreemptionCount().toString()).__().__().__(); + div.h4("Last scheduler run"); + tbody = + div.table("#lastrun").thead().$class("ui-widget-header").tr() + .th() + .$class("ui-state-default").__("Time").__().th() + .$class("ui-state-default").__("Allocations(count - resources)") + .__() + .th().$class("ui-state-default") + .__().th().$class("ui-state-default") + .__("Releases(count - resources)") + .__().__().__().tbody(); + tbody + .$class("ui-widget-content") + .tr() + .td(Times.format(healthInfo.getLastSchedulerRunTime())) + .td( + healthInfo.getAllocationCount().toString() + " - " + + healthInfo.getResourcesAllocated().toString()) + .td( + healthInfo.getReservationCount().toString() + " - " + + healthInfo.getResourcesReserved().toString()) + .td( + healthInfo.getReleaseCount().toString() + " - " + + healthInfo.getResourcesReleased().toString()) + .__().__().__(); + Map info = new HashMap<>(); + info.put("Allocation", healthInfo.getLastAllocationDetails()); + info.put("Reservation", healthInfo.getLastReservationDetails()); + info.put("Release", healthInfo.getLastReleaseDetails()); + info.put("Preemption", healthInfo.getLastPreemptionDetails()); + + for (Map.Entry entry : info + .entrySet()) { + String containerId = "N/A"; + String nodeId = "N/A"; + String queue = "N/A"; + String table = "#" + entry.getKey(); + div.h4("Last " + entry.getKey()); + tbody = + div.table(table).thead().$class("ui-widget-header").tr().th() + .$class("ui-state-default").__("Time").__().th() + .$class("ui-state-default").__("Container Id").__().th() + .$class("ui-state-default").__("Node Id").__().th() + .$class("ui-state-default").__("Queue").__().__().__().tbody(); + SchedulerHealth.DetailedInformation di = entry.getValue(); + if (di.getTimestamp() != 0) { + if (di.getContainerId() != null) { + containerId = di.getContainerId().toString(); + } + if (di.getNodeId() != null) { + nodeId = di.getNodeId().toString(); + } + queue = di.getQueue(); + } + tbody.$class("ui-widget-content").tr() + .td(Times.format(di.getTimestamp())).td(containerId).td(nodeId) + .td(queue).__().__().__(); + } + div.__(); + ul.__().__().__(); + } + } static class QueuesBlock extends HtmlBlock { final FairScheduler fs; @@ -184,6 +299,7 @@ public void render(Block html) { @Override public void render(Block html) { html.__(MetricsOverviewTable.class); + html.__(HealthBlock.class); UL>> ul = html. div("#cs-wrapper.ui-widget"). div(".ui-widget-header.ui-corner-top"). @@ -238,18 +354,25 @@ public void render(Block html) { @Override protected void postHead(Page.HTML<__> html) { html. style().$type("text/css"). - __("#cs { padding: 0.5em 0 1em 0; margin-bottom: 1em; position: relative }", + __( + "#cs { padding: 0.5em 0 1em 0; margin-bottom: 1em; " + + "position: relative }", "#cs ul { list-style: none }", "#cs a { font-weight: normal; margin: 2px; position: relative }", "#cs a span { font-weight: normal; font-size: 80% }", "#cs-wrapper .ui-widget-header { padding: 0.2em 0.5em }", + "#fs-health-wrapper {margin-bottom: 1em }", + "#fs-health {padding: 0.2em }", + "#fs-health-wrapper .ui-widget-header { padding: 0.2em 0.5em; " + + "cursor: pointer; }", ".qstats { font-weight: normal; font-size: 80%; position: absolute }", ".qlegend { font-weight: normal; padding: 0 1em; margin: 1em }", "table.info tr th {width: 50%}").__(). // to center info table script("/static/jt/jquery.jstree.js"). script().$type("text/javascript"). __("$(function() {", - " $('#cs a span').addClass('ui-corner-all').css('position', 'absolute');", + " $('#cs a span').addClass('ui-corner-all').css('position', " + + "'absolute');", " $('#cs').bind('loaded.jstree', function (e, data) {", " var callback = { call:reopenQueryNodes }", " data.inst.open_node('#pq', callback);", @@ -268,6 +391,18 @@ public void render(Block html) { " $('#apps').dataTable().fnFilter(q, 4, true);", " });", " $('#cs').show();", + " if('health' != window.location.hash.substring(1)) { " + + "$('#fs-health').hide();}", + " $(\"#fs-health-wrapper > div.ui-widget-header.ui-corner-top\")" + + ".click(", + " function() {", + " $(\"#fs-health\").slideToggle('slow',", + " function() {", + " window.location.hash = ($(\"#fs-health\").is", + "(\":visible\"))?\"health\":\"\";", + " });", + " }", + " ); ", "});").__(). __(SchedulerPageUtil.QueueBlockUtil.class); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFSAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFSAppAttempt.java index 6d11898cd53..32997f739d1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFSAppAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFSAppAttempt.java @@ -25,7 +25,7 @@ import java.util.concurrent.ConcurrentMap; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.*; import org.apache.hadoop.yarn.server.resourcemanager.MockNodes; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import static org.junit.Assert.assertEquals; @@ -34,6 +34,7 @@ import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; @@ -46,6 +47,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth; import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity @@ -57,6 +59,7 @@ import org.apache.hadoop.yarn.util.ControlledClock; import org.apache.hadoop.yarn.util.resource.Resources; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.mockito.Mockito; @@ -169,7 +172,6 @@ public void testDelaySchedulingForContinuousScheduling() // manually set back to node local schedulerApp.resetAllowedLocalityLevel(prio, NodeType.NODE_LOCAL); - schedulerApp.resetSchedulingOpportunities(prio, clock.getTime()); assertEquals(NodeType.NODE_LOCAL, schedulerApp.getAllowedLocalityLevelByTime(prio, nodeLocalityDelayMs, rackLocalityDelayMs, clock.getTime())); @@ -384,6 +386,37 @@ public void testNoNextPendingAsk() { assertEquals(Resources.none(), resource); } + @Test + public void testHealthMetric() { + //SETUP + FSLeafQueue queue = Mockito.mock(FSLeafQueue.class); + FSQueueMetrics queueMetrics = Mockito.mock(FSQueueMetrics.class); + when(queue.getMetrics()).thenReturn(queueMetrics); + + ApplicationAttemptId applicationAttemptId = createAppAttemptId(1, 1); + RMContext rmContext = resourceManager.getRMContext(); + FSAppAttempt schedulerApp = + new FSAppAttempt(scheduler, applicationAttemptId, "user1", queue, + null, rmContext); + scheduler.setSchedulerHealth(new SchedulerHealth()); + Resource perAllocationResource = Resource.newInstance(1024, 1); + + FSSchedulerNode node = Mockito.mock(FSSchedulerNode.class); + RMNode rmNode = Mockito.mock(RMNode.class); + Mockito.when(rmNode.getNodeID()).thenReturn(NodeId.newInstance("host", + 1234)); + Mockito.when(node.getRMNode()).thenReturn(rmNode); + SchedulerRequestKey schedulerKey = Mockito.mock(SchedulerRequestKey.class); + + //ACT + schedulerApp.reserve(perAllocationResource, node, null, + NodeType.NODE_LOCAL, schedulerKey); + + //ASSERT + Assert.assertEquals(Long.valueOf(1), + scheduler.getSchedulerHealth().getReservationCount()); + } + private static long min(long value1, long value2, long value3) { return Math.min(Math.min(value1, value2), value3); }