Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFSAppAttempt.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFSAppAttempt.java (date 1540538396000) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFSAppAttempt.java (date 1542962249000) @@ -22,7 +22,7 @@ import java.util.List; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.*; import org.apache.hadoop.yarn.server.resourcemanager.MockNodes; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import static org.junit.Assert.assertEquals; @@ -31,13 +31,15 @@ import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; -import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; -import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PendingAsk; import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity @@ -49,6 +51,7 @@ import org.apache.hadoop.yarn.util.ControlledClock; import org.apache.hadoop.yarn.util.resource.Resources; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.mockito.Mockito; @@ -161,7 +164,6 @@ // manually set back to node local schedulerApp.resetAllowedLocalityLevel(prio, NodeType.NODE_LOCAL); - schedulerApp.resetSchedulingOpportunities(prio, clock.getTime()); assertEquals(NodeType.NODE_LOCAL, schedulerApp.getAllowedLocalityLevelByTime(prio, nodeLocalityDelayMs, rackLocalityDelayMs, clock.getTime())); @@ -341,6 +343,37 @@ assertEquals(clusterResource, spyApp.getHeadroom()); } + @Test + public void testHealthMetric() { + //SETUP + FSLeafQueue queue = Mockito.mock(FSLeafQueue.class); + FSQueueMetrics queueMetrics = Mockito.mock(FSQueueMetrics.class); + when(queue.getMetrics()).thenReturn(queueMetrics); + + ApplicationAttemptId applicationAttemptId = createAppAttemptId(1, 1); + RMContext rmContext = resourceManager.getRMContext(); + FSAppAttempt schedulerApp = + new FSAppAttempt(scheduler, applicationAttemptId, "user1", queue, + null, rmContext); + scheduler.setSchedulerHealth(new SchedulerHealth()); + Resource perAllocationResource = Resource.newInstance(1024,1); + + FSSchedulerNode node = Mockito.mock(FSSchedulerNode.class); + RMNode rmNode = Mockito.mock(RMNode.class); + Mockito.when(rmNode.getNodeID()).thenReturn(NodeId.newInstance("host", + 1234)); + Mockito.when(node.getRMNode()).thenReturn(rmNode); + SchedulerRequestKey schedulerKey = Mockito.mock(SchedulerRequestKey.class); + + //ACT + schedulerApp.reserve(perAllocationResource, node, null, + NodeType.NODE_LOCAL, schedulerKey); + + //ASSERT + Assert.assertEquals(Long.valueOf(1), + scheduler.getSchedulerHealth().getReservationCount()); + } + private static long min(long value1, long value2, long value3) { return Math.min(Math.min(value1, value2), value3); } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java (date 1540538396000) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java (date 1542120675000) @@ -253,6 +253,12 @@ ContainerStatus status = SchedulerUtils.createPreemptedContainerStatus( container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER); + // Updates scheduler health metrcis for preemption + scheduler.getSchedulerHealth().updatePreemption( + scheduler.getClock().getTime(), container.getNodeId(), + container.getContainerId(), container.getQueueName()); + scheduler.getSchedulerHealth().updateSchedulerPreemptionCounts(1); + LOG.info("Killing container " + container); scheduler.completedContainer( container, status, RMContainerEventType.KILL); Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java (date 1540538396000) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java (date 1542617415000) @@ -28,6 +28,7 @@ import java.util.Map; import java.util.Set; +import com.google.common.annotations.VisibleForTesting; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; @@ -171,6 +172,9 @@ this.attemptResourceUsage.decUsed(containerResource); getQueue().decUsedResource(containerResource); + scheduler.getSchedulerHealth().updateRelease(scheduler.getClock().getTime(), rmContainer.getNodeId(), rmContainer.getContainerId(), rmContainer.getQueueName()); + scheduler.getSchedulerHealth().updateSchedulerReleaseCounts(1); + // Clear resource utilization metrics cache. lastMemoryAggregateAllocationUpdateTime = -1; } finally { @@ -468,6 +472,12 @@ // Update resource requests related to "request" and store in RMContainer ((RMContainerImpl) rmContainer).setContainerRequest(containerRequest); + // Update scheduler health metrics for container allocation + scheduler.getSchedulerHealth().updateAllocation( + scheduler.getClock().getTime(), rmContainer.getNodeId(), + rmContainer.getContainerId(), queue.getQueueName()); + scheduler.getSchedulerHealth().updateSchedulerAllocationCounts(1); + // Inform the container rmContainer.handle( new RMContainerEvent(container.getId(), RMContainerEventType.START)); @@ -693,7 +703,8 @@ * in {@link FSSchedulerNode}.. * return whether reservation was possible with the current threshold limits */ - private boolean reserve(Resource perAllocationResource, FSSchedulerNode node, + @VisibleForTesting + protected boolean reserve(Resource perAllocationResource, FSSchedulerNode node, Container reservedContainer, NodeType type, SchedulerRequestKey schedulerKey) { @@ -714,11 +725,22 @@ super.reserve(node, schedulerKey, null, reservedContainer); node.reserveResource(this, schedulerKey, rmContainer); setReservation(node); + scheduler.getSchedulerHealth().updateReservation( + scheduler.getClock().getTime(), node.getNodeID(), + reservedContainer.getId(), getQueue().getName() + ); + scheduler.getSchedulerHealth().updateSchedulerReservationCounts(1); + } else { RMContainer rmContainer = node.getReservedContainer(); super.reserve(node, schedulerKey, rmContainer, reservedContainer); node.reserveResource(this, schedulerKey, rmContainer); setReservation(node); + scheduler.getSchedulerHealth().updateReservation( + scheduler.getClock().getTime(), node.getNodeID(), + reservedContainer.getId(), getQueue().getName() + ); + scheduler.getSchedulerHealth().updateSchedulerReservationCounts(1); } return true; } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java (date 1540538396000) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java (date 1542617720000) @@ -332,6 +332,11 @@ return this.schedulerHealth; } + @VisibleForTesting + public void setSchedulerHealth(SchedulerHealth schedulerHealth) { + this.schedulerHealth = schedulerHealth; + } + protected void setLastNodeUpdateTime(long time) { this.lastNodeUpdateTime = time; } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java (date 1540538396000) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java (date 1542120675000) @@ -28,6 +28,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; +import org.apache.hadoop.util.Time; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; @@ -43,6 +44,7 @@ import org.apache.hadoop.yarn.api.records.ResourceOption; import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.SchedulingRequest; +import org.apache.hadoop.yarn.api.records.impl.pb.NodeIdPBImpl; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions .SchedulerInvalidResoureRequestException; @@ -78,6 +80,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils.MaxResourceValidationResult; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAssignment; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.AssignmentInformation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; @@ -764,6 +768,8 @@ + " released container " + container.getId() + " on node: " + (node == null ? nodeID : node) + " with event: " + event); } + schedulerHealth.updateRelease(getClock().getTime(), nodeID, + container.getId(), application.getQueue().getName()); } finally { writeLock.unlock(); } @@ -1146,6 +1152,7 @@ boolean validReservation = false; if (reservedAppSchedulable != null) { validReservation = reservedAppSchedulable.assignReservedContainer(node); + schedulerHealth.updateSchedulerFulfilledReservationCounts(1); } if (!validReservation) { // No reservation, schedule at queue which is farthest below fair share @@ -1171,6 +1178,10 @@ break; } } + + // TODO: Reserved resource + schedulerHealth.updateSchedulerRunDetails(getClock().getTime(), + assignedResource, Resources.none()); } updateRootQueueMetrics(); } finally { Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/FairSchedulerPage.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/FairSchedulerPage.java (date 1540538396000) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/FairSchedulerPage.java (date 1542120675000) @@ -21,14 +21,18 @@ import static org.apache.hadoop.yarn.util.StringHelper.join; import java.util.Collection; +import java.util.HashMap; +import java.util.Map; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerLeafQueueInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.FairSchedulerQueueInfo; import org.apache.hadoop.yarn.server.webapp.WebPageUtils; +import org.apache.hadoop.yarn.util.Times; import org.apache.hadoop.yarn.webapp.ResponseInfo; import org.apache.hadoop.yarn.webapp.SubView; import org.apache.hadoop.yarn.webapp.hamlet2.Hamlet; @@ -171,6 +175,106 @@ ul.__(); } } + + public static class HealthBlock extends HtmlBlock { + + final FairScheduler fs; + + @Inject + HealthBlock(ResourceManager rm) { + fs = (FairScheduler) rm.getResourceScheduler(); + } + + @Override + public void render(HtmlBlock.Block html) { + SchedulerHealth healthInfo = fs.getSchedulerHealth(); + UL>> ul = html. + div("#fs-health-wrapper.ui-widget"). + div(".ui-widget-header.ui-corner-top"). + __("Fair scheduler health metrics ▼").__(). + div("#fs-health.ui-widget-content.ui-corner-bottom"). + ul(); + + DIV div = html.div("#health"); + div.h4("Aggregate scheduler counts"); + Hamlet.TBODY>> tbody = + div.table("#lastrun").thead().$class("ui-widget-header").tr().th() + .$class("ui-state-default").__("Total Container Allocations(count)") + .__().th().$class("ui-state-default") + .__("Total Container Releases(count)").__().th() + .$class("ui-state-default") + .__("Total Fulfilled Reservations(count)").__().th() + .$class("ui-state-default").__("Total Container Preemptions(count)") + .__().__().__().tbody(); + tbody + .$class("ui-widget-content") + .tr() + .td( + String.valueOf(fs.getRootQueueMetrics() + .getAggregateAllocatedContainers())) + .td( + String.valueOf(fs.getRootQueueMetrics() + .getAggegatedReleasedContainers())) + .td(healthInfo.getAggregateFulFilledReservationsCount().toString()) + .td(healthInfo.getAggregatePreemptionCount().toString()).__().__().__(); + div.h4("Last scheduler run"); + tbody = + div.table("#lastrun").thead().$class("ui-widget-header").tr().th() + .$class("ui-state-default").__("Time").__().th() + .$class("ui-state-default").__("Allocations(count - resources)").__() + .th().$class("ui-state-default").__("Reservations(count - resources)") + .__().th().$class("ui-state-default").__("Releases(count - resources)") + .__().__().__().tbody(); + tbody + .$class("ui-widget-content") + .tr() + .td(Times.format(healthInfo.getLastSchedulerRunTime())) + .td( + healthInfo.getAllocationCount().toString() + " - " + + healthInfo.getResourcesAllocated().toString()) + .td( + healthInfo.getReservationCount().toString() + " - " + + healthInfo.getResourcesReserved().toString()) + .td( + healthInfo.getReleaseCount().toString() + " - " + + healthInfo.getResourcesReleased().toString()).__().__().__(); + Map info = new HashMap<>(); + info.put("Allocation", healthInfo.getLastAllocationDetails()); + info.put("Reservation", healthInfo.getLastReservationDetails()); + info.put("Release", healthInfo.getLastReleaseDetails()); + info.put("Preemption", healthInfo.getLastPreemptionDetails()); + + for (Map.Entry entry : info + .entrySet()) { + String containerId = "N/A"; + String nodeId = "N/A"; + String queue = "N/A"; + String table = "#" + entry.getKey(); + div.h4("Last " + entry.getKey()); + tbody = + div.table(table).thead().$class("ui-widget-header").tr().th() + .$class("ui-state-default").__("Time").__().th() + .$class("ui-state-default").__("Container Id").__().th() + .$class("ui-state-default").__("Node Id").__().th() + .$class("ui-state-default").__("Queue").__().__().__().tbody(); + SchedulerHealth.DetailedInformation di = entry.getValue(); + if (di.getTimestamp() != 0) { + if (di.getContainerId() != null) { + containerId = di.getContainerId().toString(); + } + if (di.getNodeId() != null) { + nodeId = di.getNodeId().toString(); + } + queue = di.getQueue(); + } + tbody.$class("ui-widget-content").tr() + .td(Times.format(di.getTimestamp())).td(containerId).td(nodeId) + .td(queue).__().__().__(); + } + div.__(); + ul.__().__().__(); + } + } static class QueuesBlock extends HtmlBlock { final FairScheduler fs; @@ -184,6 +288,7 @@ @Override public void render(Block html) { html.__(MetricsOverviewTable.class); + html.__(HealthBlock.class); UL>> ul = html. div("#cs-wrapper.ui-widget"). div(".ui-widget-header.ui-corner-top"). @@ -243,6 +348,10 @@ "#cs a { font-weight: normal; margin: 2px; position: relative }", "#cs a span { font-weight: normal; font-size: 80% }", "#cs-wrapper .ui-widget-header { padding: 0.2em 0.5em }", + "#fs-health-wrapper {margin-bottom: 1em }", + "#fs-health {padding: 0.2em }", + "#fs-health-wrapper .ui-widget-header { padding: 0.2em 0.5em; " + + "cursor: pointer; }", ".qstats { font-weight: normal; font-size: 80%; position: absolute }", ".qlegend { font-weight: normal; padding: 0 1em; margin: 1em }", "table.info tr th {width: 50%}").__(). // to center info table @@ -268,6 +377,18 @@ " $('#apps').dataTable().fnFilter(q, 4, true);", " });", " $('#cs').show();", + " if('health' != window.location.hash.substring(1)) { " + + "$('#fs-health').hide();}", + " $(\"#fs-health-wrapper > div.ui-widget-header.ui-corner-top\")" + + ".click(", + " function() {", + " $(\"#fs-health\").slideToggle('slow',", + " function() {", + " window.location.hash = ($(\"#fs-health\").is", + "(\":visible\"))?\"health\":\"\";", + " });", + " }", + " ); ", "});").__(). __(SchedulerPageUtil.QueueBlockUtil.class); }