From 668eec813c4e2154cdec1f450030cda7d7ee457a Mon Sep 17 00:00:00 2001 From: Prabhu Joseph Date: Mon, 27 Apr 2020 18:29:06 +0530 Subject: [PATCH] YARN-10154. Addendum Patch which fixes the bugs below: 1. RM fails to start when LeafQueueTemplate max capacity is not specified. 2. Job is stuck in ACCEPTED state with DominantResourceCalculator, as Queue Capacity is set to NaN during RM startup when clusterResource is zero. --- .../scheduler/capacity/ManagedParentQueue.java | 81 ++++++++++++++-------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ManagedParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ManagedParentQueue.java index 2e0e4dd..e47e0c0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ManagedParentQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ManagedParentQueue.java @@ -192,40 +192,55 @@ private void reinitializeQueueManagementPolicy() throws IOException { * */ if (this.capacityConfigType.equals(CapacityConfigType.ABSOLUTE_RESOURCE)) { - for (String label : queueCapacities.getExistingNodeLabels()) { - queueCapacities.setCapacity(label, - this.csContext.getResourceCalculator().divide( - this.csContext.getClusterResource(), - this.csContext.getConfiguration().getMinimumResourceRequirement( - label, - this.csContext.getConfiguration() - .getAutoCreatedQueueTemplateConfPrefix(getQueuePath()), - resourceTypes), - getQueueResourceQuotas().getConfiguredMinResource(label))); - - 
queueCapacities.setMaximumCapacity(label, - this.csContext.getResourceCalculator().divide( - this.csContext.getClusterResource(), - this.csContext.getConfiguration().getMaximumResourceRequirement( - label, - this.csContext.getConfiguration() - .getAutoCreatedQueueTemplateConfPrefix(getQueuePath()), - resourceTypes), - getQueueResourceQuotas().getConfiguredMaxResource(label))); - - queueCapacities.setAbsoluteCapacity(label, - queueCapacities.getCapacity(label) - * getQueueCapacities().getAbsoluteCapacity(label)); - - queueCapacities.setAbsoluteMaximumCapacity(label, - queueCapacities.getMaximumCapacity(label) - * getQueueCapacities().getAbsoluteMaximumCapacity(label)); - } + updateQueueCapacities(queueCapacities); } builder.capacities(queueCapacities); return builder; } + private void updateQueueCapacities(QueueCapacities queueCapacities) { + for (String label : queueCapacities.getExistingNodeLabels()) { + queueCapacities.setCapacity(label, + this.csContext.getResourceCalculator().divide( + this.csContext.getClusterResource(), + this.csContext.getConfiguration().getMinimumResourceRequirement( + label, + this.csContext.getConfiguration() + .getAutoCreatedQueueTemplateConfPrefix(getQueuePath()), + resourceTypes), + getQueueResourceQuotas().getConfiguredMinResource(label))); + + Resource childMaxResource = this.csContext.getConfiguration() + .getMaximumResourceRequirement(label, + this.csContext.getConfiguration() + .getAutoCreatedQueueTemplateConfPrefix(getQueuePath()), + resourceTypes); + Resource parentMaxRes = getQueueResourceQuotas() + .getConfiguredMaxResource(label); + + Resource effMaxResource = Resources.min( + this.csContext.getResourceCalculator(), + this.csContext.getClusterResource(), + childMaxResource.equals(Resources.none()) ? 
parentMaxRes + : childMaxResource, + parentMaxRes); + + queueCapacities.setMaximumCapacity( + label, this.csContext.getResourceCalculator().divide( + this.csContext.getClusterResource(), + effMaxResource, + getQueueResourceQuotas().getConfiguredMaxResource(label))); + + queueCapacities.setAbsoluteCapacity( + label, queueCapacities.getCapacity(label) + * getQueueCapacities().getAbsoluteCapacity(label)); + + queueCapacities.setAbsoluteMaximumCapacity(label, + queueCapacities.getMaximumCapacity(label) + * getQueueCapacities().getAbsoluteMaximumCapacity(label)); + } + } + protected void validate(final CSQueue newlyParsedQueue) throws IOException { // Sanity check if (!(newlyParsedQueue instanceof ManagedParentQueue) || !newlyParsedQueue @@ -276,6 +291,14 @@ public void addChildQueue(CSQueue childQueue) AutoCreatedLeafQueue leafQueue = (AutoCreatedLeafQueue) childQueue; super.addChildQueue(leafQueue); + + /* Below is to avoid Setting Queue Capacity to NaN when ClusterResource + is zero during RM Startup with DominantResourceCalculator */ + if (this.capacityConfigType.equals(CapacityConfigType.ABSOLUTE_RESOURCE)) { + QueueCapacities queueCapacities = getLeafQueueTemplate().getQueueCapacities(); + updateQueueCapacities(queueCapacities); + } + final AutoCreatedLeafQueueConfig initialLeafQueueTemplate = queueManagementPolicy.getInitialLeafQueueConfiguration(leafQueue); -- 2.7.4 (Apple Git-66)