diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/AbstractPreemptableResourceCalculator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/AbstractPreemptableResourceCalculator.java
new file mode 100644
index 0000000..b704c89
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/AbstractPreemptableResourceCalculator.java
@@ -0,0 +1,226 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.PriorityQueue;
+
+/**
+ * Calculates how much resource needs to be preempted for each queue;
+ * used by {@link PreemptionCandidatesSelector}.
+ */
+public class AbstractPreemptableResourceCalculator {
+  private static final Log LOG =
+      LogFactory.getLog(AbstractPreemptableResourceCalculator.class);
+
+  protected final CapacitySchedulerPreemptionContext context;
+  protected final ResourceCalculator rc;
+  private boolean isReservedPreemptionCandidatesSelector;
+
+  static class TQComparator implements Comparator<TempQueuePerPartition> {
+    private ResourceCalculator rc;
+    private Resource clusterRes;
+
+    TQComparator(ResourceCalculator rc, Resource clusterRes) {
+      this.rc = rc;
+      this.clusterRes = clusterRes;
+    }
+
+    @Override
+    public int compare(TempQueuePerPartition tq1, TempQueuePerPartition tq2) {
+      if (getIdealPctOfGuaranteed(tq1) < getIdealPctOfGuaranteed(tq2)) {
+        return -1;
+      }
+      if (getIdealPctOfGuaranteed(tq1) > getIdealPctOfGuaranteed(tq2)) {
+        return 1;
+      }
+      return 0;
+    }
+
+    // Calculates idealAssigned / guaranteed.
+    // TempQueues with 0 guarantees are always considered the most over
+    // capacity and therefore considered last for resources.
+    private double getIdealPctOfGuaranteed(TempQueuePerPartition q) {
+      double pctOver = Integer.MAX_VALUE;
+      if (q != null && Resources.greaterThan(rc, clusterRes,
+          q.getGuaranteed(), Resources.none())) {
+        pctOver = Resources.divide(rc, clusterRes, q.idealAssigned,
+            q.getGuaranteed());
+      }
+      return (pctOver);
+    }
+  }
+
+  /**
+   * AbstractPreemptableResourceCalculator constructor.
+   *
+   * @param preemptionContext context of the preemption policy
+   * @param isReservedPreemptionCandidatesSelector this will be set by
+   *          different implementations of candidate selectors; please refer
+   *          to TempQueuePerPartition#offer for details.
+   */
+  public AbstractPreemptableResourceCalculator(
+      CapacitySchedulerPreemptionContext preemptionContext,
+      boolean isReservedPreemptionCandidatesSelector) {
+    context = preemptionContext;
+    rc = preemptionContext.getResourceCalculator();
+    this.isReservedPreemptionCandidatesSelector =
+        isReservedPreemptionCandidatesSelector;
+  }
+
+  /**
+   * Given a set of queues, compute the fix-point distribution of unassigned
+   * resources among them. As the pending requests of a queue are exhausted,
+   * the queue is removed from the set and the remaining capacity is
+   * redistributed among the remaining queues. The distribution is weighted
+   * based on guaranteed capacity, unless asked to ignoreGuarantee, in which
+   * case resources are distributed uniformly.
+   */
+  protected void computeFixpointAllocation(ResourceCalculator rc,
+      Resource tot_guarant, Collection<TempQueuePerPartition> qAlloc,
+      Resource unassigned, boolean ignoreGuarantee) {
+    // Prior to assigning the unused resources, process each queue as follows:
+    // If current > guaranteed, idealAssigned = guaranteed + untouchable extra
+    // Else idealAssigned = current;
+    // Subtract idealAssigned resources from unassigned.
+    // If the queue has all of its needs met (that is, if
+    // idealAssigned >= current + pending), remove it from consideration.
+    // Sort queues from most under-guaranteed to most over-guaranteed.
+    TQComparator tqComparator = new TQComparator(rc, tot_guarant);
+    PriorityQueue<TempQueuePerPartition> orderedByNeed = new PriorityQueue<>(10,
+        tqComparator);
+    for (Iterator<TempQueuePerPartition> i = qAlloc.iterator(); i.hasNext();) {
+      TempQueuePerPartition q = i.next();
+      Resource used = q.getUsed();
+
+      if (Resources.greaterThan(rc, tot_guarant, used, q.getGuaranteed())) {
+        q.idealAssigned = Resources.add(
+            q.getGuaranteed(), q.untouchableExtra);
+      } else {
+        q.idealAssigned = Resources.clone(used);
+      }
+      Resources.subtractFrom(unassigned, q.idealAssigned);
+      // If idealAssigned < (allocated + used + pending), q needs more
+      // resources, so add it to the list of underserved queues, ordered by
+      // need.
+      Resource curPlusPend = Resources.add(q.getUsed(), q.pending);
+      if (Resources.lessThan(rc, tot_guarant, q.idealAssigned, curPlusPend)) {
+        orderedByNeed.add(q);
+      }
+    }
+
+    // Assign all cluster resources until there is no more demand, or no
+    // resources are left.
+    while (!orderedByNeed.isEmpty()
+        && Resources.greaterThan(rc, tot_guarant, unassigned,
+            Resources.none())) {
+      Resource wQassigned = Resource.newInstance(0, 0);
+      // We compute the normalizedGuarantees capacity based on currently
+      // active queues.
+      resetCapacity(rc, unassigned, orderedByNeed, ignoreGuarantee);
+
+      // For each underserved queue (or set of queues if multiple are equally
+      // underserved), offer its share of the unassigned resources based on its
+      // normalized guarantee. After the offer, if the queue is not satisfied,
+      // place it back in the ordered list of queues, recalculating its place
+      // in the order of most under-guaranteed to most over-guaranteed. In this
+      // way, the most underserved queue(s) are always given resources first.
+      Collection<TempQueuePerPartition> underserved =
+          getMostUnderservedQueues(orderedByNeed, tqComparator);
+      for (Iterator<TempQueuePerPartition> i = underserved.iterator();
+          i.hasNext();) {
+        TempQueuePerPartition sub = i.next();
+        Resource wQavail = Resources.multiplyAndNormalizeUp(rc,
+            unassigned, sub.normalizedGuarantee, Resource.newInstance(1, 1));
+        Resource wQidle = sub.offer(wQavail, rc, tot_guarant,
+            isReservedPreemptionCandidatesSelector);
+        Resource wQdone = Resources.subtract(wQavail, wQidle);
+
+        if (Resources.greaterThan(rc, tot_guarant,
+            wQdone, Resources.none())) {
+          // The queue is still asking for more. Put it back in the priority
+          // queue, recalculating its order based on need.
+          orderedByNeed.add(sub);
+        }
+        Resources.addTo(wQassigned, wQdone);
+      }
+      Resources.subtractFrom(unassigned, wQassigned);
+    }
+  }
+
+  /**
+   * Computes a normalizedGuarantee capacity based on active queues.
+   *
+   * @param rc resource calculator
+   * @param clusterResource the total amount of resources in the cluster
+   * @param queues the list of queues to consider
+   * @param ignoreGuar whether to ignore guarantees and distribute uniformly
+   */
+  private void resetCapacity(ResourceCalculator rc, Resource clusterResource,
+      Collection<TempQueuePerPartition> queues, boolean ignoreGuar) {
+    Resource activeCap = Resource.newInstance(0, 0);
+
+    if (ignoreGuar) {
+      for (TempQueuePerPartition q : queues) {
+        q.normalizedGuarantee = 1.0f / queues.size();
+      }
+    } else {
+      for (TempQueuePerPartition q : queues) {
+        Resources.addTo(activeCap, q.getGuaranteed());
+      }
+      for (TempQueuePerPartition q : queues) {
+        q.normalizedGuarantee = Resources.divide(rc, clusterResource,
+            q.getGuaranteed(), activeCap);
+      }
+    }
+  }
+
+  // Take the most underserved TempQueue (the one at the head). Collect and
+  // return the list of all queues that have the same idealAssigned
+  // percentage of guaranteed.
+  private Collection<TempQueuePerPartition> getMostUnderservedQueues(
+      PriorityQueue<TempQueuePerPartition> orderedByNeed,
+      TQComparator tqComparator) {
+    ArrayList<TempQueuePerPartition> underserved = new ArrayList<>();
+    while (!orderedByNeed.isEmpty()) {
+      TempQueuePerPartition q1 = orderedByNeed.remove();
+      underserved.add(q1);
+
+      // Remember underserved queues in order, for later use.
+      context.addPartitionToUnderServedQueues(q1.queueName, q1.partition);
+      TempQueuePerPartition q2 = orderedByNeed.peek();
+      // q1's pct of guaranteed won't be larger than q2's. If it's less, then
+      // return what has already been collected. Otherwise, q1's pct of
+      // guaranteed == that of q2, so add q2 to the underserved list during
+      // the next pass.
+      if (q2 == null || tqComparator.compare(q1, q2) < 0) {
+        return underserved;
+      }
+    }
+    return underserved;
+  }
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionContext.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionContext.java
index c52127d..ba09a9c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionContext.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionContext.java
@@ -19,11 +19,14 @@
 package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
 
 import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
 import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
 
+import java.util.ArrayList;
 import java.util.Collection;
+import java.util.LinkedHashSet;
 import java.util.Set;
 
 interface CapacitySchedulerPreemptionContext {
@@ -49,4 +52,16 @@ TempQueuePerPartition getQueueByPartition(String queueName,
   Set<String> getLeafQueueNames();
 
   Set<String> getAllPartitions();
+
+  int getClusterMaxApplicationPriority();
+
+  Resource getPartitionResource(String partition);
+
+  LinkedHashSet<String> getUnderServedQueuesPerPartition(String partition);
+
+  void addPartitionToUnderServedQueues(String queueName, String partition);
+
+  float getMaxIgnoredOverCapacityForIntraQueue();
+
+  float getMaxAllowablePreemptLimitForIntraQueue();
 }
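To make the fix-point loop in computeFixpointAllocation easier to follow, here is a standalone sketch of the same idea on plain numbers. It is not part of the patch: it collapses the multi-dimensional Resource into a single double, drops untouchableExtra and the normalize-up rounding, and the queue names, guarantees and demands are all invented.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class FixpointDemo {
  static final class Q {
    final String name;
    final double guaranteed, used, pending;
    double idealAssigned;

    Q(String name, double guaranteed, double used, double pending) {
      this.name = name;
      this.guaranteed = guaranteed;
      this.used = used;
      this.pending = pending;
    }
  }

  public static void main(String[] args) {
    // Invented queues over a 100-unit cluster.
    List<Q> queues = new ArrayList<>(Arrays.asList(
        new Q("a", 30, 20, 30),   // under-served, wants more than its guarantee
        new Q("b", 30, 10, 0),    // satisfied below its guarantee
        new Q("c", 40, 60, 0)));  // already over its guarantee
    double unassigned = 100;

    // Seed idealAssigned and collect queues that still need resources
    // (idealAssigned < used + pending), as the method's first loop does.
    List<Q> needy = new ArrayList<>();
    for (Q q : queues) {
      q.idealAssigned = Math.min(q.used, q.guaranteed); // no untouchableExtra here
      unassigned -= q.idealAssigned;
      if (q.idealAssigned < q.used + q.pending) {
        needy.add(q);
      }
    }

    // Fix point: offer each needy queue a share of what is left, weighted by
    // its guarantee, until demand or capacity runs out.
    while (!needy.isEmpty() && unassigned > 1e-9) {
      double totalGuar = 0;
      for (Q q : needy) {
        totalGuar += q.guaranteed;   // resetCapacity() over active queues
      }
      double handedOut = 0;
      for (Iterator<Q> it = needy.iterator(); it.hasNext();) {
        Q q = it.next();
        double share = unassigned * (q.guaranteed / totalGuar);
        double accepted = Math.min(share, q.used + q.pending - q.idealAssigned);
        q.idealAssigned += accepted;
        handedOut += accepted;
        if (accepted < share) {
          it.remove();               // demand exhausted: drop from the set
        }
      }
      unassigned -= handedOut;
      if (handedOut < 1e-9) {
        break;                       // nobody accepted anything more
      }
    }

    for (Q q : queues) {
      System.out.printf("%s: idealAssigned=%.1f%n", q.name, q.idealAssigned);
    }
  }
}
```

Each round recomputes the normalized guarantees over only the still-needy queues, which is exactly what resetCapacity() does over orderedByNeed in the code above.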
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionUtils.java
index 42d8730..dc00414 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionUtils.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionUtils.java
@@ -22,10 +22,11 @@
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
+import java.util.Collection;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 
@@ -70,8 +71,8 @@ public static void deductPreemptableResourcesBasedSelectedCandidates(
       Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates) {
     for (Set<RMContainer> containers : selectedCandidates.values()) {
       for (RMContainer c : containers) {
-        SchedulerNode schedulerNode = context.getScheduler().getSchedulerNode(
-            c.getAllocatedNode());
+        SchedulerNode schedulerNode = context.getScheduler()
+            .getSchedulerNode(c.getAllocatedNode());
         if (null == schedulerNode) {
           continue;
         }
@@ -89,8 +90,44 @@ public static void deductPreemptableResourcesBasedSelectedCandidates(
         if (null != res) {
           tq.deductActuallyToBePreempted(context.getResourceCalculator(),
               tq.totalPartitionResource, res);
+          Collection<TempAppPerQueue> tas = tq.getApps();
+          if (null == tas || tas.isEmpty()) {
+            continue;
+          }
+
+          deductPreemptableResourcePerApp(context, tq.totalPartitionResource,
+              tas, res, partition);
         }
       }
     }
   }
+
+  private static void deductPreemptableResourcePerApp(
+      CapacitySchedulerPreemptionContext context,
+      Resource totalPartitionResource, Collection<TempAppPerQueue> tas,
+      Resource res, String partition) {
+    // Higher-priority apps come first.
+    for (TempAppPerQueue ta : tas) {
+      Resource demand = ta.pendingPerPartition.get(partition);
+      ta.deductActuallyToBePreempted(context.getResourceCalculator(),
+          totalPartitionResource, res, partition);
+      Resources.subtractFrom(demand, res);
+    }
+  }
+
+  public static Map<String, Resource> getResToObtainByPartitionForApps(
+      IntraQueuePreemptionPolicy priorityBasedPolicy,
+      CapacitySchedulerPreemptionContext preemptionContext,
+      LeafQueue leafQueue, String partition, Resource clusterResource) {
+    // This map will store the per-partition resource demand of this queue.
+    HashMap<String, Resource> resToObtainByPartition = new HashMap<>();
+
+    TempQueuePerPartition tq = preemptionContext
+        .getQueueByPartition(leafQueue.getQueueName(), partition);
+
+    // Calculate demand from high-priority apps at the partition level.
+    priorityBasedPolicy.getResourceDemandFromAppsPerQueue(leafQueue,
+        tq.getApps(), resToObtainByPartition);
+    return resToObtainByPartition;
+  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java
index 9df395d..59024dd 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java
@@ -194,16 +194,6 @@ private void preemptAMContainers(Resource clusterResource,
     skippedAMContainerlist.clear();
   }
 
-  private boolean preemptMapContains(
-      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
-      ApplicationAttemptId attemptId, RMContainer rmContainer) {
-    Set<RMContainer> rmContainers;
-    if (null == (rmContainers = preemptMap.get(attemptId))) {
-      return false;
-    }
-    return rmContainers.contains(rmContainer);
-  }
-
   /**
    * Return should we preempt rmContainer. If we should, deduct from
    * resourceToObtainByPartition
@@ -331,35 +321,4 @@ private void preemptFrom(FiCaSchedulerApp app,
         clusterResource, selectedContainers, totalPreemptionAllowed);
     }
   }
-
-  /**
-   * Compare by reversed priority order first, and then reversed containerId
-   * order
-   * @param containers
-   */
-  @VisibleForTesting
-  static void sortContainers(List<RMContainer> containers){
-    Collections.sort(containers, new Comparator<RMContainer>() {
-      @Override
-      public int compare(RMContainer a, RMContainer b) {
-        int schedKeyComp = b.getAllocatedSchedulerKey()
-            .compareTo(a.getAllocatedSchedulerKey());
-        if (schedKeyComp != 0) {
-          return schedKeyComp;
-        }
-        return b.getContainerId().compareTo(a.getContainerId());
-      }
-    });
-  }
-
-  private void addToPreemptMap(
-      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
-      ApplicationAttemptId appAttemptId, RMContainer containerToPreempt) {
-    Set<RMContainer> set;
-    if (null == (set = preemptMap.get(appAttemptId))) {
-      set = new HashSet<>();
-      preemptMap.put(appAttemptId, set);
-    }
-    set.add(containerToPreempt);
-  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueueCandidatesSelector.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueueCandidatesSelector.java
new file mode 100644
index 0000000..08136f7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueueCandidatesSelector.java
@@ -0,0 +1,219 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class IntraQueueCandidatesSelector extends PreemptionCandidatesSelector {
+  private static final Log LOG =
+      LogFactory.getLog(IntraQueueCandidatesSelector.class);
+
+  private IntraQueuePreemptableResourceCalculator
+      intraQPreemptableAmountCalculator;
+
+  IntraQueuePreemptionPolicy priorityBasedPolicy = null;
+  IntraQueuePreemptionPolicy userLimitBasedPolicy = null;
+
+  IntraQueueCandidatesSelector(
+      CapacitySchedulerPreemptionContext preemptionContext) {
+    super(preemptionContext);
+    priorityBasedPolicy =
+        new PriorityIntraQueuePreemptionPolicy(rc, preemptionContext);
+    intraQPreemptableAmountCalculator =
+        new IntraQueuePreemptableResourceCalculator(preemptionContext, true,
+            priorityBasedPolicy);
+  }
+
+  @Override
+  public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
+      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
+      Resource clusterResource, Resource totalPreemptedResourceAllowed) {
+
+    // 1. Calculate the resource imbalance within each queue, one by one.
+    intraQPreemptableAmountCalculator.computeIntraQueuePreemptionDemand(
+        clusterResource, totalPreemptedResourceAllowed, selectedCandidates);
+
+    // 2. Previous selectors (with higher priority) could have already
+    // selected containers. We need to deduct preemptable resources
+    // based on the already selected candidates.
+    CapacitySchedulerPreemptionUtils
+        .deductPreemptableResourcesBasedSelectedCandidates(preemptionContext,
+            selectedCandidates);
+
+    // 3. Loop through all partitions to calculate demand.
+    for (String partition : preemptionContext.getAllPartitions()) {
+      LinkedHashSet<String> queueNames = preemptionContext
+          .getUnderServedQueuesPerPartition(partition);
+
+      // Error check to handle labels that are not mapped to any queue.
+      if (null == queueNames) {
+        continue;
+      }
+
+      // 4. Iterate from the most under-served queue, in order.
+      for (String queueName : queueNames) {
+        LeafQueue leafQueue = preemptionContext.getQueueByPartition(queueName,
+            RMNodeLabelsManager.NO_LABEL).leafQueue;
+
+        // Skip if this is not a leaf queue.
+        if (null == leafQueue) {
+          continue;
+        }
+
+        // 5. Calculate the resource to obtain.
+        Map<String, Resource> resToObtainByPartition =
+            CapacitySchedulerPreemptionUtils.getResToObtainByPartitionForApps(
+                priorityBasedPolicy, preemptionContext, leafQueue, partition,
+                clusterResource);
+
+        synchronized (leafQueue) {
+          // 6. If more resources are to be freed, go through all live
+          // containers in reverse priority and reverse allocation order and
+          // mark them for preemption.
+          Iterator<FiCaSchedulerApp> desc = leafQueue.getOrderingPolicy()
+              .getPreemptionIterator();
+          while (desc.hasNext()) {
+            FiCaSchedulerApp app = desc.next();
+            preemptFromLeastStarvedApp(selectedCandidates, clusterResource,
+                totalPreemptedResourceAllowed, resToObtainByPartition,
+                leafQueue, app);
+          }
+        }
+      }
+    }
+
+    return selectedCandidates;
+  }
+
+  private void preemptFromLeastStarvedApp(
+      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
+      Resource clusterResource, Resource totalPreemptedResourceAllowed,
+      Map<String, Resource> resToObtainByPartition, LeafQueue leafQueue,
+      FiCaSchedulerApp app) {
+
+    // ToDo:
+    // 1. Reuse the reservation selector here.
+
+    List<RMContainer> liveContainers =
+        new ArrayList<>(app.getLiveContainers());
+    sortContainers(liveContainers);
+
+    for (RMContainer c : liveContainers) {
+
+      // If there is no demand left, return.
+      if (resToObtainByPartition.isEmpty()) {
+        return;
+      }
+
+      String nodePartition = getPartitionByNodeId(c.getAllocatedNode());
+      Resource toObtainByPartition = resToObtainByPartition.get(nodePartition);
+
+      // When no more resource needs to be obtained for this partition,
+      // remove it from the map.
+      if (null != toObtainByPartition
+          && Resources.lessThanOrEqual(rc, clusterResource,
+              toObtainByPartition, Resources.none())) {
+        resToObtainByPartition.remove(nodePartition);
+      }
+
+      // Skip preselected containers.
+      if (CapacitySchedulerPreemptionUtils.isContainerAlreadySelected(c,
+          selectedCandidates)) {
+        continue;
+      }
+
+      // Skip containers already marked as killable.
+      if (null != preemptionContext.getKillableContainers() && preemptionContext
+          .getKillableContainers().contains(c.getContainerId())) {
+        continue;
+      }
+
+      // Skip AM containers from preemption for now.
+      // ToDo: Need to decide whether intra-queue preemption should consider
+      // AM containers or not.
+      if (c.isAMContainer()) {
+        continue;
+      }
+
+      // Try to preempt this container.
+      tryPreemptContainerAndDeductResToObtain(toObtainByPartition, c,
+          clusterResource, selectedCandidates, totalPreemptedResourceAllowed);
+    }
+  }
+
+  private String getPartitionByNodeId(NodeId nodeId) {
+    return preemptionContext.getScheduler().getSchedulerNode(nodeId)
+        .getPartition();
+  }
+
+  /**
+   * Return whether we should preempt rmContainer. If we should, deduct it
+   * from resourceToObtainByPartition.
+   */
+  private boolean tryPreemptContainerAndDeductResToObtain(
+      Resource actualPreemptNeeded, RMContainer rmContainer,
+      Resource clusterResource,
+      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
+      Resource totalPreemptionAllowed) {
+    ApplicationAttemptId attemptId = rmContainer.getApplicationAttemptId();
+
+    // We will not account the resource of a container twice or more.
+    if (preemptMapContains(preemptMap, attemptId, rmContainer)) {
+      return false;
+    }
+
+    Resource toObtainByPartition = actualPreemptNeeded;
+
+    if (null != toObtainByPartition
+        && Resources.greaterThan(rc, clusterResource, toObtainByPartition,
+            Resources.none())
+        && Resources.fitsIn(rc, clusterResource,
+            rmContainer.getAllocatedResource(), totalPreemptionAllowed)) {
+      Resources.subtractFrom(toObtainByPartition,
+          rmContainer.getAllocatedResource());
+      Resources.subtractFrom(totalPreemptionAllowed,
+          rmContainer.getAllocatedResource());
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(this.getClass().getName() + " marked container="
+            + rmContainer.getContainerId() + " queue="
+            + rmContainer.getQueueName() + " as a preemption candidate");
+      }
+      // Add to preemptMap.
+      addToPreemptMap(preemptMap, attemptId, rmContainer);
+      return true;
+    }
+
+    return false;
+  }
+}
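The per-partition bookkeeping that preemptFromLeastStarvedApp and tryPreemptContainerAndDeductResToObtain perform with Resource objects can be sketched on plain integers. The sketch below is illustrative only: container ids, partition names and sizes are invented, and a single int stands in for a multi-dimensional Resource.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ToObtainDemo {
  static final class C {
    final String partition;
    final int mb;
    final int id;

    C(String partition, int mb, int id) {
      this.partition = partition;
      this.mb = mb;
      this.id = id;
    }
  }

  public static void main(String[] args) {
    // MB still to be freed per node partition ("" is the default partition).
    Map<String, Integer> toObtain = new HashMap<>();
    toObtain.put("", 3072);
    toObtain.put("gpu", 1024);

    // Live containers, assumed already sorted in reverse priority /
    // reverse allocation order.
    List<C> live = Arrays.asList(
        new C("", 2048, 9), new C("gpu", 2048, 8), new C("", 2048, 7));
    List<Integer> marked = new ArrayList<>();

    for (C c : live) {
      if (toObtain.isEmpty()) {
        break;                                // nothing left to free anywhere
      }
      Integer need = toObtain.get(c.partition);
      if (need == null) {
        continue;                             // no demand for this partition
      }
      if (need <= 0) {
        toObtain.remove(c.partition);         // demand met: stop tracking it
        continue;
      }
      marked.add(c.id);                       // mark container for preemption
      toObtain.put(c.partition, need - c.mb); // deduct; may overshoot, as in the patch
    }
    System.out.println("marked=" + marked + ", remaining=" + toObtain);
  }
}
```

As in the patch, the last container marked for a partition can push the remaining demand below zero; the partition is only dropped from the map the next time a container from it is visited.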
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueuePreemptableResourceCalculator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueuePreemptableResourceCalculator.java
new file mode 100644
index 0000000..b37755e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueuePreemptableResourceCalculator.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Calculates how much resource needs to be preempted within each queue;
+ * used by {@link PreemptionCandidatesSelector}.
+ */
+public class IntraQueuePreemptableResourceCalculator
+    extends AbstractPreemptableResourceCalculator {
+
+  protected IntraQueuePreemptionPolicy priorityBasedPolicy;
+
+  static class TAPriorityComparator implements Comparator<TempAppPerQueue> {
+
+    @Override
+    public int compare(TempAppPerQueue tq1, TempAppPerQueue tq2) {
+      if (tq1.getPriority() < tq2.getPriority()) {
+        return 1;
+      }
+      if (tq1.getPriority() > tq2.getPriority()) {
+        return -1;
+      }
+      return tq1.getApplicationId().compareTo(tq2.getApplicationId());
+    }
+  }
+
+  static class TAReverseComparator implements Comparator<TempAppPerQueue> {
+
+    @Override
+    public int compare(TempAppPerQueue tq1, TempAppPerQueue tq2) {
+      if (tq1.getPriority() > tq2.getPriority()) {
+        return 1;
+      }
+      if (tq1.getPriority() < tq2.getPriority()) {
+        return -1;
+      }
+      return tq2.getApplicationId().compareTo(tq1.getApplicationId());
+    }
+  }
+
+  /**
+   * IntraQueuePreemptableResourceCalculator constructor.
+   *
+   * @param preemptionContext context of the preemption policy
+   * @param isReservedPreemptionCandidatesSelector
+   *          this will be set by different implementations of candidate
+   *          selectors; please refer to TempQueuePerPartition#offer for
+   *          details.
+   * @param priorityBasedPolicy policy used to compute per-app demand
+   */
+  public IntraQueuePreemptableResourceCalculator(
+      CapacitySchedulerPreemptionContext preemptionContext,
+      boolean isReservedPreemptionCandidatesSelector,
+      IntraQueuePreemptionPolicy priorityBasedPolicy) {
+    super(preemptionContext, isReservedPreemptionCandidatesSelector);
+    this.priorityBasedPolicy = priorityBasedPolicy;
+  }
+
+  public void computeIntraQueuePreemptionDemand(Resource clusterResource,
+      Resource totalPreemptedResourceAllowed,
+      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates) {
+
+    for (String partition : context.getAllPartitions()) {
+      LinkedHashSet<String> queueNames = context
+          .getUnderServedQueuesPerPartition(partition);
+
+      if (null == queueNames) {
+        continue;
+      }
+
+      for (String queueName : queueNames) {
+        TempQueuePerPartition tq = context.getQueueByPartition(queueName,
+            partition);
+        LeafQueue leafQueue = tq.leafQueue;
+
+        // Skip if it is a parent queue.
+        if (null == leafQueue) {
+          continue;
+        }
+        tq.setUnAllocated(Resources.subtract(tq.getGuaranteed(),
+            tq.getActuallyToBePreempted()));
+
+        Collection<FiCaSchedulerApp> apps = leafQueue.getAllApplications();
+
+        if (apps.size() == 1) {
+          // We do not need preemption for a single app.
+          continue;
+        }
+
+        // Check the queue's used capacity. Make sure that the used capacity
+        // is above a certain limit before considering the queue for
+        // intra-queue preemption.
+        if (leafQueue.getUsedCapacity() < context
+            .getMaxIgnoredOverCapacityForIntraQueue()) {
+          continue;
+        }
+
+        // Compute the ideal allocation of all apps based on the queue's
+        // unallocated capacity.
+        priorityBasedPolicy.computeAppsIdealAllocation(rc, clusterResource,
+            apps, tq, selectedCandidates, totalPreemptedResourceAllowed,
+            context.getMaxAllowablePreemptLimitForIntraQueue());
+      }
+    }
+  }
+}
\ No newline at end of file
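The two comparators above decide the order in which apps receive ideal capacity or give it up. A minimal standalone illustration of the TAPriorityComparator ordering (higher priority first, ties broken by ascending application id), with plain ints standing in for Priority and ApplicationId:

```java
import java.util.Arrays;
import java.util.Comparator;
import java.util.PriorityQueue;

public class PriorityOrderDemo {
  static final class App {
    final int priority;  // stand-in for Priority
    final int appId;     // stand-in for ApplicationId

    App(int priority, int appId) {
      this.priority = priority;
      this.appId = appId;
    }

    @Override
    public String toString() {
      return "app_" + appId + "(p" + priority + ")";
    }
  }

  public static void main(String[] args) {
    // Mirrors TAPriorityComparator: higher priority first, ties broken by
    // ascending application id (i.e. older apps first).
    Comparator<App> taPriority = (a, b) -> {
      if (a.priority != b.priority) {
        return Integer.compare(b.priority, a.priority);
      }
      return Integer.compare(a.appId, b.appId);
    };

    PriorityQueue<App> ordered = new PriorityQueue<>(taPriority);
    ordered.addAll(Arrays.asList(new App(1, 3), new App(5, 2), new App(5, 1)));

    // Drains as app_1(p5), app_2(p5), app_3(p1): ideal capacity is handed out
    // in this order, so low-priority apps only keep what is left over.
    while (!ordered.isEmpty()) {
      System.out.println(ordered.remove());
    }
  }
}
```

TAReverseComparator simply inverts this order; it is used later to walk the same list from lowest to highest priority when computing what each app should give up.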
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueuePreemptionPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueuePreemptionPolicy.java
new file mode 100644
index 0000000..d176301
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueuePreemptionPolicy.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
+
+interface IntraQueuePreemptionPolicy {
+
+  void getResourceDemandFromAppsPerQueue(LeafQueue leafQueue,
+      Collection<TempAppPerQueue> orderedApps,
+      Map<String, Resource> resToObtainByPartition);
+
+  void computeAppsIdealAllocation(ResourceCalculator rc,
+      Resource clusterResource, Collection<FiCaSchedulerApp> apps,
+      TempQueuePerPartition tq,
+      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
+      Resource totalPreemptedResourceAllowed, float maxAllowablePreemptLimit);
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptableResourceCalculator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptableResourceCalculator.java
index d1d2485..7bd3053 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptableResourceCalculator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptableResourceCalculator.java
@@ -27,61 +27,23 @@
 import org.apache.hadoop.yarn.util.resource.Resources;
 
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashSet;
-import java.util.Iterator;
 import java.util.List;
-import java.util.PriorityQueue;
 import java.util.Set;
 
 /**
  * Calculate how much resources need to be preempted for each queue,
  * will be used by {@link PreemptionCandidatesSelector}
  */
-public class PreemptableResourceCalculator {
+public class PreemptableResourceCalculator
+    extends AbstractPreemptableResourceCalculator {
   private static final Log LOG =
       LogFactory.getLog(PreemptableResourceCalculator.class);
 
-  private final CapacitySchedulerPreemptionContext context;
-  private final ResourceCalculator rc;
   private boolean isReservedPreemptionCandidatesSelector;
 
-  static class TQComparator implements Comparator<TempQueuePerPartition> {
-    private ResourceCalculator rc;
-    private Resource clusterRes;
-
-    TQComparator(ResourceCalculator rc, Resource clusterRes) {
-      this.rc = rc;
-      this.clusterRes = clusterRes;
-    }
-
-    @Override
-    public int compare(TempQueuePerPartition tq1, TempQueuePerPartition tq2) {
-      if (getIdealPctOfGuaranteed(tq1) < getIdealPctOfGuaranteed(tq2)) {
-        return -1;
-      }
-      if (getIdealPctOfGuaranteed(tq1) > getIdealPctOfGuaranteed(tq2)) {
-        return 1;
-      }
-      return 0;
-    }
-
-    // Calculates idealAssigned / guaranteed
-    // TempQueues with 0 guarantees are always considered the most over
-    // capacity and therefore considered last for resources.
-    private double getIdealPctOfGuaranteed(TempQueuePerPartition q) {
-      double pctOver = Integer.MAX_VALUE;
-      if (q != null && Resources.greaterThan(rc, clusterRes,
-          q.getGuaranteed(),
-          Resources.none())) {
-        pctOver = Resources.divide(rc, clusterRes, q.idealAssigned,
-            q.getGuaranteed());
-      }
-      return (pctOver);
-    }
-  }
-
   /**
    * PreemptableResourceCalculator constructor
    *
@@ -93,136 +55,7 @@ private double getIdealPctOfGuaranteed(TempQueuePerPartition q) {
   public PreemptableResourceCalculator(
       CapacitySchedulerPreemptionContext preemptionContext,
       boolean isReservedPreemptionCandidatesSelector) {
-    context = preemptionContext;
-    rc = preemptionContext.getResourceCalculator();
-    this.isReservedPreemptionCandidatesSelector =
-        isReservedPreemptionCandidatesSelector;
-  }
-
-  /**
-   * Computes a normalizedGuaranteed capacity based on active queues
-   * @param rc resource calculator
-   * @param clusterResource the total amount of resources in the cluster
-   * @param queues the list of queues to consider
-   */
-  private void resetCapacity(ResourceCalculator rc, Resource clusterResource,
-      Collection<TempQueuePerPartition> queues, boolean ignoreGuar) {
-    Resource activeCap = Resource.newInstance(0, 0);
-
-    if (ignoreGuar) {
-      for (TempQueuePerPartition q : queues) {
-        q.normalizedGuarantee = 1.0f / queues.size();
-      }
-    } else {
-      for (TempQueuePerPartition q : queues) {
-        Resources.addTo(activeCap, q.getGuaranteed());
-      }
-      for (TempQueuePerPartition q : queues) {
-        q.normalizedGuarantee = Resources.divide(rc, clusterResource,
-            q.getGuaranteed(), activeCap);
-      }
-    }
-  }
-
-  // Take the most underserved TempQueue (the one on the head). Collect and
-  // return the list of all queues that have the same idealAssigned
-  // percentage of guaranteed.
-  protected Collection<TempQueuePerPartition> getMostUnderservedQueues(
-      PriorityQueue<TempQueuePerPartition> orderedByNeed,
-      TQComparator tqComparator) {
-    ArrayList<TempQueuePerPartition> underserved = new ArrayList<>();
-    while (!orderedByNeed.isEmpty()) {
-      TempQueuePerPartition q1 = orderedByNeed.remove();
-      underserved.add(q1);
-      TempQueuePerPartition q2 = orderedByNeed.peek();
-      // q1's pct of guaranteed won't be larger than q2's. If it's less, then
-      // return what has already been collected. Otherwise, q1's pct of
-      // guaranteed == that of q2, so add q2 to underserved list during the
-      // next pass.
-      if (q2 == null || tqComparator.compare(q1, q2) < 0) {
-        return underserved;
-      }
-    }
-    return underserved;
-  }
-
-
-  /**
-   * Given a set of queues compute the fix-point distribution of unassigned
-   * resources among them. As pending request of a queue are exhausted, the
-   * queue is removed from the set and remaining capacity redistributed among
-   * remaining queues. The distribution is weighted based on guaranteed
-   * capacity, unless asked to ignoreGuarantee, in which case resources are
-   * distributed uniformly.
-   */
-  private void computeFixpointAllocation(ResourceCalculator rc,
-      Resource tot_guarant, Collection<TempQueuePerPartition> qAlloc,
-      Resource unassigned, boolean ignoreGuarantee) {
-    // Prior to assigning the unused resources, process each queue as follows:
-    // If current > guaranteed, idealAssigned = guaranteed + untouchable extra
-    // Else idealAssigned = current;
-    // Subtract idealAssigned resources from unassigned.
-    // If the queue has all of its needs met (that is, if
-    // idealAssigned >= current + pending), remove it from consideration.
-    // Sort queues from most under-guaranteed to most over-guaranteed.
-    TQComparator tqComparator = new TQComparator(rc, tot_guarant);
-    PriorityQueue<TempQueuePerPartition> orderedByNeed = new PriorityQueue<>(10,
-        tqComparator);
-    for (Iterator<TempQueuePerPartition> i = qAlloc.iterator(); i.hasNext();) {
-      TempQueuePerPartition q = i.next();
-      Resource used = q.getUsed();
-
-      if (Resources.greaterThan(rc, tot_guarant, used,
-          q.getGuaranteed())) {
-        q.idealAssigned = Resources.add(
-            q.getGuaranteed(), q.untouchableExtra);
-      } else {
-        q.idealAssigned = Resources.clone(used);
-      }
-      Resources.subtractFrom(unassigned, q.idealAssigned);
-      // If idealAssigned < (allocated + used + pending), q needs more
-      // resources, so add it to the list of underserved queues, ordered by
-      // need.
-      Resource curPlusPend = Resources.add(q.getUsed(), q.pending);
-      if (Resources.lessThan(rc, tot_guarant, q.idealAssigned, curPlusPend)) {
-        orderedByNeed.add(q);
-      }
-    }
-
-    // assign all cluster resources until no more demand, or no resources are
-    // left
-    while (!orderedByNeed.isEmpty()
-        && Resources.greaterThan(rc, tot_guarant, unassigned,
-            Resources.none())) {
-      Resource wQassigned = Resource.newInstance(0, 0);
-      // we compute normalizedGuarantees capacity based on currently active
-      // queues
-      resetCapacity(rc, unassigned, orderedByNeed, ignoreGuarantee);
-
-      // For each underserved queue (or set of queues if multiple are equally
-      // underserved), offer its share of the unassigned resources based on its
-      // normalized guarantee. After the offer, if the queue is not satisfied,
-      // place it back in the ordered list of queues, recalculating its place
-      // in the order of most under-guaranteed to most over-guaranteed. In this
-      // way, the most underserved queue(s) are always given resources first.
-      Collection<TempQueuePerPartition> underserved =
-          getMostUnderservedQueues(orderedByNeed, tqComparator);
-      for (Iterator<TempQueuePerPartition> i = underserved.iterator();
-          i.hasNext();) {
-        TempQueuePerPartition sub = i.next();
-        Resource wQavail = Resources.multiplyAndNormalizeUp(rc,
-            unassigned, sub.normalizedGuarantee, Resource.newInstance(1, 1));
-        Resource wQidle = sub.offer(wQavail, rc, tot_guarant,
-            isReservedPreemptionCandidatesSelector);
-        Resource wQdone = Resources.subtract(wQavail, wQidle);
-
-        if (Resources.greaterThan(rc, tot_guarant,
-            wQdone, Resources.none())) {
-          // The queue is still asking for more. Put it back in the priority
-          // queue, recalculating its order based on need.
-          orderedByNeed.add(sub);
-        }
-        Resources.addTo(wQassigned, wQdone);
-      }
-      Resources.subtractFrom(unassigned, wQassigned);
-    }
+    super(preemptionContext, isReservedPreemptionCandidatesSelector);
   }
 
   /**
@@ -321,13 +154,12 @@ private void recursivelyComputeIdealAssignment(
       computeIdealResourceDistribution(rc, root.getChildren(),
           totalPreemptionAllowed, root.idealAssigned);
       // compute recursively for lower levels and build list of leafs
-      for(TempQueuePerPartition t : root.getChildren()) {
+      for (TempQueuePerPartition t : root.getChildren()) {
         recursivelyComputeIdealAssignment(t, totalPreemptionAllowed);
       }
     }
   }
-
   private void calculateResToObtainByPartitionForLeafQueues(
       Set<String> leafQueueNames, Resource clusterResource) {
     // Loop all leaf queues
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java
index dd33d8f..36b4961 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java
@@ -23,6 +23,12 @@
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
 
+import com.google.common.annotations.VisibleForTesting;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -49,4 +55,49 @@ public abstract Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
       Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
       Resource clusterResource, Resource totalPreemptedResourceAllowed);
+
+  /**
+   * Compare by reversed priority order first, and then reversed containerId
+   * order.
+   *
+   * @param containers list of containers to sort
+   */
+  // Moved from FifoCandidatesSelector so other selectors can reuse it.
+  @VisibleForTesting
+  static void sortContainers(List<RMContainer> containers) {
+    Collections.sort(containers, new Comparator<RMContainer>() {
+      @Override
+      public int compare(RMContainer a, RMContainer b) {
+        int schedKeyComp = b.getAllocatedSchedulerKey()
+            .compareTo(a.getAllocatedSchedulerKey());
+        if (schedKeyComp != 0) {
+          return schedKeyComp;
+        }
+        return b.getContainerId().compareTo(a.getContainerId());
+      }
+    });
+  }
+
+  // Moved from FifoCandidatesSelector so other selectors can reuse it.
+  protected void addToPreemptMap(
+      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
+      ApplicationAttemptId appAttemptId, RMContainer containerToPreempt) {
+    Set<RMContainer> set;
+    if (null == (set = preemptMap.get(appAttemptId))) {
+      set = new HashSet<>();
+      preemptMap.put(appAttemptId, set);
+    }
+    set.add(containerToPreempt);
+  }
+
+  // Moved from FifoCandidatesSelector so other selectors can reuse it.
+  protected boolean preemptMapContains(
+      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
+      ApplicationAttemptId attemptId, RMContainer rmContainer) {
+    Set<RMContainer> rmContainers;
+    if (null == (rmContainers = preemptMap.get(attemptId))) {
+      return false;
+    }
+    return rmContainers.contains(rmContainer);
+  }
 }
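The sortContainers() comparator that moved into this base class orders by reversed scheduler key and then reversed container id. A small self-contained demo of the same ordering, with plain numbers standing in for SchedulerRequestKey and ContainerId:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ContainerOrderDemo {
  static final class C {
    final int schedKey;      // stand-in for the allocated SchedulerRequestKey
    final long containerId;  // stand-in for ContainerId

    C(int schedKey, long containerId) {
      this.schedKey = schedKey;
      this.containerId = containerId;
    }

    @Override
    public String toString() {
      return "c" + containerId + "(k" + schedKey + ")";
    }
  }

  public static void main(String[] args) {
    List<C> containers = new ArrayList<>(Arrays.asList(
        new C(1, 101), new C(3, 102), new C(1, 103)));

    // Mirrors sortContainers(): both comparisons are reversed (b against a),
    // so larger scheduler keys sort first and, within a key, newer container
    // ids sort first.
    containers.sort((a, b) -> {
      int cmp = Integer.compare(b.schedKey, a.schedKey);
      return cmp != 0 ? cmp : Long.compare(b.containerId, a.containerId);
    });
    System.out.println(containers); // [c102(k3), c103(k1), c101(k1)]
  }
}
```

Within an app this puts the most recently allocated containers of a given key at the front of the preemption scan.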
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PriorityIntraQueuePreemptionPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PriorityIntraQueuePreemptionPolicy.java
new file mode 100644
index 0000000..a78d1c9
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PriorityIntraQueuePreemptionPolicy.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.PriorityQueue;
+import java.util.Set;
+
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.IntraQueuePreemptableResourceCalculator.TAPriorityComparator;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.IntraQueuePreemptableResourceCalculator.TAReverseComparator;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+public class PriorityIntraQueuePreemptionPolicy
+    implements IntraQueuePreemptionPolicy {
+
+  protected final CapacitySchedulerPreemptionContext context;
+  protected final ResourceCalculator rc;
+
+  public PriorityIntraQueuePreemptionPolicy(ResourceCalculator rc,
+      CapacitySchedulerPreemptionContext preemptionContext) {
+    this.context = preemptionContext;
+    this.rc = rc;
+  }
+
+  @Override
+  public void getResourceDemandFromAppsPerQueue(LeafQueue leafQueue,
+      Collection<TempAppPerQueue> appsOrderedByPriority,
+      Map<String, Resource> resToObtainByPartition) {
+
+    Resource actualPreemptNeeded = null;
+
+    for (TempAppPerQueue a1 : appsOrderedByPriority) {
+      for (Entry<String, Resource> entry : a1.getPendingPerPartition()
+          .entrySet()) {
+        String partition = entry.getKey();
+
+        // We can skip apps which are already crossing their user limit. For
+        // this, get the user-limit headroom from the scheduler and ensure
+        // that the app is not crossing it here; such apps can be skipped.
+        Resource userHeadroom = leafQueue.getUserLimitHeadRoomPerApp(
+            a1.getFiCaSchedulerApp(), context.getPartitionResource(partition),
+            partition);
+        if (Resources.lessThanOrEqual(rc,
+            context.getPartitionResource(partition), userHeadroom,
+            Resources.none())) {
+          continue;
+        }
+
+        // Update pending resource at the per-partition level.
+        if ((actualPreemptNeeded = resToObtainByPartition
+            .get(partition)) == null) {
+          actualPreemptNeeded = Resources.createResource(0, 0);
+          resToObtainByPartition.put(partition, actualPreemptNeeded);
+        }
+        Resources.addTo(actualPreemptNeeded, a1.toBePreempted);
+      }
+    }
+  }
+
+  @Override
+  public void computeAppsIdealAllocation(ResourceCalculator rc,
+      Resource clusterResource, Collection<FiCaSchedulerApp> apps,
+      TempQueuePerPartition tq,
+      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
+      Resource totalPreemptedResourceAllowed, float maxAllowablePreemptLimit) {
+
+    TAPriorityComparator taComparator = new TAPriorityComparator();
+    PriorityQueue<TempAppPerQueue> orderedByPriority =
+        createTempAppForResourceCalculation(rc, apps, taComparator);
+
+    // Apps ordered from highest to lowest priority.
+    ArrayList<TempAppPerQueue> orderedApps = getHighPriorityApps(
+        orderedByPriority, taComparator);
+    for (TempAppPerQueue tmpApp : orderedApps) {
+      // Once the unallocated resource hits 0, we can stop assigning ideal
+      // capacity per app.
+      if (Resources.lessThanOrEqual(rc, clusterResource, tq.unAllocated,
+          Resources.none())) {
+        break;
+      }
+
+      // Calculate the total size of containers already selected from the
+      // current app.
+      Resource selected = getSelectedResourcePerApp(selectedCandidates, tmpApp);
+
+      // For any app, used + pending gives its idealAssigned. However, it is
+      // capped by the queue's unallocated quota, so a lower-priority app's
+      // idealAssigned may fall to 0 if higher-priority apps demand more.
+      Resource initialIdealAssigned = Resources.add(tmpApp.getUsed(),
+          tmpApp.pending);
+      Resources.subtractFrom(initialIdealAssigned, selected);
+      tmpApp.idealAssigned = Resources.min(rc, clusterResource, tq.unAllocated,
+          initialIdealAssigned);
+
+      Resources.subtractFrom(tq.unAllocated, tmpApp.idealAssigned);
+    }
+
+    // A configurable limit defines the allowable preemption cap: based on the
+    // current queue's capacity, it determines what percentage may become
+    // preemptable.
+    Resource maxIntraQueuePreemptable = Resources.multiply(tq.getGuaranteed(),
+        maxAllowablePreemptLimit);
+    Resources.subtractFrom(maxIntraQueuePreemptable,
+        tq.getActuallyToBePreempted());
+
+    // We have two limits here: the intra-queue limit and the per-round limit
+    // for overall preemption. Take the minimum of these two.
+    Resource preemptionLimit = Resources.min(rc, clusterResource,
+        maxIntraQueuePreemptable, totalPreemptedResourceAllowed);
+
+    // Re-sort the collection to get apps from lowest to highest priority in
+    // order to calculate the preemptable resource per app.
+    Collections.sort(orderedApps, new TAReverseComparator());
+
+    for (TempAppPerQueue tmpApp : orderedApps) {
+      if (Resources.lessThanOrEqual(rc, clusterResource, preemptionLimit,
+          Resources.none())) {
+        break;
+      }
+
+      Resource preemptableFromApp = Resources.subtract(tmpApp.getUsed(),
+          tmpApp.idealAssigned);
+      Resources.subtractFrom(preemptableFromApp, tmpApp.selected);
+
+      // Calculate toBePreempted from apps as follows:
+      // app.preemptable = min(max(app.used - app.selected - app.ideal, 0),
+      //     intra_q_preemptable)
+      tmpApp.toBePreempted = Resources.min(rc, clusterResource, Resources
+          .max(rc, clusterResource, preemptableFromApp, Resources.none()),
+          preemptionLimit);
+      preemptionLimit = Resources.subtract(preemptionLimit,
+          tmpApp.toBePreempted);
+    }
+
+    // Save all apps to the temp queue for further reference.
+    tq.addAllApps(orderedApps);
+  }
+
+  // Drain the priority queue into a list ordered from highest to lowest
+  // priority.
+  private ArrayList<TempAppPerQueue> getHighPriorityApps(
+      PriorityQueue<TempAppPerQueue> orderedByNeed,
+      TAPriorityComparator taComparator) {
+    ArrayList<TempAppPerQueue> underserved = new ArrayList<>();
+    while (!orderedByNeed.isEmpty()) {
+      TempAppPerQueue a1 = orderedByNeed.remove();
+      underserved.add(a1);
+      TempAppPerQueue a2 = orderedByNeed.peek();
+      // a1 never orders after a2, so unlike the queue-level variant this
+      // loop collects every app, in priority order.
+      if (a2 == null || taComparator.compare(a1, a2) > 0) {
+        return underserved;
+      }
+    }
+
+    return underserved;
+  }
+
+  private Resource getSelectedResourcePerApp(
+      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
+      TempAppPerQueue tmpApp) {
+    tmpApp.selected = Resources.createResource(0, 0);
+    Set<RMContainer> containers = selectedCandidates
+        .get(tmpApp.app.getApplicationAttemptId());
+
+    if (containers == null) {
+      return tmpApp.selected;
+    }
+    for (RMContainer cont : containers) {
+      Resources.addTo(tmpApp.selected, cont.getAllocatedResource());
+    }
+
+    return tmpApp.selected;
+  }
+
+  private PriorityQueue<TempAppPerQueue> createTempAppForResourceCalculation(
+      ResourceCalculator rc, Collection<FiCaSchedulerApp> apps,
+      TAPriorityComparator taComparator) {
+    PriorityQueue<TempAppPerQueue> orderedByPriority = new PriorityQueue<>(100,
+        taComparator);
+
+    // Build an internal temp-app structure to store intermediate data such
+    // as priority.
+    for (FiCaSchedulerApp app : apps) {
+
+      Set<String> partitions = app.getAppAttemptResourceUsage()
+          .getNodePartitionsSet();
+      HashMap<String, Resource> usedPerPartition = new HashMap<>();
+      for (String partition : partitions) {
+        usedPerPartition.put(partition,
+            app.getAppAttemptResourceUsage().getUsed(partition));
+      }
+
+      // Create a TempAppPerQueue for further calculation.
+      TempAppPerQueue tmpApp = new TempAppPerQueue(app.getQueueName(),
+          app.getApplicationId(), usedPerPartition,
+          app.getCurrentReservation(), app.getTotalPendingRequests(),
+          (HashMap<String, Resource>) app.getTotalPendingRequestsPerPartition(),
+          app.getPriority().getPriority(), app);
+
+      // Set the ideal allocation of the app to 0.
+      tmpApp.idealAssigned = Resources.createResource(0, 0);
+
+      // Skip an app which doesn't have any outstanding resource requests.
+      if (Resources.lessThanOrEqual(rc, Resources.none(),
+          app.getTotalPendingRequests(), Resources.none())) {
+        continue;
+      }
+      orderedByPriority.add(tmpApp);
+    }
+    return orderedByPriority;
+  }
+}
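The arithmetic of computeAppsIdealAllocation can be traced on a one-dimensional example. The sketch below is not part of the patch; all quantities are invented and a double stands in for a Resource. It walks the two passes: ideal assignment from highest to lowest priority, then preemption from lowest to highest.

```java
public class IntraQueueMathDemo {
  public static void main(String[] args) {
    // Queue guarantee = 100, nothing preempted from it yet.
    double unAllocated = 100;                // tq.unAllocated
    double perRoundBudget = 80;              // totalPreemptedResourceAllowed
    double maxAllowablePreemptLimit = 0.5;   // fraction of the guarantee
    double preemptionLimit =
        Math.min(100 * maxAllowablePreemptLimit, perRoundBudget); // = 50

    // {used, pending, selected}, ordered from highest to lowest priority.
    double[][] apps = {
        { 20, 60, 0 },   // high-priority app with a large pending demand
        { 70, 0, 10 }    // low-priority app holding most of the queue
    };

    // Pass 1 (high -> low): ideal = min(unAllocated, used + pending - selected).
    double[] ideal = new double[apps.length];
    for (int i = 0; i < apps.length; i++) {
      double want = Math.max(apps[i][0] + apps[i][1] - apps[i][2], 0);
      ideal[i] = Math.min(unAllocated, want);
      unAllocated -= ideal[i];
    }

    // Pass 2 (low -> high):
    // toBePreempted = min(max(used - selected - ideal, 0), preemptionLimit).
    for (int i = apps.length - 1; i >= 0; i--) {
      double p = Math.min(
          Math.max(apps[i][0] - apps[i][2] - ideal[i], 0), preemptionLimit);
      preemptionLimit -= p;
      System.out.println("app" + i + ": ideal=" + ideal[i]
          + " toBePreempted=" + p);
    }
    // Prints: app1: ideal=20.0 toBePreempted=40.0
    //         app0: ideal=80.0 toBePreempted=0.0
  }
}
```

The high-priority app absorbs most of the queue's unallocated quota, so the low-priority app's ideal allocation shrinks and the difference between what it uses and what it ideally deserves becomes preemptable, capped by the configured limit.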
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java
index 36383502..9b462a1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java
@@ -52,6 +52,7 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -91,6 +92,9 @@
   private boolean observeOnly;
   private boolean lazyPreempionEnabled;
 
+  private float maxAllowablePreemptLimitForIntraQueue;
+  private float maxIgnoredOverCapacityForIntraQueue;
+
   // Pointer to other RM components
   private RMContext rmContext;
   private ResourceCalculator rc;
@@ -102,6 +106,8 @@
       new HashMap<>();
   private Map<String, Map<String, TempQueuePerPartition>> queueToPartitions =
       new HashMap<>();
+  private Map<String, LinkedHashSet<String>> partitionToUnderServedQueues =
+      new HashMap<String, LinkedHashSet<String>>();
   private List<PreemptionCandidatesSelector> candidatesSelectionPolicies =
       new ArrayList<>();
   private Set<String> allPartitions;
@@ -171,6 +177,14 @@ public void init(Configuration config, RMContext context,
         CapacitySchedulerConfiguration.LAZY_PREEMPTION_ENALBED,
         CapacitySchedulerConfiguration.DEFAULT_LAZY_PREEMPTION_ENABLED);
 
+    maxAllowablePreemptLimitForIntraQueue = csConfig.getFloat(
+        CapacitySchedulerConfiguration.MAX_ALLOWABLE_PREEMPTION_LIMIT_FOR_INTRA_QUEUE,
+        CapacitySchedulerConfiguration.DEFAULT_MAX_ALLOWABLE_PREEMPTION_LIMIT_FOR_INTRA_QUEUE);
+
+    maxIgnoredOverCapacityForIntraQueue = csConfig.getFloat(
+        CapacitySchedulerConfiguration.MAX_IGNORED_OVER_CAPACITY_FOR_INTRA_QUEUE,
+        CapacitySchedulerConfiguration.DEFAULT_MAX_IGNORED_OVER_CAPACITY_FOR_INTRA_QUEUE);
+
     rc = scheduler.getResourceCalculator();
     nlm = scheduler.getRMContext().getNodeLabelManager();
 
@@ -186,8 +200,16 @@ public void init(Configuration config, RMContext context,
     // initialize candidates preemption selection policies
     candidatesSelectionPolicies.add(
         new FifoCandidatesSelector(this));
+
+    // Check whether the intra-queue candidates selector should also run.
+    boolean selectIntraQueuePreemptCandidatesByPriority = csConfig.getBoolean(
+        CapacitySchedulerConfiguration.SELECT_CANDIDATES_FOR_INTRAQUEUE_PREEMPTION,
+        CapacitySchedulerConfiguration.DEFAULT_SELECT_CANDIDATES_FOR_INTRAQUEUE_PREEMPTION);
+    if (selectIntraQueuePreemptCandidatesByPriority) {
+      candidatesSelectionPolicies.add(new IntraQueueCandidatesSelector(this));
+    }
   }
-  
+
   @Override
   public ResourceCalculator getResourceCalculator() {
     return rc;
@@ -234,6 +256,8 @@ private void preemptOrkillSelectedContainerAfterWait(
         // not have to raise another event.
         continue;
       }
+
+      System.out.println("App:" + container.getApplicationAttemptId() + ", container id:" + container.getContainerId());
       //otherwise just send preemption events
       rmContext.getDispatcher().getEventHandler().handle(
           new ContainerPreemptEvent(appAttemptId, container,
@@ -542,4 +566,40 @@ public double getNaturalTerminationFactor() {
   Map<String, Map<String, TempQueuePerPartition>> getQueuePartitions() {
     return queueToPartitions;
   }
+
+  @Override
+  public int getClusterMaxApplicationPriority() {
+    return scheduler.getMaxClusterLevelAppPriority().getPriority();
+  }
+
+  @Override
+  public float getMaxAllowablePreemptLimitForIntraQueue() {
+    return maxAllowablePreemptLimitForIntraQueue;
+  }
+
+  @Override
+  public float getMaxIgnoredOverCapacityForIntraQueue() {
+    return maxIgnoredOverCapacityForIntraQueue;
+  }
+
+  @Override
+  public Resource getPartitionResource(String partition) {
+    return Resources.clone(nlm.getResourceByLabel(partition,
+        Resources.clone(scheduler.getClusterResource())));
+  }
+
+  public LinkedHashSet<String> getUnderServedQueuesPerPartition(String partition) {
+    return partitionToUnderServedQueues.get(partition);
+  }
+
+  public void addPartitionToUnderServedQueues(String queueName,
+      String partition) {
+    LinkedHashSet<String> underServedQueues = partitionToUnderServedQueues
+        .get(partition);
+    if (underServedQueues == null) {
+      underServedQueues = new LinkedHashSet<String>();
+      partitionToUnderServedQueues.put(partition, underServedQueues);
+    }
+    underServedQueues.add(queueName);
+  }
 }
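addPartitionToUnderServedQueues above lazily creates the per-partition LinkedHashSet with a get-then-put sequence, and the LinkedHashSet keeps queues in the order they were found under-served, which is what getUnderServedQueuesPerPartition hands back. For comparison, a self-contained sketch of the same idiom using Java 8's Map.computeIfAbsent; the class name is hypothetical, and the patch itself spells the idiom out, presumably for compatibility with older Java:

    import java.util.HashMap;
    import java.util.LinkedHashSet;
    import java.util.Map;

    public class UnderServedQueuesSketch {
      private final Map<String, LinkedHashSet<String>> partitionToUnderServedQueues =
          new HashMap<>();

      public void addPartitionToUnderServedQueues(String queueName,
          String partition) {
        // LinkedHashSet preserves the order in which queues were found
        // under-served for this partition.
        partitionToUnderServedQueues
            .computeIfAbsent(partition, k -> new LinkedHashSet<>())
            .add(queueName);
      }

      public static void main(String[] args) {
        UnderServedQueuesSketch s = new UnderServedQueuesSketch();
        s.addPartitionToUnderServedQueues("b", "");
        s.addPartitionToUnderServedQueues("a", "");
        System.out.println(s.partitionToUnderServedQueues); // prints {=[b, a]}
      }
    }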
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempAppPerQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempAppPerQueue.java
new file mode 100644
index 0000000..0191d7d
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempAppPerQueue.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.HashMap;
+
+/**
+ * Temporary data-structure tracking resource availability, pending resource
+ * need, and current utilization for an application.
+ */
+public class TempAppPerQueue {
+  // Following fields are copied from the scheduler
+  final String queueName;
+  final Resource pending;
+  final HashMap<String, Resource> pendingPerPartition;
+  private final HashMap<String, Resource> usedPerPartition;
+  private final Resource reserved;
+
+  // Following fields are set and used by the candidate selection policies
+  Resource idealAssigned;
+  Resource toBePreempted;
+  Resource selected;
+  private Resource actuallyToBePreempted;
+
+  private final int priority;
+  private final ApplicationId applicationId;
+  LeafQueue leafQueue;
+  FiCaSchedulerApp app;
+
+  TempAppPerQueue(String queueName, ApplicationId applicationId,
+      HashMap<String, Resource> usedPerPartition, Resource reserved,
+      Resource pending, HashMap<String, Resource> pendingPerPartition,
+      int priority, FiCaSchedulerApp app) {
+    this.queueName = queueName;
+    this.usedPerPartition = usedPerPartition;
+    this.pending = pending;
+    this.pendingPerPartition = pendingPerPartition;
+
+    this.idealAssigned = Resource.newInstance(0, 0);
+    this.actuallyToBePreempted = Resource.newInstance(0, 0);
+    this.toBePreempted = Resource.newInstance(0, 0);
+    this.selected = Resource.newInstance(0, 0);
+
+    this.reserved = reserved;
+    this.priority = priority;
+    this.applicationId = applicationId;
+    this.app = app;
+  }
+
+  public void setLeafQueue(LeafQueue l) {
+    this.leafQueue = l;
+  }
+
+  public Resource getUsed() {
+    return usedPerPartition.get("");
+  }
+
+  public Resource getUsed(String partition) {
+    return usedPerPartition.get(partition);
+  }
+
+  public HashMap<String, Resource> getPendingPerPartition() {
+    return pendingPerPartition;
+  }
+
+  public FiCaSchedulerApp getFiCaSchedulerApp() {
+    return app;
+  }
+
+  public Resource getGuaranteed() {
+    return Resources.none();
+  }
+
+  public void updateDemand(Resource demand, String partition) {
+    Resource perPartitionDemand = pendingPerPartition.get(partition);
+
+    if (perPartitionDemand == null) {
+      // Should not happen.
+      return;
+    }
+
+    Resources.subtractFrom(perPartitionDemand, demand);
+  }
+
+  public void assignPreemption(Resource killable) {
+    Resources.addTo(toBePreempted, killable);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(" NAME: ").append(getApplicationId())
+        .append(" CUR: ").append(getUsed())
+        .append(" PEN: ").append(pending).append(" RESERVED: ").append(reserved)
+        .append(" IDEAL_ASSIGNED: ").append(idealAssigned)
+        .append(" IDEAL_PREEMPT: ").append(toBePreempted)
+        .append(" ACTUAL_PREEMPT: ").append(actuallyToBePreempted).append("\n");
+
+    return sb.toString();
+  }
+
+  public Resource getActuallyToBePreempted() {
+    return actuallyToBePreempted;
+  }
+
+  public void setActuallyToBePreempted(Resource res) {
+    this.actuallyToBePreempted = res;
+  }
+
+  void appendLogString(StringBuilder sb) {
+    sb.append(queueName).append(", ").append(getUsed().getMemorySize())
+        .append(", ").append(getUsed().getVirtualCores()).append(", ")
+        .append(pending.getMemorySize()).append(", ")
+        .append(pending.getVirtualCores()).append(", ")
+        .append(getGuaranteed().getMemorySize()).append(", ")
+        .append(getGuaranteed().getVirtualCores()).append(", ")
+        .append(idealAssigned.getMemorySize()).append(", ")
+        .append(idealAssigned.getVirtualCores()).append(", ")
+        .append(toBePreempted.getMemorySize()).append(", ")
+        .append(toBePreempted.getVirtualCores()).append(", ")
+        .append(actuallyToBePreempted.getMemorySize()).append(", ")
+        .append(actuallyToBePreempted.getVirtualCores());
+  }
+
+  public int getPriority() {
+    return priority;
+  }
+
+  public ApplicationId getApplicationId() {
+    return applicationId;
+  }
+
+  public void deductActuallyToBePreempted(ResourceCalculator resourceCalculator,
+      Resource toBeDeduct, Resource cluster, String partition) {
+    Resource perPartitionPending = pendingPerPartition.get(partition);
+    if (Resources.greaterThan(resourceCalculator, cluster, perPartitionPending,
+        toBeDeduct)) {
+      Resources.subtractFrom(perPartitionPending, toBeDeduct);
+    }
+  }
+}
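deductActuallyToBePreempted above only subtracts when the tracked per-partition pending amount is strictly greater than the deduction, so the mutable Resource object can never go negative. A standalone illustration of that guarded-subtraction pattern, with invented numbers and a hypothetical class name:

    import org.apache.hadoop.yarn.api.records.Resource;
    import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
    import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
    import org.apache.hadoop.yarn.util.resource.Resources;

    public class GuardedDeductSketch {
      public static void main(String[] args) {
        ResourceCalculator rc = new DefaultResourceCalculator();
        Resource cluster = Resource.newInstance(100, 100);
        Resource pending = Resource.newInstance(8, 8);

        Resource small = Resource.newInstance(3, 3);
        if (Resources.greaterThan(rc, cluster, pending, small)) {
          Resources.subtractFrom(pending, small);   // 8 - 3 = 5
        }

        Resource tooBig = Resource.newInstance(9, 9);
        if (Resources.greaterThan(rc, cluster, pending, tooBig)) {
          Resources.subtractFrom(pending, tooBig);  // skipped: would go negative
        }

        System.out.println(pending); // <memory:5, vCores:5>
      }
    }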
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java
index 04ed135..0f98508 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java
@@ -25,6 +25,7 @@
 import org.apache.hadoop.yarn.util.resource.Resources;
 
 import java.util.ArrayList;
+import java.util.Collection;
 
 /**
  * Temporary data-structure tracking resource availability, pending resource
@@ -48,11 +49,15 @@
   Resource toBePreempted;
   Resource untouchableExtra;
   Resource preemptableExtra;
+  Resource unAllocated;
   private Resource actuallyToBePreempted;
+  Resource selectedContainers;
 
   double normalizedGuarantee;
+  boolean intraQueuePreemptionCalculationDone;
 
   final ArrayList<TempQueuePerPartition> children;
+  private Collection<TempAppPerQueue> apps;
   LeafQueue leafQueue;
 
   boolean preemptionDisabled;
@@ -77,6 +82,7 @@
     this.toBePreempted = Resource.newInstance(0, 0);
     this.normalizedGuarantee = Float.NaN;
     this.children = new ArrayList<>();
+    this.apps = new ArrayList<>();
     this.untouchableExtra = Resource.newInstance(0, 0);
     this.preemptableExtra = Resource.newInstance(0, 0);
     this.preemptionDisabled = preemptionDisabled;
@@ -86,6 +92,7 @@
     this.absMaxCapacity = absMaxCapacity;
     this.totalPartitionResource = totalPartitionResource;
     this.reserved = reserved;
+    this.intraQueuePreemptionCalculationDone = false;
   }
 
   public void setLeafQueue(LeafQueue l) {
@@ -247,6 +254,14 @@
   public void setActuallyToBePreempted(Resource res) {
     this.actuallyToBePreempted = res;
   }
 
+  public Resource getUnAllocated() {
+    return unAllocated;
+  }
+
+  public void setUnAllocated(Resource res) {
+    this.unAllocated = res;
+  }
+
   public void deductActuallyToBePreempted(ResourceCalculator rc,
       Resource cluster, Resource toBeDeduct) {
     if (Resources.greaterThan(rc, cluster, actuallyToBePreempted, toBeDeduct)) {
@@ -272,4 +287,11 @@ void appendLogString(StringBuilder sb) {
         .append(actuallyToBePreempted.getVirtualCores());
   }
 
+  public void addAllApps(Collection<TempAppPerQueue> orderedApps) {
+    this.apps = orderedApps;
+  }
+
+  public Collection<TempAppPerQueue> getApps() {
+    return apps;
+  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
index d5d1374..4bc12fc 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
@@ -1077,4 +1077,29 @@ public boolean getLazyPreemptionEnabled() {
       PREEMPTION_CONFIG_PREFIX + "select_based_on_reserved_containers";
   public static final boolean DEFAULT_PREEMPTION_SELECT_CANDIDATES_FOR_RESERVED_CONTAINERS =
       false;
+
+  /**
+   * For intra-queue preemption, the priority-based selector preempts
+   * containers of the lowest-priority apps in a queue to find resources for
+   * its higher-priority apps.
+   */
+  public static final String SELECT_CANDIDATES_FOR_INTRAQUEUE_PREEMPTION =
+      PREEMPTION_CONFIG_PREFIX +
+      "select_based_on_priority_of_applications";
+  public static final boolean DEFAULT_SELECT_CANDIDATES_FOR_INTRAQUEUE_PREEMPTION =
+      true; // Change to false later.
+
+  /**
+   * For intra-queue preemption, consider only queues above this used-capacity fraction.
+   */
+  public static final String MAX_IGNORED_OVER_CAPACITY_FOR_INTRA_QUEUE =
+      PREEMPTION_CONFIG_PREFIX + "max_ignored_over_capacity_for_intra_queue";
+  public static final float DEFAULT_MAX_IGNORED_OVER_CAPACITY_FOR_INTRA_QUEUE = 0.9f;
+
+  /**
+   * For intra-queue preemption, the maximum preemptable fraction per queue.
+   */
+  public static final String MAX_ALLOWABLE_PREEMPTION_LIMIT_FOR_INTRA_QUEUE =
+      PREEMPTION_CONFIG_PREFIX + "max_allowable_preempt_limit_for_intra_queue";
+  public static final float DEFAULT_MAX_ALLOWABLE_PREEMPTION_LIMIT_FOR_INTRA_QUEUE = 0.5f;
 }
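All three knobs hang off PREEMPTION_CONFIG_PREFIX, next to the existing preemption settings. Below is a sketch of how a test or setup path might set them programmatically; the class name is hypothetical, the values arbitrary, and the inline comments paraphrase the javadocs above:

    import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;

    public class IntraQueuePreemptionConfigSketch {
      public static void main(String[] args) {
        CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();

        // Enable the priority-based intra-queue candidates selector explicitly.
        conf.setBoolean(
            CapacitySchedulerConfiguration.SELECT_CANDIDATES_FOR_INTRAQUEUE_PREEMPTION,
            true);

        // Only queues above this used-capacity fraction are considered.
        conf.setFloat(
            CapacitySchedulerConfiguration.MAX_IGNORED_OVER_CAPACITY_FOR_INTRA_QUEUE,
            0.9f);

        // At most this fraction of a queue may be preempted per round.
        conf.setFloat(
            CapacitySchedulerConfiguration.MAX_ALLOWABLE_PREEMPTION_LIMIT_FOR_INTRA_QUEUE,
            0.5f);

        System.out.println(conf.getFloat(
            CapacitySchedulerConfiguration.MAX_ALLOWABLE_PREEMPTION_LIMIT_FOR_INTRA_QUEUE,
            0.0f)); // 0.5
      }
    }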
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
index 1ca69be..ae5e03c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
@@ -1839,6 +1839,17 @@ public void recoverContainer(Resource clusterResource,
         .getSchedulableEntities());
   }
 
+  /**
+   * Obtain a read-only copy of all applications in this queue.
+   */
+  public Collection<FiCaSchedulerApp> getAllApplications() {
+    Collection<FiCaSchedulerApp> apps = new ArrayList<>(
+        pendingOrderingPolicy.getSchedulableEntities());
+    apps.addAll(orderingPolicy.getSchedulableEntities());
+
+    return Collections.unmodifiableCollection(apps);
+  }
+
   // Consider the headroom for each user in the queue.
   // Total pending for the queue =
   // sum(for each user(min((user's headroom), sum(user's pending requests))))
@@ -1870,6 +1881,21 @@ public synchronized Resource getTotalPendingResourcesConsideringUserLimit(
     return pendingConsideringUserLimit;
   }
 
+  public synchronized Resource getUserLimitHeadRoomPerApp(FiCaSchedulerApp app,
+      Resource resources, String partition) {
+
+    String userName = app.getUser();
+    User user = getUser(userName);
+    Resource headroom = Resources.subtract(
+        computeUserLimit(app, resources, user, partition,
+            SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY),
+        user.getUsed(partition));
+    // Make sure headroom is not negative.
+    headroom = Resources.componentwiseMax(headroom, Resources.none());
+
+    return headroom;
+  }
+
   @Override
   public synchronized void collectSchedulerApplications(
       Collection<SchedulerApplication<FiCaSchedulerApp>> apps) {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
index 9c84a23..ab83e93 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica;
 
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -326,6 +327,23 @@ public synchronized Resource getTotalPendingRequests() {
     return ret;
   }
 
+  public synchronized Map<String, Resource> getTotalPendingRequestsPerPartition() {
+
+    Map<String, Resource> ret = new HashMap<String, Resource>();
+    Resource res = null;
+    for (SchedulerRequestKey key : appSchedulingInfo.getSchedulerKeys()) {
+      ResourceRequest rr = appSchedulingInfo.getResourceRequest(key, "*");
+      if ((res = ret.get(rr.getNodeLabelExpression())) == null) {
+        res = Resources.createResource(0, 0);
+        ret.put(rr.getNodeLabelExpression(), res);
+      }
+
+      Resources.addTo(res,
+          Resources.multiply(rr.getCapability(), rr.getNumContainers()));
+    }
+    return ret;
+  }
+
   public synchronized void markContainerForPreemption(ContainerId cont) {
     // ignore already completed containers
     if (liveContainers.containsKey(cont)) {
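getTotalPendingRequestsPerPartition folds every outstanding ANY ("*") request into a per-node-label total of capability multiplied by numContainers. The same aggregation can be exercised outside the scheduler with hand-built ResourceRequests; the class name and the "gpu" label below are invented:

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.hadoop.yarn.api.records.Priority;
    import org.apache.hadoop.yarn.api.records.Resource;
    import org.apache.hadoop.yarn.api.records.ResourceRequest;
    import org.apache.hadoop.yarn.util.resource.Resources;

    public class PendingPerPartitionSketch {
      public static void main(String[] args) {
        ResourceRequest rr1 = ResourceRequest.newInstance(
            Priority.newInstance(1), "*", Resource.newInstance(1024, 1), 4);
        rr1.setNodeLabelExpression("");
        ResourceRequest rr2 = ResourceRequest.newInstance(
            Priority.newInstance(2), "*", Resource.newInstance(2048, 2), 2);
        rr2.setNodeLabelExpression("gpu");

        Map<String, Resource> pendingPerLabel = new HashMap<>();
        for (ResourceRequest rr : Arrays.asList(rr1, rr2)) {
          Resource res = pendingPerLabel.get(rr.getNodeLabelExpression());
          if (res == null) {
            res = Resources.createResource(0, 0);
            pendingPerLabel.put(rr.getNodeLabelExpression(), res);
          }
          // pending for this label += capability * numContainers
          Resources.addTo(res,
              Resources.multiply(rr.getCapability(), rr.getNumContainers()));
        }

        // {=<memory:4096, vCores:4>, gpu=<memory:4096, vCores:4>}
        System.out.println(pendingPerLabel);
      }
    }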
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java
index 3d3f1ea..60b351d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java
@@ -287,6 +287,7 @@ private void mockApplications(String appsConfig) {
       ApplicationId appId = ApplicationId.newInstance(0L, id);
       ApplicationAttemptId appAttemptId =
           ApplicationAttemptId.newInstance(appId, 1);
+      System.out.println("queue: " + queueName + ", appId:" + appId);
 
       mockContainers(strs[1], appAttemptId, queueName, reservedContainers,
           liveContainers);
@@ -436,7 +437,14 @@ private ParentQueue mockQueueHierarchy(String queueExprs) {
         new Comparator<FiCaSchedulerApp>() {
           @Override
           public int compare(FiCaSchedulerApp a1, FiCaSchedulerApp a2) {
-            return a1.getApplicationId().compareTo(a2.getApplicationId());
+            if (a1.getPriority() != null
+                && !a1.getPriority().equals(a2.getPriority())) {
+              return a1.getPriority().compareTo(a2.getPriority());
+            }
+
+            // Fall back to the attempt id for equal or unset priorities.
+            return a1.getApplicationAttemptId()
+                .compareTo(a2.getApplicationAttemptId());
           }
         });
     when(leafQueue.getApplications()).thenReturn(apps);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyIntraQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyIntraQueue.java
new file mode 100644
index 0000000..48dae07
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyIntraQueue.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+
+import static org.mockito.Matchers.argThat;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+public class TestProportionalCapacityPreemptionPolicyIntraQueue
+    extends ProportionalCapacityPreemptionPolicyMockFramework {
+  @Before
+  public void setup() {
+    super.setup();
+    policy = new ProportionalCapacityPreemptionPolicy(rmContext, cs, mClock);
+  }
+
+  @Test
+  public void testSimpleIntraQueuePreemption() throws IOException {
+    /**
+     * The simplest intra-queue preemption test. Queue structure is:
+     *
+     * <pre>
+     *        root
+     *       / | | \
+     *      a  b c  d
+     * </pre>
+     *
+     * Guaranteed capacities of a/b/c/d are 10:40:20:30, and the total cluster
+     * resource is 100.
+     * Queue "b" runs apps at priorities 4, 4, 5 and 6 and still has pending
+     * demand, so the intra-queue selector should preempt containers from the
+     * lowest-priority apps in "b" to serve its higher-priority apps.
+     */
+    String labelsConfig =
+        "=100,true;";
+    String nodesConfig = // n1 has no label
+        "n1= res=100";
+    String queuesConfig =
+        // guaranteed,max,used,pending,reserved
+        "root(=[100 100 80 120 0]);" + // root
+        "-a(=[10 100 10 50 0]);" + // a
+        "-b(=[40 100 40 60 0]);" + // b
+        "-c(=[20 100 10 10 0]);" + // c
+        "-d(=[30 100 20 0 0])"; // d
+
+    String appsConfig =
+        // queueName\t(priority,resource,host,expression,#repeat,reserved)
+        "a\t" // app1 in a
+        + "(1,1,n1,,5,false);" +
+        "a\t" // app2 in a
+        + "(1,1,n1,,5,false);" +
+        "b\t" // app3 in b
+        + "(4,1,n1,,34,false);" +
+        "b\t" // app4 in b
+        + "(4,1,n1,,2,false);" +
+        "b\t" // app5 in b
+        + "(5,1,n1,,1,false);" +
+        "b\t" // app6 in b
+        + "(6,1,n1,,1,false);" +
+        "c\t" // app7 in c
+        + "(1,1,n1,,10,false);" +
+        "d\t" // app8 in d
+        + "(1,1,n1,,20,false)";
+
+    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
+    policy.editSchedule();
+
+    // Expected: preemption hits the lowest-priority apps in queue b first;
+    // the exact assertions below are still commented out.
+//    verify(mDisp, times(5)).handle(argThat(
+//        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+//            getAppAttemptId(1))));
+//    verify(mDisp, times(20)).handle(argThat(
+//        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+//            getAppAttemptId(3))));
+  }
+}
-- 
2.7.4 (Apple Git-66)
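A note on the appsConfig mini-DSL used in the test above (field order per the inline comment: priority, resource, host, expression, #repeat, reserved): an entry such as

    b\t(4,1,n1,,34,false)

declares an app in queue b running 34 containers of size 1 on node n1 at priority 4, with an empty node-label expression and no reserved containers.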