diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java index f1b4f07..01cad88 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java @@ -51,7 +51,7 @@ // {"default-rack", "hostFoo"} or "coreSwitchA/TORSwitchB", "hostBar" public static String[] getRackHostName(String hostname) { NodeBase node = new NodeBase(hostname); - return new String[] {node.getNetworkLocation().substring(1), + return new String[] {"/" + node.getNetworkLocation().substring(1), node.getName()}; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/resource/Resources.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/resource/Resources.java index 760b0ea..1ba7192 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/resource/Resources.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/resource/Resources.java @@ -162,6 +162,18 @@ public static Resource add(Resource lhs, Resource rhs) { return addTo(clone(lhs), rhs); } + public static Resource addAll(Resource[] res) { + if (res == null || res.length == 0) { + return Resources.none(); + } + + Resource total = Resources.createResource(0); + for (Resource r : res) { + Resources.addTo(total, r); + } + return total; + } + public static Resource subtractFrom(Resource lhs, Resource rhs) { lhs.setMemorySize(lhs.getMemorySize() - rhs.getMemorySize()); lhs.setVirtualCores(lhs.getVirtualCores() - rhs.getVirtualCores()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java index 9df395d..e1018d3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java @@ -98,7 +98,7 @@ // go through all ignore-partition-exclusivity containers first to make // sure such containers will be preemptionCandidates first Map<String, TreeSet<RMContainer>> ignorePartitionExclusivityContainers = - leafQueue.getIgnoreExclusivityRMContainers(); + leafQueue.getCopyOfIgnoreExclusivityRMContainers(); for (String partition : resToObtainByPartition.keySet()) { if (ignorePartitionExclusivityContainers.containsKey(partition)) { TreeSet<RMContainer> rmContainers = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainer.java index e5d1208..53ec5f8 100644 ---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainer.java @@ -25,6 +25,7 @@ import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerReport; import org.apache.hadoop.yarn.api.records.ContainerState; +import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ExecutionType; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; @@ -54,6 +55,12 @@ Resource getReservedResource(); + /* + * Return the reserved resource for reserved containers, and the allocated + * resource for other containers. + */ + Resource getAllocatedOrReservedResource(); + NodeId getReservedNode(); SchedulerRequestKey getReservedSchedulerKey(); @@ -105,4 +112,8 @@ * @return If the container was allocated remotely. */ boolean isRemotelyAllocated(); + + ContainerStatus getFinishedStatus(); + + NodeId getNodeId(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java index 706821e..ac96393 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java @@ -161,7 +161,6 @@ RMContainerEvent> stateMachine; private final ReadLock readLock; private final WriteLock writeLock; - private final ContainerId containerId; private final ApplicationAttemptId appAttemptId; private final NodeId nodeId; private final Container container; @@ -224,7 +223,6 @@ public RMContainerImpl(Container container, RMContext rmContext, long creationTime, String nodeLabelExpression, boolean isExternallyAllocated) { this.stateMachine = stateMachineFactory.make(this); - this.containerId = container.getId(); this.nodeId = nodeId; this.container = container; this.allocatedSchedulerKey = SchedulerRequestKey.extractFrom(container); @@ -263,7 +261,7 @@ public RMContainerImpl(Container container, @Override public ContainerId getContainerId() { - return this.containerId; + return this.container.getId(); } @Override @@ -374,7 +372,7 @@ public String getLogURL() { logURL.append(WebAppUtils.getHttpSchemePrefix(rmContext .getYarnConfiguration())); logURL.append(WebAppUtils.getRunningLogURL( - container.getNodeHttpAddress(), containerId.toString(), + container.getNodeHttpAddress(), getContainerId().toString(), user)); return logURL.toString(); } finally { @@ -431,7 +429,12 @@ public void setResourceRequests(List<ResourceRequest> requests) { @Override public String toString() { - return containerId.toString(); + Container c = getContainer(); + if (null != c && null != c.getId()) { + return c.getId().toString(); + } else { + return "container-id-not-set"; + } } @Override @@ -476,7 +479,7 @@ public void handle(RMContainerEvent event) { } catch
(InvalidStateTransitionException e) { LOG.error("Can't handle this event at current state", e); LOG.error("Invalid event " + event.getType() + - " on container " + this.containerId); + " on container " + this.getContainerId()); } if (oldState != getState()) { LOG.info(event.getContainerId() + " Container Transitioned from " @@ -488,7 +491,8 @@ public void handle(RMContainerEvent event) { writeLock.unlock(); } } - + + @Override public ContainerStatus getFinishedStatus() { return finishedStatus; } @@ -517,7 +521,7 @@ public RMContainerState transition(RMContainerImpl container, report.getContainerExitStatus()); new FinishedTransition().transition(container, - new RMContainerFinishedEvent(container.containerId, status, + new RMContainerFinishedEvent(container.getContainerId(), status, RMContainerEventType.FINISHED)); return RMContainerState.COMPLETED; } else if (report.getContainerState().equals(ContainerState.RUNNING)) { @@ -654,11 +658,11 @@ public void transition(RMContainerImpl container, RMContainerEvent event) { } else { // Something wrong happened, kill the container LOG.warn("Something wrong happened, container size reported by NM" - + " is not expected, ContainerID=" + container.containerId + + " is not expected, ContainerID=" + container.getContainerId() + " rm-size-resource:" + rmContainerResource + " nm-size-reosurce:" + nmContainerResource); container.eventHandler.handle(new RMNodeCleanContainerEvent( - container.nodeId, container.containerId)); + container.nodeId, container.getContainerId())); } } @@ -761,7 +765,7 @@ public void transition(RMContainerImpl container, RMContainerEvent event) { // Inform node container.eventHandler.handle(new RMNodeCleanContainerEvent( - container.nodeId, container.containerId)); + container.nodeId, container.getContainerId())); // Inform appAttempt super.transition(container, event); @@ -831,8 +835,8 @@ public int hashCode() { @Override public int compareTo(RMContainer o) { - if (containerId != null && o.getContainerId() != null) { - return containerId.compareTo(o.getContainerId()); + if (getContainerId() != null && o.getContainerId() != null) { + return getContainerId().compareTo(o.getContainerId()); } return -1; } @@ -865,4 +869,23 @@ public ExecutionType getExecutionType() { public boolean isRemotelyAllocated() { return isExternallyAllocated; } + + @Override + public Resource getAllocatedOrReservedResource() { + try { + readLock.lock(); + if (getState().equals(RMContainerState.RESERVED)) { + return getReservedResource(); + } else { + return getAllocatedResource(); + } + } finally { + readLock.unlock(); + } + } + + @Override + public NodeId getNodeId() { + return nodeId; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index 755defd..3b9029b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -28,6 +28,7 @@ import java.util.Timer; import java.util.TimerTask; import 
java.util.concurrent.ConcurrentMap; +import java.util.concurrent.locks.Lock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -136,7 +137,6 @@ public void serviceInit(Configuration conf) throws Exception { super.serviceInit(conf); } - @VisibleForTesting public ClusterNodeTracker<N> getNodeTracker() { return nodeTracker; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java index c677345..1c37a01 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java @@ -598,6 +598,67 @@ public synchronized void decreaseContainer( appResourceUsage.decUsed(decreaseRequest.getNodePartition(), absDelta); } + public synchronized void updateMetricsForAllocatedContainer( + ResourceRequest request, NodeType type, Container containerAllocated) { + QueueMetrics metrics = queue.getMetrics(); + if (pending) { + // once an allocation is done we assume the application is + // running from scheduler's POV.
+ pending = false; + metrics.runAppAttempt(applicationId, user); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("allocate: applicationId=" + applicationId + " container=" + containerAllocated.getId() + " host=" + containerAllocated .getNodeId().toString() + " user=" + user + " resource=" + request .getCapability() + " type=" + type); + } + metrics.allocateResources(user, 1, request.getCapability(), true); + metrics.incrNodeTypeAggregations(user, type); + } + + /* + * In an async environment, the pending resource request could be updated + * during scheduling; this method checks the pending request before allocating. + */ + public synchronized boolean checkAllocation(NodeType type, + SchedulerNode node, SchedulerRequestKey schedulerKey) { + ResourceRequest r = resourceRequestMap.get(schedulerKey).get( + ResourceRequest.ANY); + if (r == null || r.getNumContainers() <= 0) { + return false; + } + if (type == NodeType.RACK_LOCAL || type == NodeType.NODE_LOCAL) { + r = resourceRequestMap.get(schedulerKey).get(node.getRackName()); + if (r == null || r.getNumContainers() <= 0) { + return false; + } + if (type == NodeType.NODE_LOCAL) { + r = resourceRequestMap.get(schedulerKey).get(node.getNodeName()); + if (r == null || r.getNumContainers() <= 0) { + return false; + } + } + } + + return true; + } + + public synchronized List<ResourceRequest> allocate(NodeType type, + SchedulerNode node, SchedulerRequestKey schedulerKey, + Container containerAllocated) { + ResourceRequest request; + if (type == NodeType.NODE_LOCAL) { + request = resourceRequestMap.get(schedulerKey).get(node.getNodeName()); + } else if (type == NodeType.RACK_LOCAL) { + request = resourceRequestMap.get(schedulerKey).get(node.getRackName()); + } else { + request = resourceRequestMap.get(schedulerKey).get(ResourceRequest.ANY); + } + return allocate(type, node, schedulerKey, request, containerAllocated); + } + /** * Resources have been allocated to this application by the resource * scheduler. Track them. @@ -619,24 +680,10 @@ public synchronized void decreaseContainer( } else { allocateOffSwitch(request, resourceRequests); } - QueueMetrics metrics = queue.getMetrics(); - if (pending) { - // once an allocation is done we assume the application is - // running from scheduler's POV.
- pending = false; - metrics.runAppAttempt(applicationId, user); - } - if (LOG.isDebugEnabled()) { - LOG.debug("allocate: applicationId=" + applicationId - + " container=" + containerAllocated.getId() - + " host=" + containerAllocated.getNodeId().toString() - + " user=" + user - + " resource=" + request.getCapability() - + " type=" + type); + if (null != containerAllocated) { + updateMetricsForAllocatedContainer(request, type, containerAllocated); } - metrics.allocateResources(user, 1, request.getCapability(), true); - metrics.incrNodeTypeAggregations(user, type); return resourceRequests; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java index e487f69..accb91d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java @@ -48,8 +48,8 @@ private static final Log LOG = LogFactory.getLog(ClusterNodeTracker.class); private ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true); - private Lock readLock = readWriteLock.readLock(); - private Lock writeLock = readWriteLock.writeLock(); + private volatile Lock readLock = readWriteLock.readLock(); + private volatile Lock writeLock = readWriteLock.writeLock(); private HashMap<NodeId, N> nodes = new HashMap<>(); private Map<String, N> nodeNameToNodeMap = new HashMap<>(); @@ -65,9 +65,28 @@ private boolean forceConfiguredMaxAllocation = true; private long configuredMaxAllocationWaitTime; + // Version of the node list; it is increased whenever a node is added or + // removed. It starts from zero and is reset to 0 when an increment would + // go beyond Long.MAX_VALUE. + private volatile long nodeListVersion = 0; + + private void updateNodeListVersion() { + if (nodeListVersion == Long.MAX_VALUE) { + nodeListVersion = 0; + } else { + nodeListVersion++; + } + } + + public long getNodeListVersion() { + return nodeListVersion; + } + public void addNode(N node) { writeLock.lock(); try { + updateNodeListVersion(); + nodes.put(node.getNodeID(), node); nodeNameToNodeMap.put(node.getNodeName(), node); @@ -90,6 +109,14 @@ public void addNode(N node) { } } + /* + * Sometimes the read lock can be used by an external component to do + * fine-grained locking.
+ */ + public Lock getNodeListReadLock() { + return readLock; + } + public boolean exists(NodeId nodeId) { readLock.lock(); try { @@ -159,6 +186,8 @@ public N removeNode(NodeId nodeId) { LOG.warn("Attempting to remove a non-existent node " + nodeId); return null; } + + updateNodeListVersion(); nodeNameToNodeMap.remove(node.getNodeName()); // Update nodes per rack as well diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java index c4b32a8..e5c1ba4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java @@ -26,14 +26,16 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import com.google.common.collect.ConcurrentHashMultiset; import org.apache.commons.lang.time.DateUtils; import org.apache.commons.lang.time.FastDateFormat; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; -import org.apache.hadoop.classification.InterfaceStability.Stable; import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; @@ -53,7 +55,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.AggregateAppResourceUsage; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerChangeResourceEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEvent; @@ -71,7 +72,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; -import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; /** @@ -90,21 +90,22 @@ FastDateFormat.getInstance("EEE MMM dd HH:mm:ss Z yyyy"); private static final long MEM_AGGREGATE_ALLOCATION_CACHE_MSECS = 3000; - protected long lastMemoryAggregateAllocationUpdateTime = 0; + protected long lastResAllocationUpdateTime = 0; private long lastMemorySeconds = 0; private long lastVcoreSeconds = 0; + private volatile Object resAggregationUsageUpdateLock = new Object(); protected final AppSchedulingInfo appSchedulingInfo; protected ApplicationAttemptId attemptId; protected Map<ContainerId, RMContainer> liveContainers = - new HashMap<ContainerId, RMContainer>(); + new ConcurrentHashMap<>(); protected final Map<SchedulerRequestKey, Map<NodeId, RMContainer>> reservedContainers = new HashMap<>(); private final Multiset<SchedulerRequestKey> reReservations = - HashMultiset.create(); + ConcurrentHashMultiset.create(); -
private Resource resourceLimit = Resource.newInstance(0, 0); + private volatile Resource resourceLimit = Resource.newInstance(0, 0); private boolean unmanagedAM = true; private boolean amRunning = false; private LogAggregationContext logAggregationContext; @@ -138,7 +139,8 @@ * the application successfully schedules a task (at rack or node local), it * is reset to 0. */ - Multiset<SchedulerRequestKey> schedulingOpportunities = HashMultiset.create(); + ConcurrentHashMultiset<SchedulerRequestKey> schedulingOpportunities = + ConcurrentHashMultiset.create(); /** * Count how many times the application has been given an opportunity to * @@ -147,15 +149,16 @@ * incremented, and each time the application successfully schedules a task, * it is reset to 0 when schedule any task at corresponding priority. */ - Multiset<SchedulerRequestKey> missedNonPartitionedReqSchedulingOpportunity = - HashMultiset.create(); + ConcurrentHashMultiset<SchedulerRequestKey> + missedNonPartitionedReqSchedulingOpportunity = + ConcurrentHashMultiset.create(); // Time of the last container scheduled at the current allowed level protected Map<SchedulerRequestKey, Long> lastScheduledContainer = - new HashMap<>(); + new ConcurrentHashMap<>(); protected Queue queue; - protected boolean isStopped = false; + protected volatile boolean isStopped = false; protected String appAMNodePartitionName = CommonNodeLabelsManager.NO_LABEL; @@ -163,6 +166,9 @@ private RMAppAttempt appAttempt; + protected final ReentrantReadWriteLock.ReadLock readLock; + protected final ReentrantReadWriteLock.WriteLock writeLock; + public SchedulerApplicationAttempt(ApplicationAttemptId applicationAttemptId, String user, Queue queue, ActiveUsersManager activeUsersManager, RMContext rmContext) { @@ -188,14 +194,23 @@ public SchedulerApplicationAttempt(ApplicationAttemptId applicationAttemptId, appSubmissionContext.getLogAggregationContext(); } } + + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + readLock = lock.readLock(); + writeLock = lock.writeLock(); } /** * Get the live containers of the application. * @return live containers of the application */ - public synchronized Collection<RMContainer> getLiveContainers() { - return new ArrayList<RMContainer>(liveContainers.values()); + public Collection<RMContainer> getLiveContainers() { + try { + readLock.lock(); + return new ArrayList<>(liveContainers.values()); + } finally { + readLock.unlock(); + } } public AppSchedulingInfo getAppSchedulingInfo() { @@ -243,19 +258,19 @@ public long getNewContainerId() { return appSchedulingInfo.getSchedulerKeys(); } - public synchronized ResourceRequest getResourceRequest( + public ResourceRequest getResourceRequest( SchedulerRequestKey schedulerKey, String resourceName) { return appSchedulingInfo.getResourceRequest(schedulerKey, resourceName); } - public synchronized int getTotalRequiredResources( + public int getTotalRequiredResources( SchedulerRequestKey schedulerKey) { ResourceRequest request = getResourceRequest(schedulerKey, ResourceRequest.ANY); return request == null ?
0 : request.getNumContainers(); } - public synchronized Resource getResource(SchedulerRequestKey schedulerKey) { + public Resource getResource(SchedulerRequestKey schedulerKey) { return appSchedulingInfo.getResource(schedulerKey); } @@ -291,49 +306,57 @@ public boolean getUnmanagedAM() { return unmanagedAM; } - public synchronized RMContainer getRMContainer(ContainerId id) { + public RMContainer getRMContainer(ContainerId id) { return liveContainers.get(id); } - public synchronized void addRMContainer( + public void addRMContainer( ContainerId id, RMContainer rmContainer) { - liveContainers.put(id, rmContainer); - if (rmContainer.isRemotelyAllocated()) { - this.attemptResourceUsageAllocatedRemotely.incUsed( - rmContainer.getAllocatedResource()); + try { + writeLock.lock(); + liveContainers.put(id, rmContainer); + if (rmContainer.isRemotelyAllocated()) { + this.attemptResourceUsageAllocatedRemotely.incUsed( + rmContainer.getAllocatedResource()); + } + } + finally { + writeLock.unlock(); } } - public synchronized void removeRMContainer(ContainerId containerId) { - RMContainer rmContainer = liveContainers.remove(containerId); - if (rmContainer != null && rmContainer.isRemotelyAllocated()) { - this.attemptResourceUsageAllocatedRemotely.decUsed( - rmContainer.getAllocatedResource()); + public void removeRMContainer(ContainerId containerId) { + try { + writeLock.lock(); + RMContainer rmContainer = liveContainers.remove(containerId); + if (rmContainer != null && rmContainer.isRemotelyAllocated()) { + this.attemptResourceUsageAllocatedRemotely.decUsed( + rmContainer.getAllocatedResource()); + } + } finally { + writeLock.unlock(); } } - protected synchronized void resetReReservations( + protected void resetReReservations( SchedulerRequestKey schedulerKey) { reReservations.setCount(schedulerKey, 0); } - protected synchronized void addReReservation( + protected void addReReservation( SchedulerRequestKey schedulerKey) { reReservations.add(schedulerKey); } - public synchronized int getReReservations(SchedulerRequestKey schedulerKey) { + public int getReReservations(SchedulerRequestKey schedulerKey) { return reReservations.count(schedulerKey); } /** * Get total current reservations. 
- * Used only by unit tests * @return total current reservations */ - @Stable - @Private - public synchronized Resource getCurrentReservation() { + public Resource getCurrentReservation() { return attemptResourceUsage.getReserved(); } @@ -341,28 +364,43 @@ public Queue getQueue() { return queue; } - public synchronized boolean updateResourceRequests( + public boolean updateResourceRequests( List<ResourceRequest> requests) { - if (!isStopped) { - return appSchedulingInfo.updateResourceRequests(requests, false); + try { + writeLock.lock(); + if (!isStopped) { + return appSchedulingInfo.updateResourceRequests(requests, false); + } + return false; + } finally { + writeLock.unlock(); } - return false; } - public synchronized void recoverResourceRequestsForContainer( + public void recoverResourceRequestsForContainer( List<ResourceRequest> requests) { - if (!isStopped) { - appSchedulingInfo.updateResourceRequests(requests, true); + try { + writeLock.lock(); + if (!isStopped) { + appSchedulingInfo.updateResourceRequests(requests, true); + } + } finally { + writeLock.unlock(); } } - public synchronized void stop(RMAppAttemptState rmAppAttemptFinalState) { - // Cleanup all scheduling information - isStopped = true; - appSchedulingInfo.stop(); + public void stop() { + try { + writeLock.lock(); + // Cleanup all scheduling information + isStopped = true; + appSchedulingInfo.stop(); + } finally { + writeLock.unlock(); + } } - public synchronized boolean isStopped() { + public boolean isStopped() { return isStopped; } @@ -370,29 +408,38 @@ public synchronized boolean isStopped() { * Get the list of reserved containers * @return All of the reserved containers. */ - public synchronized List<RMContainer> getReservedContainers() { - List<RMContainer> reservedContainers = new ArrayList<RMContainer>(); - for (Map.Entry<SchedulerRequestKey, Map<NodeId, RMContainer>> e : - this.reservedContainers.entrySet()) { - reservedContainers.addAll(e.getValue().values()); + public List<RMContainer> getReservedContainers() { + try { + readLock.lock(); + List<RMContainer> reservedContainers = new ArrayList<RMContainer>(); + for (Map.Entry<SchedulerRequestKey, Map<NodeId, RMContainer>> e : this.reservedContainers .entrySet()) { + reservedContainers.addAll(e.getValue().values()); + } + return reservedContainers; + } finally { + readLock.unlock(); } - return reservedContainers; } - public synchronized boolean reserveIncreasedContainer(SchedulerNode node, + public boolean reserveIncreasedContainer(SchedulerNode node, SchedulerRequestKey schedulerKey, RMContainer rmContainer, Resource reservedResource) { - if (commonReserve(node, schedulerKey, rmContainer, reservedResource)) { - attemptResourceUsage.incReserved(node.getPartition(), - reservedResource); - // succeeded - return true; + try { + writeLock.lock(); + if (commonReserve(node, schedulerKey, rmContainer, reservedResource)) { + attemptResourceUsage.incReserved(node.getPartition(), reservedResource); + // succeeded + return true; + } + + return false; + } finally { + writeLock.unlock(); } - - return false; } - private synchronized boolean commonReserve(SchedulerNode node, + private boolean commonReserve(SchedulerNode node, SchedulerRequestKey schedulerKey, RMContainer rmContainer, Resource reservedResource) { try { @@ -423,48 +470,35 @@ private synchronized boolean commonReserve(SchedulerNode node, return true; } - public synchronized RMContainer reserve(SchedulerNode node, + public RMContainer reserve(SchedulerNode node, SchedulerRequestKey schedulerKey, RMContainer rmContainer, Container container) { - // Create RMContainer if necessary - if (rmContainer == null) { - rmContainer = - new RMContainerImpl(container, getApplicationAttemptId(), - node.getNodeID(),
appSchedulingInfo.getUser(), rmContext); - attemptResourceUsage.incReserved(node.getPartition(), - container.getResource()); - ((RMContainerImpl)rmContainer).setQueueName(this.getQueueName()); - - // Reset the re-reservation count - resetReReservations(schedulerKey); - } else { - // Note down the re-reservation - addReReservation(schedulerKey); - } - - commonReserve(node, schedulerKey, rmContainer, container.getResource()); + try { + writeLock.lock(); - return rmContainer; - } - - /** - * Has the application reserved the given node at the - * given priority? - * @param node node to be checked - * @param schedulerKey scheduler key of reserved container - * @return true is reserved, false if not - */ - public synchronized boolean isReserved(SchedulerNode node, - SchedulerRequestKey schedulerKey) { - Map<NodeId, RMContainer> reservedContainers = - this.reservedContainers.get(schedulerKey); - if (reservedContainers != null) { - return reservedContainers.containsKey(node.getNodeID()); + // Create RMContainer if necessary + if (rmContainer == null) { + rmContainer = new RMContainerImpl(container, getApplicationAttemptId(), + node.getNodeID(), appSchedulingInfo.getUser(), rmContext); + attemptResourceUsage.incReserved(node.getPartition(), container.getResource()); + ((RMContainerImpl) rmContainer).setQueueName(this.getQueueName()); + + // Reset the re-reservation count + resetReReservations(schedulerKey); + } else { + // Note down the re-reservation + addReReservation(schedulerKey); + } + + commonReserve(node, schedulerKey, rmContainer, container.getResource()); + + return rmContainer; + } finally { + writeLock.unlock(); } - return false; } - public synchronized void setHeadroom(Resource globalLimit) { + public void setHeadroom(Resource globalLimit) { this.resourceLimit = globalLimit; } @@ -472,53 +506,66 @@ public synchronized void setHeadroom(Resource globalLimit) { * Get available headroom in terms of resources for the application's user. * @return available resource headroom */ - public synchronized Resource getHeadroom() { - // Corner case to deal with applications being slightly over-limit - if (resourceLimit.getMemorySize() < 0) { - resourceLimit.setMemorySize(0); + public Resource getHeadroom() { + Resource copy = Resources.clone(resourceLimit); + if (copy.getMemorySize() < 0) { + copy.setMemorySize(0); } - - return resourceLimit; + return copy; } - public synchronized int getNumReservedContainers( + public int getNumReservedContainers( SchedulerRequestKey schedulerKey) { - Map<NodeId, RMContainer> reservedContainers = - this.reservedContainers.get(schedulerKey); - return (reservedContainers == null) ? 0 : reservedContainers.size(); + try { + readLock.lock(); + Map<NodeId, RMContainer> reservedContainers = this.reservedContainers.get( + schedulerKey); + return (reservedContainers == null) ? 0 : reservedContainers.size(); + } finally { + readLock.unlock(); + } } @SuppressWarnings("unchecked") - public synchronized void containerLaunchedOnNode(ContainerId containerId, + public void containerLaunchedOnNode(ContainerId containerId, NodeId nodeId) { - // Inform the container - RMContainer rmContainer = getRMContainer(containerId); - if (rmContainer == null) { - // Some unknown container sneaked into the system. Kill it. - rmContext.getDispatcher().getEventHandler() - .handle(new RMNodeCleanContainerEvent(nodeId, containerId)); - return; - } + try { + readLock.lock(); + // Inform the container + RMContainer rmContainer = getRMContainer(containerId); + if (rmContainer == null) { + // Some unknown container sneaked into the system. Kill it.
+ rmContext.getDispatcher().getEventHandler().handle( + new RMNodeCleanContainerEvent(nodeId, containerId)); + return; + } - rmContainer.handle(new RMContainerEvent(containerId, - RMContainerEventType.LAUNCHED)); + rmContainer.handle( + new RMContainerEvent(containerId, RMContainerEventType.LAUNCHED)); + } + finally { + readLock.unlock(); + } } - public synchronized void showRequests() { - if (LOG.isDebugEnabled()) { - for (SchedulerRequestKey schedulerKey : getSchedulerKeys()) { - Map<String, ResourceRequest> requests = - getResourceRequests(schedulerKey); - if (requests != null) { - LOG.debug("showRequests:" + " application=" + getApplicationId() - + " headRoom=" + getHeadroom() + " currentConsumption=" - + attemptResourceUsage.getUsed().getMemorySize()); - for (ResourceRequest request : requests.values()) { - LOG.debug("showRequests:" + " application=" + getApplicationId() - + " request=" + request); + public void showRequests() { + try { + readLock.lock(); + if (LOG.isDebugEnabled()) { + for (SchedulerRequestKey schedulerKey : getSchedulerKeys()) { + Map<String, ResourceRequest> requests = getResourceRequests( + schedulerKey); + if (requests != null) { + LOG.debug("showRequests:" + " application=" + getApplicationId() + " headRoom=" + getHeadroom() + " currentConsumption=" + + attemptResourceUsage.getUsed().getMemorySize()); + for (ResourceRequest request : requests.values()) { + LOG.debug("showRequests:" + " application=" + getApplicationId() + " request=" + request); + } } } } + } finally { + readLock.unlock(); } } @@ -568,75 +615,95 @@ private Container updateContainerAndNMToken(RMContainer rmContainer, // Create container token and update NMToken altogether, if either of them fails for // some reason like DNS unavailable, do not return this container and keep it // in the newlyAllocatedContainers waiting to be refetched. - public synchronized List<Container> pullNewlyAllocatedContainers() { - List<Container> returnContainerList = - new ArrayList<Container>(newlyAllocatedContainers.size()); - for (Iterator<RMContainer> i = newlyAllocatedContainers.iterator(); i - .hasNext();) { - RMContainer rmContainer = i.next(); - Container updatedContainer = - updateContainerAndNMToken(rmContainer, true, false); - // Only add container to return list when it's not null. updatedContainer - // could be null when generate token failed, it can be caused by DNS - // resolving failed. - if (updatedContainer != null) { - returnContainerList.add(updatedContainer); - i.remove(); + public List<Container> pullNewlyAllocatedContainers() { + try { + writeLock.lock(); + List<Container> returnContainerList = new ArrayList<Container>( + newlyAllocatedContainers.size()); + for (Iterator<RMContainer> i = newlyAllocatedContainers.iterator(); + i.hasNext(); ) { + RMContainer rmContainer = i.next(); + Container updatedContainer = updateContainerAndNMToken(rmContainer, true, false); + // Only add container to return list when it's not null. updatedContainer + // could be null if token generation failed, for example due to a DNS + // resolution failure.
+ if (updatedContainer != null) { + returnContainerList.add(updatedContainer); + i.remove(); + } } + return returnContainerList; + } finally { + writeLock.unlock(); } - return returnContainerList; } - private synchronized List<Container> pullNewlyUpdatedContainers( + private List<Container> pullNewlyUpdatedContainers( Map<ContainerId, RMContainer> updatedContainerMap, boolean increase) { - List<Container> returnContainerList = - new ArrayList<Container>(updatedContainerMap.size()); - for (Iterator<Entry<ContainerId, RMContainer>> i = - updatedContainerMap.entrySet().iterator(); i.hasNext();) { - RMContainer rmContainer = i.next().getValue(); - Container updatedContainer = - updateContainerAndNMToken(rmContainer, false, increase); - if (updatedContainer != null) { - returnContainerList.add(updatedContainer); - i.remove(); + try { + writeLock.lock(); + List<Container> returnContainerList = new ArrayList<Container>( + updatedContainerMap.size()); + for (Iterator<Entry<ContainerId, RMContainer>> i = + updatedContainerMap.entrySet().iterator(); i.hasNext(); ) { + RMContainer rmContainer = i.next().getValue(); + Container updatedContainer = updateContainerAndNMToken(rmContainer, + false, increase); + if (updatedContainer != null) { + returnContainerList.add(updatedContainer); + i.remove(); + } } + return returnContainerList; + } finally { + writeLock.unlock(); } - return returnContainerList; } - public synchronized List<Container> pullNewlyIncreasedContainers() { + public List<Container> pullNewlyIncreasedContainers() { return pullNewlyUpdatedContainers(newlyIncreasedContainers, true); } - public synchronized List<Container> pullNewlyDecreasedContainers() { + public List<Container> pullNewlyDecreasedContainers() { return pullNewlyUpdatedContainers(newlyDecreasedContainers, false); } - public synchronized List<NMToken> pullUpdatedNMTokens() { + public List<NMToken> pullUpdatedNMTokens() { List<NMToken> returnList = new ArrayList<NMToken>(updatedNMTokens); updatedNMTokens.clear(); return returnList; } public boolean isWaitingForAMContainer() { - // The working knowledge is that masterContainer for AM is null as it - // itself is the master container. - return (!unmanagedAM && appAttempt.getMasterContainer() == null); + try { + readLock.lock(); + + // The working knowledge is that masterContainer for AM is null as it + // itself is the master container. + return (!unmanagedAM && appAttempt.getMasterContainer() == null); + } finally { + readLock.unlock(); + } } - public synchronized void updateBlacklist(List<String> blacklistAdditions, + public void updateBlacklist(List<String> blacklistAdditions, List<String> blacklistRemovals) { - if (!isStopped) { - if (isWaitingForAMContainer()) { - // The request is for the AM-container, and the AM-container is launched - // by the system. So, update the places that are blacklisted by system - // (as opposed to those blacklisted by the application). - this.appSchedulingInfo.updatePlacesBlacklistedBySystem( - blacklistAdditions, blacklistRemovals); - } else { - this.appSchedulingInfo.updatePlacesBlacklistedByApp(blacklistAdditions, - blacklistRemovals); + try { + writeLock.lock(); + if (!isStopped) { + if (isWaitingForAMContainer()) { + // The request is for the AM-container, and the AM-container is launched + // by the system. So, update the places that are blacklisted by system + // (as opposed to those blacklisted by the application).
+ this.appSchedulingInfo.updatePlacesBlacklistedBySystem( + blacklistAdditions, blacklistRemovals); + } else { + this.appSchedulingInfo.updatePlacesBlacklistedByApp(blacklistAdditions, + blacklistRemovals); + } } + } finally { + writeLock.unlock(); } } @@ -646,31 +713,29 @@ public boolean isPlaceBlacklisted(String resourceName) { forAMContainer); } - public synchronized int addMissedNonPartitionedRequestSchedulingOpportunity( + public int addMissedNonPartitionedRequestSchedulingOpportunity( SchedulerRequestKey schedulerKey) { - missedNonPartitionedReqSchedulingOpportunity.add(schedulerKey); - return missedNonPartitionedReqSchedulingOpportunity.count(schedulerKey); + return missedNonPartitionedReqSchedulingOpportunity.add(schedulerKey, 1) + + 1; } - public synchronized void - resetMissedNonPartitionedRequestSchedulingOpportunity( + public void resetMissedNonPartitionedRequestSchedulingOpportunity( SchedulerRequestKey schedulerKey) { missedNonPartitionedReqSchedulingOpportunity.setCount(schedulerKey, 0); } - public synchronized void addSchedulingOpportunity( + public void addSchedulingOpportunity( SchedulerRequestKey schedulerKey) { - int count = schedulingOpportunities.count(schedulerKey); - if (count < Integer.MAX_VALUE) { - schedulingOpportunities.setCount(schedulerKey, count + 1); + try { + schedulingOpportunities.add(schedulerKey, 1); + } catch (IllegalArgumentException e) { + // the count reached Integer.MAX_VALUE; ignore the overflow } } - public synchronized void subtractSchedulingOpportunity( - SchedulerRequestKey schedulerKey) { - int count = schedulingOpportunities.count(schedulerKey) - 1; - this.schedulingOpportunities.setCount(schedulerKey, Math.max(count, 0)); + public void subtractSchedulingOpportunity(SchedulerRequestKey schedulerKey) { + schedulingOpportunities.remove(schedulerKey, 1); } /** @@ -680,7 +745,7 @@ public synchronized void subtractSchedulingOpportunity( * @param schedulerKey Scheduler Key * @return number of scheduling opportunities */ - public synchronized int getSchedulingOpportunities( + public int getSchedulingOpportunities( SchedulerRequestKey schedulerKey) { return schedulingOpportunities.count(schedulerKey); } @@ -692,13 +757,13 @@ public synchronized int getSchedulingOpportunities( * * @param schedulerKey The priority of the container scheduled. */ - public synchronized void resetSchedulingOpportunities( + public void resetSchedulingOpportunities( SchedulerRequestKey schedulerKey) { resetSchedulingOpportunities(schedulerKey, System.currentTimeMillis()); } // used for continuous scheduling - public synchronized void resetSchedulingOpportunities( + public void resetSchedulingOpportunities( SchedulerRequestKey schedulerKey, long currentTimeMs) { lastScheduledContainer.put(schedulerKey, currentTimeMs); schedulingOpportunities.setCount(schedulerKey, 0); @@ -709,145 +774,174 @@ void setSchedulingOpportunities(SchedulerRequestKey schedulerKey, int count) { schedulingOpportunities.setCount(schedulerKey, count); } - synchronized AggregateAppResourceUsage getRunningAggregateAppResourceUsage() { + private AggregateAppResourceUsage getRunningAggregateAppResourceUsage() { long currentTimeMillis = System.currentTimeMillis(); - // Don't walk the whole container list if the resources were computed - // recently.
- if ((currentTimeMillis - lastMemoryAggregateAllocationUpdateTime) - > MEM_AGGREGATE_ALLOCATION_CACHE_MSECS) { - long memorySeconds = 0; - long vcoreSeconds = 0; - for (RMContainer rmContainer : this.liveContainers.values()) { - long usedMillis = currentTimeMillis - rmContainer.getCreationTime(); - Resource resource = rmContainer.getContainer().getResource(); - memorySeconds += resource.getMemorySize() * usedMillis / - DateUtils.MILLIS_PER_SECOND; - vcoreSeconds += resource.getVirtualCores() * usedMillis - / DateUtils.MILLIS_PER_SECOND; - } + synchronized (resAggregationUsageUpdateLock) { + // Don't walk the whole container list if the resources were computed + // recently. + if ((currentTimeMillis - lastResAllocationUpdateTime) + > MEM_AGGREGATE_ALLOCATION_CACHE_MSECS) { + long memorySeconds = 0; + long vcoreSeconds = 0; + for (RMContainer rmContainer : this.liveContainers.values()) { + long usedMillis = currentTimeMillis - rmContainer.getCreationTime(); + Resource resource = rmContainer.getContainer().getResource(); + memorySeconds += resource.getMemorySize() * usedMillis + / DateUtils.MILLIS_PER_SECOND; + vcoreSeconds += resource.getVirtualCores() * usedMillis + / DateUtils.MILLIS_PER_SECOND; + } - lastMemoryAggregateAllocationUpdateTime = currentTimeMillis; - lastMemorySeconds = memorySeconds; - lastVcoreSeconds = vcoreSeconds; + lastResAllocationUpdateTime = currentTimeMillis; + lastMemorySeconds = memorySeconds; + lastVcoreSeconds = vcoreSeconds; + } + return new AggregateAppResourceUsage(lastMemorySeconds, lastVcoreSeconds); } - return new AggregateAppResourceUsage(lastMemorySeconds, lastVcoreSeconds); - } - - public synchronized ApplicationResourceUsageReport getResourceUsageReport() { - AggregateAppResourceUsage runningResourceUsage = - getRunningAggregateAppResourceUsage(); - Resource usedResourceClone = - Resources.clone(attemptResourceUsage.getAllUsed()); - Resource reservedResourceClone = - Resources.clone(attemptResourceUsage.getReserved()); - Resource cluster = rmContext.getScheduler().getClusterResource(); - ResourceCalculator calc = rmContext.getScheduler().getResourceCalculator(); - float queueUsagePerc = 0.0f; - float clusterUsagePerc = 0.0f; - if (!calc.isInvalidDivisor(cluster)) { - queueUsagePerc = - calc.divide(cluster, usedResourceClone, Resources.multiply(cluster, - queue.getQueueInfo(false, false).getCapacity())) * 100; - clusterUsagePerc = calc.divide(cluster, usedResourceClone, cluster) * 100; + } + + public ApplicationResourceUsageReport getResourceUsageReport() { + try { + readLock.lock(); + AggregateAppResourceUsage runningResourceUsage = + getRunningAggregateAppResourceUsage(); + Resource usedResourceClone = Resources.clone( + attemptResourceUsage.getAllUsed()); + Resource reservedResourceClone = Resources.clone( + attemptResourceUsage.getReserved()); + Resource cluster = rmContext.getScheduler().getClusterResource(); + ResourceCalculator calc = rmContext.getScheduler().getResourceCalculator(); + float queueUsagePerc = 0.0f; + float clusterUsagePerc = 0.0f; + if (!calc.isInvalidDivisor(cluster)) { + queueUsagePerc = calc.divide(cluster, usedResourceClone, Resources + .multiply(cluster, queue.getQueueInfo(false, false).getCapacity())) + * 100; + clusterUsagePerc = calc.divide(cluster, usedResourceClone, cluster) * 100; + } + return ApplicationResourceUsageReport.newInstance(liveContainers.size(), + reservedContainers.size(), usedResourceClone, reservedResourceClone, + Resources.add(usedResourceClone, reservedResourceClone), + 
runningResourceUsage.getMemorySeconds(), runningResourceUsage.getVcoreSeconds(), queueUsagePerc, + clusterUsagePerc); + } finally { + readLock.unlock(); } - return ApplicationResourceUsageReport.newInstance(liveContainers.size(), - reservedContainers.size(), usedResourceClone, reservedResourceClone, - Resources.add(usedResourceClone, reservedResourceClone), - runningResourceUsage.getMemorySeconds(), - runningResourceUsage.getVcoreSeconds(), queueUsagePerc, - clusterUsagePerc); } - public synchronized Map<ContainerId, RMContainer> getLiveContainersMap() { + public Map<ContainerId, RMContainer> getLiveContainersMap() { return this.liveContainers; } - public synchronized Resource getResourceLimit() { + public Resource getResourceLimit() { return this.resourceLimit; } - public synchronized Map<SchedulerRequestKey, Long> + public Map<SchedulerRequestKey, Long> getLastScheduledContainer() { return this.lastScheduledContainer; } - public synchronized void transferStateFromPreviousAttempt( + public void transferStateFromPreviousAttempt( SchedulerApplicationAttempt appAttempt) { - this.liveContainers = appAttempt.getLiveContainersMap(); - // this.reReservations = appAttempt.reReservations; - this.attemptResourceUsage.copyAllUsed(appAttempt.attemptResourceUsage); - this.resourceLimit = appAttempt.getResourceLimit(); - // this.currentReservation = appAttempt.currentReservation; - // this.newlyAllocatedContainers = appAttempt.newlyAllocatedContainers; - // this.schedulingOpportunities = appAttempt.schedulingOpportunities; - this.lastScheduledContainer = appAttempt.getLastScheduledContainer(); - this.appSchedulingInfo .transferStateFromPreviousAppSchedulingInfo(appAttempt.appSchedulingInfo); + try { + writeLock.lock(); + this.liveContainers = appAttempt.getLiveContainersMap(); + // this.reReservations = appAttempt.reReservations; + this.attemptResourceUsage.copyAllUsed(appAttempt.attemptResourceUsage); + this.resourceLimit = appAttempt.getResourceLimit(); + // this.currentReservation = appAttempt.currentReservation; + // this.newlyAllocatedContainers = appAttempt.newlyAllocatedContainers; + // this.schedulingOpportunities = appAttempt.schedulingOpportunities; + this.lastScheduledContainer = appAttempt.getLastScheduledContainer(); + this.appSchedulingInfo.transferStateFromPreviousAppSchedulingInfo( appAttempt.appSchedulingInfo); + } finally { + writeLock.unlock(); + } } - public synchronized void move(Queue newQueue) { - QueueMetrics oldMetrics = queue.getMetrics(); - QueueMetrics newMetrics = newQueue.getMetrics(); - String newQueueName = newQueue.getQueueName(); - String user = getUser(); - for (RMContainer liveContainer : liveContainers.values()) { - Resource resource = liveContainer.getContainer().getResource(); - ((RMContainerImpl)liveContainer).setQueueName(newQueueName); - oldMetrics.releaseResources(user, 1, resource); - newMetrics.allocateResources(user, 1, resource, false); - } - for (Map<NodeId, RMContainer> map : reservedContainers.values()) { - for (RMContainer reservedContainer : map.values()) { - ((RMContainerImpl)reservedContainer).setQueueName(newQueueName); - Resource resource = reservedContainer.getReservedResource(); - oldMetrics.unreserveResource(user, resource); - newMetrics.reserveResource(user, resource); + public void move(Queue newQueue) { + try { + writeLock.lock(); + QueueMetrics oldMetrics = queue.getMetrics(); + QueueMetrics newMetrics = newQueue.getMetrics(); + String newQueueName = newQueue.getQueueName(); + String user = getUser(); + for (RMContainer liveContainer : liveContainers.values()) { + Resource resource = liveContainer.getContainer().getResource(); + ((RMContainerImpl)
liveContainer).setQueueName(newQueueName); + oldMetrics.releaseResources(user, 1, resource); + newMetrics.allocateResources(user, 1, resource, false); + } + for (Map<NodeId, RMContainer> map : reservedContainers.values()) { + for (RMContainer reservedContainer : map.values()) { + ((RMContainerImpl) reservedContainer).setQueueName(newQueueName); + Resource resource = reservedContainer.getReservedResource(); + oldMetrics.unreserveResource(user, resource); + newMetrics.reserveResource(user, resource); + } } - } - appSchedulingInfo.move(newQueue); - this.queue = newQueue; + appSchedulingInfo.move(newQueue); + this.queue = newQueue; + } finally { + writeLock.unlock(); + } } - public synchronized void recoverContainer(SchedulerNode node, + public void recoverContainer(SchedulerNode node, RMContainer rmContainer) { - // recover app scheduling info - appSchedulingInfo.recoverContainer(rmContainer); + try { + writeLock.lock(); + // recover app scheduling info + appSchedulingInfo.recoverContainer(rmContainer); - if (rmContainer.getState().equals(RMContainerState.COMPLETED)) { - return; + if (rmContainer.getState().equals(RMContainerState.COMPLETED)) { + return; + } + LOG.info("SchedulerAttempt " + getApplicationAttemptId() + " is recovering container " + rmContainer.getContainerId()); + liveContainers.put(rmContainer.getContainerId(), rmContainer); + attemptResourceUsage.incUsed(node.getPartition(), + rmContainer.getContainer().getResource()); + + // resourceLimit: updated when LeafQueue#recoverContainer#allocateResource + // is called. + // newlyAllocatedContainers.add(rmContainer); + // schedulingOpportunities + // lastScheduledContainer + } finally { + writeLock.unlock(); } - LOG.info("SchedulerAttempt " + getApplicationAttemptId() - + " is recovering container " + rmContainer.getContainerId()); - liveContainers.put(rmContainer.getContainerId(), rmContainer); - attemptResourceUsage.incUsed(node.getPartition(), rmContainer .getContainer().getResource()); - - // resourceLimit: updated when LeafQueue#recoverContainer#allocateResource - // is called.
- // newlyAllocatedContainers.add(rmContainer); - // schedulingOpportunities - // lastScheduledContainer } public void incNumAllocatedContainers(NodeType containerType, NodeType requestType) { - RMAppAttempt attempt = - rmContext.getRMApps().get(attemptId.getApplicationId()) - .getCurrentAppAttempt(); - if (attempt != null) { - attempt.getRMAppAttemptMetrics().incNumAllocatedContainers(containerType, - requestType); + try { + writeLock.lock(); + RMAppAttempt attempt = rmContext.getRMApps().get( + attemptId.getApplicationId()).getCurrentAppAttempt(); + if (attempt != null) { + attempt.getRMAppAttemptMetrics().incNumAllocatedContainers( + containerType, requestType); + } + } finally { + writeLock.unlock(); } } public void setApplicationHeadroomForMetrics(Resource headroom) { - RMAppAttempt attempt = - rmContext.getRMApps().get(attemptId.getApplicationId()) - .getCurrentAppAttempt(); - if (attempt != null) { - attempt.getRMAppAttemptMetrics().setApplicationAttemptHeadRoom( - Resources.clone(headroom)); + try { + writeLock.lock(); + RMAppAttempt attempt = rmContext.getRMApps().get( + attemptId.getApplicationId()).getCurrentAppAttempt(); + if (attempt != null) { + attempt.getRMAppAttemptMetrics().setApplicationAttemptHeadRoom( + Resources.clone(headroom)); + } + } + finally { + writeLock.unlock(); } } @@ -911,49 +1005,64 @@ public ResourceUsage getSchedulingResourceUsage() { return attemptResourceUsage; } - public synchronized boolean removeIncreaseRequest(NodeId nodeId, + public boolean removeIncreaseRequest(NodeId nodeId, SchedulerRequestKey schedulerKey, ContainerId containerId) { - return appSchedulingInfo.removeIncreaseRequest(nodeId, schedulerKey, - containerId); + try { + writeLock.lock(); + return appSchedulingInfo.removeIncreaseRequest(nodeId, schedulerKey, + containerId); + } finally { + writeLock.unlock(); + } } - public synchronized boolean updateIncreaseRequests( + public boolean updateIncreaseRequests( List<SchedContainerChangeRequest> increaseRequests) { - return appSchedulingInfo.updateIncreaseRequests(increaseRequests); + + try { + writeLock.lock(); + return appSchedulingInfo.updateIncreaseRequests(increaseRequests); + } finally { + writeLock.unlock(); + } } - private synchronized void changeContainerResource( + private void changeContainerResource( SchedContainerChangeRequest changeRequest, boolean increase) { - if (increase) { - appSchedulingInfo.increaseContainer(changeRequest); - } else { - appSchedulingInfo.decreaseContainer(changeRequest); - } + try { + writeLock.lock(); + if (increase) { + appSchedulingInfo.increaseContainer(changeRequest); + } else { + appSchedulingInfo.decreaseContainer(changeRequest); + } - RMContainer changedRMContainer = changeRequest.getRMContainer(); - changedRMContainer.handle( - new RMContainerChangeResourceEvent(changeRequest.getContainerId(), - changeRequest.getTargetCapacity(), increase)); - - // remove pending and not pulled by AM newly-increased/decreased-containers - // and add the new one - if (increase) { - newlyDecreasedContainers.remove(changeRequest.getContainerId()); - newlyIncreasedContainers.put(changeRequest.getContainerId(), - changedRMContainer); - } else { - newlyIncreasedContainers.remove(changeRequest.getContainerId()); - newlyDecreasedContainers.put(changeRequest.getContainerId(), - changedRMContainer); + RMContainer changedRMContainer = changeRequest.getRMContainer(); + changedRMContainer.handle(new RMContainerChangeResourceEvent(changeRequest.getContainerId(), + changeRequest.getTargetCapacity(), increase)); + + // remove pending and not pulled by AM
newly-increased/decreased-containers + // and add the new one + if (increase) { + newlyDecreasedContainers.remove(changeRequest.getContainerId()); + newlyIncreasedContainers.put(changeRequest.getContainerId(), + changedRMContainer); + } else { + newlyIncreasedContainers.remove(changeRequest.getContainerId()); + newlyDecreasedContainers.put(changeRequest.getContainerId(), + changedRMContainer); + } + } finally { + writeLock.unlock(); } } - public synchronized void decreaseContainer( + public void decreaseContainer( SchedContainerChangeRequest decreaseRequest) { changeContainerResource(decreaseRequest, false); } - public synchronized void increaseContainer( + public void increaseContainer( SchedContainerChangeRequest increaseRequest) { changeContainerResource(increaseRequest, true); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java index 2efdbd0..69595ce 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java @@ -72,6 +72,9 @@ private volatile Set<String> labels = null; + // The latest time at which this node was visited by the scheduler + protected volatile long latestVisitedTimestamp = -1; + public SchedulerNode(RMNode node, boolean usePortForNodeName, Set<String> labels) { this.rmNode = node; @@ -432,4 +435,8 @@ public void setNodeUtilization(ResourceUtilization nodeUtilization) { public ResourceUtilization getNodeUtilization() { return this.nodeUtilization; } + + public long getLatestVisitedTimestamp() { + return this.latestVisitedTimestamp; + } } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesLogger.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesLogger.java index 8fa1bb5..aeafa8e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesLogger.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesLogger.java @@ -25,6 +25,7 @@ import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; @@ -48,6 +49,10 @@ public static void recordSkippedAppActivityWithoutAllocation( ActivitiesManager activitiesManager, SchedulerNode node, SchedulerApplicationAttempt application, Priority priority, String
diagnostic) { + // FIXME, global scheduling + if (node == null) { + return; + } recordAppActivityWithoutAllocation(activitiesManager, node, application, priority, diagnostic, ActivityState.SKIPPED); } @@ -60,6 +65,11 @@ public static void recordRejectedAppActivityFromLeafQueue( ActivitiesManager activitiesManager, SchedulerNode node, SchedulerApplicationAttempt application, Priority priority, String diagnostic) { + // FIXME, global scheduling + if (node == null) { + return; + } + String type = "app"; recordActivity(activitiesManager, node, application.getQueueName(), application.getApplicationId().toString(), priority, @@ -80,6 +90,10 @@ public static void recordAppActivityWithoutAllocation( if (activitiesManager == null) { return; } + // FIXME, global scheduling + if (node == null) { + return; + } if (activitiesManager.shouldRecordThisNode(node.getNodeID())) { String type = "container"; // Add application-container activity into specific node allocation. @@ -112,19 +126,23 @@ public static void recordAppActivityWithoutAllocation( */ public static void recordAppActivityWithAllocation( ActivitiesManager activitiesManager, SchedulerNode node, - SchedulerApplicationAttempt application, Container updatedContainer, + SchedulerApplicationAttempt application, RMContainer updatedContainer, ActivityState activityState) { if (activitiesManager == null) { return; } + // FIXME: global-scheduling + if (node == null) { + return; + } if (activitiesManager.shouldRecordThisNode(node.getNodeID())) { String type = "container"; // Add application-container activity into specific node allocation. activitiesManager.addSchedulingActivityForNode(node.getNodeID(), application.getApplicationId().toString(), - updatedContainer.getId().toString(), - updatedContainer.getPriority().toString(), activityState, - ActivityDiagnosticConstant.EMPTY, type); + updatedContainer.getContainerId().toString(), + updatedContainer.getContainer().getPriority().toString(), + activityState, ActivityDiagnosticConstant.EMPTY, type); type = "app"; // Add queue-application activity into specific node allocation. activitiesManager.addSchedulingActivityForNode(node.getNodeID(), @@ -138,9 +156,10 @@ public static void recordAppActivityWithAllocation( application.getApplicationId())) { String type = "container"; activitiesManager.addSchedulingActivityForApp( - application.getApplicationId(), updatedContainer.getId().toString(), - updatedContainer.getPriority().toString(), activityState, - ActivityDiagnosticConstant.EMPTY, type); + application.getApplicationId(), + updatedContainer.getContainerId().toString(), + updatedContainer.getContainer().getPriority().toString(), + activityState, ActivityDiagnosticConstant.EMPTY, type); } } @@ -149,12 +168,17 @@ public static void recordAppActivityWithAllocation( * update.
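The same null-node guard now appears, with slightly different FIXME wording, in every ActivitiesLogger entry point touched above. A minimal consolidation sketch in plain Java; the class and method names here are hypothetical, not part of the patch:

    // Hypothetical helper consolidating the repeated "FIXME, global
    // scheduling" null checks; ActivitiesGuard and shouldSkip are
    // illustrative names only.
    final class ActivitiesGuard {
      private ActivitiesGuard() {
      }

      // Global scheduling may invoke the logger without a concrete node, so
      // both the manager and the node must be checked before recording.
      static boolean shouldSkip(Object activitiesManager, Object node) {
        return activitiesManager == null || node == null;
      }
    }

Each method could then open with if (ActivitiesGuard.shouldSkip(activitiesManager, node)) { return; }, keeping the global-scheduling caveat documented in one place instead of five.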
*/ public static void startAppAllocationRecording( - ActivitiesManager activitiesManager, NodeId nodeId, long currentTime, + ActivitiesManager activitiesManager, SchedulerNode node, long currentTime, SchedulerApplicationAttempt application) { + // FIXME: Global scheduling caused issue + if (null == node) { + return; + } + if (activitiesManager == null) { return; } - activitiesManager.startAppAllocationRecording(nodeId, currentTime, + activitiesManager.startAppAllocationRecording(node.getNodeID(), currentTime, application); } @@ -230,6 +254,10 @@ public static void finishAllocatedNodeAllocation( if (activitiesManager == null) { return; } + // FIXME, global scheduling + if (node == null) { + return; + } if (activitiesManager.shouldRecordThisNode(node.getNodeID())) { activitiesManager.updateAllocationFinalState(node.getNodeID(), containerId, containerState); @@ -266,6 +294,10 @@ private static void recordActivity(ActivitiesManager activitiesManager, if (activitiesManager == null) { return; } + // FIXME: global-scheduling + if (node == null) { + return; + } if (activitiesManager.shouldRecordThisNode(node.getNodeID())) { activitiesManager.addSchedulingActivityForNode(node.getNodeID(), parentName, childName, priority != null ? priority.toString() : null, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesManager.java index 4fa5feb..00178c2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/activities/ActivitiesManager.java @@ -277,6 +277,10 @@ boolean shouldRecordThisApp(ApplicationId applicationId) { } boolean shouldRecordThisNode(NodeId nodeID) { + // FIXME, global scheduling caused issue + if (nodeID == null) { + return false; + } return activeRecordedNodes.contains(nodeID) && recordingNodesAllocation .containsKey(nodeID); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java index 1d8f929..5145082 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java @@ -24,7 +24,10 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import 
org.apache.commons.logging.LogFactory; @@ -53,6 +56,12 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ContainerAllocationContext; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ResourceCommitRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; @@ -60,19 +69,19 @@ public abstract class AbstractCSQueue implements CSQueue { private static final Log LOG = LogFactory.getLog(AbstractCSQueue.class); - CSQueue parent; + volatile CSQueue parent; final String queueName; volatile int numContainers; final Resource minimumAllocation; volatile Resource maximumAllocation; - QueueState state; + volatile QueueState state; final CSQueueMetrics metrics; protected final PrivilegedEntity queueEntity; final ResourceCalculator resourceCalculator; Set accessibleLabels; - RMNodeLabelsManager labelManager; + volatile RMNodeLabelsManager labelManager; String defaultLabelExpression; Map acls = @@ -94,6 +103,9 @@ protected ActivitiesManager activitiesManager; + protected ReentrantReadWriteLock.ReadLock readLock; + protected ReentrantReadWriteLock.WriteLock writeLock; + public AbstractCSQueue(CapacitySchedulerContext cs, String queueName, CSQueue parent, CSQueue old) throws IOException { this.labelManager = cs.getRMContext().getNodeLabelManager(); @@ -116,7 +128,11 @@ public AbstractCSQueue(CapacitySchedulerContext cs, queueEntity = new PrivilegedEntity(EntityType.QUEUE, getQueuePath()); // initialize QueueCapacities - queueCapacities = new QueueCapacities(parent == null); + queueCapacities = new QueueCapacities(parent == null); + + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + readLock = lock.readLock(); + writeLock = lock.writeLock(); } protected void setupConfigurableCapacities() { @@ -187,13 +203,13 @@ public PrivilegedEntity getPrivilegedEntity() { } @Override - public synchronized CSQueue getParent() { + public CSQueue getParent() { return parent; } @Override - public synchronized void setParent(CSQueue newParentQueue) { - this.parent = (ParentQueue)newParentQueue; + public void setParent(CSQueue newParentQueue) { + this.parent = newParentQueue; } public Set getAccessibleNodeLabels() { @@ -221,18 +237,22 @@ public void setAbsoluteUsedCapacity(float absUsedCapacity) { * Set maximum capacity - used only for testing. 
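Throughout AbstractCSQueue the patch replaces method-level synchronized with a ReentrantReadWriteLock pair created in the constructor (the readLock/writeLock fields added above). A self-contained sketch of the idiom, with illustrative class and field names that are not from the patch:

    import java.util.concurrent.locks.ReentrantReadWriteLock;

    class QueueStateHolder {
      private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
      private final ReentrantReadWriteLock.ReadLock readLock = lock.readLock();
      private final ReentrantReadWriteLock.WriteLock writeLock = lock.writeLock();
      private float maximumCapacity;

      // Mutators take the exclusive write lock, as setMaxCapacity() does below.
      void setMaximumCapacity(float value) {
        writeLock.lock();   // acquired before the try, so the finally block
        try {               // only runs once the lock is actually held
          this.maximumCapacity = value;
        } finally {
          writeLock.unlock();
        }
      }

      // Readers share the read lock, as getQueueInfo() does below.
      float getMaximumCapacity() {
        readLock.lock();
        try {
          return maximumCapacity;
        } finally {
          readLock.unlock();
        }
      }
    }

Note that the patch itself calls lock() as the first statement inside the try block; that still pairs lock and unlock on every normal path, but the ordering above is the more defensive convention, since a lock() call that throws would otherwise reach a finally that calls unlock() on a lock the thread never acquired.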
* @param maximumCapacity new max capacity */ - synchronized void setMaxCapacity(float maximumCapacity) { - // Sanity check - CSQueueUtils.checkMaxCapacity(getQueueName(), - queueCapacities.getCapacity(), maximumCapacity); - float absMaxCapacity = - CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent); - CSQueueUtils.checkAbsoluteCapacity(getQueueName(), - queueCapacities.getAbsoluteCapacity(), - absMaxCapacity); - - queueCapacities.setMaximumCapacity(maximumCapacity); - queueCapacities.setAbsoluteMaximumCapacity(absMaxCapacity); + void setMaxCapacity(float maximumCapacity) { + try { + writeLock.lock(); + // Sanity check + CSQueueUtils.checkMaxCapacity(getQueueName(), + queueCapacities.getCapacity(), maximumCapacity); + float absMaxCapacity = CSQueueUtils.computeAbsoluteMaximumCapacity( + maximumCapacity, parent); + CSQueueUtils.checkAbsoluteCapacity(getQueueName(), + queueCapacities.getAbsoluteCapacity(), absMaxCapacity); + + queueCapacities.setMaximumCapacity(maximumCapacity); + queueCapacities.setAbsoluteMaximumCapacity(absMaxCapacity); + } finally { + writeLock.unlock(); + } } @Override @@ -240,105 +260,124 @@ public String getDefaultNodeLabelExpression() { return defaultLabelExpression; } - synchronized void setupQueueConfigs(Resource clusterResource) + void setupQueueConfigs(Resource clusterResource) throws IOException { - // get labels - this.accessibleLabels = - csContext.getConfiguration().getAccessibleNodeLabels(getQueuePath()); - this.defaultLabelExpression = csContext.getConfiguration() - .getDefaultNodeLabelExpression(getQueuePath()); - - // inherit from parent if labels not set - if (this.accessibleLabels == null && parent != null) { - this.accessibleLabels = parent.getAccessibleNodeLabels(); - } - - // inherit from parent if labels not set - if (this.defaultLabelExpression == null && parent != null - && this.accessibleLabels.containsAll(parent.getAccessibleNodeLabels())) { - this.defaultLabelExpression = parent.getDefaultNodeLabelExpression(); - } + try { + writeLock.lock(); + + // get labels + this.accessibleLabels = + csContext.getConfiguration().getAccessibleNodeLabels(getQueuePath()); + this.defaultLabelExpression = + csContext.getConfiguration().getDefaultNodeLabelExpression( + getQueuePath()); + + // inherit from parent if labels not set + if (this.accessibleLabels == null && parent != null) { + this.accessibleLabels = parent.getAccessibleNodeLabels(); + } - // After we setup labels, we can setup capacities - setupConfigurableCapacities(); - - this.maximumAllocation = - csContext.getConfiguration().getMaximumAllocationPerQueue( - getQueuePath()); - - authorizer = YarnAuthorizationProvider.getInstance(csContext.getConf()); - - this.state = csContext.getConfiguration().getState(getQueuePath()); - this.acls = csContext.getConfiguration().getAcls(getQueuePath()); + // inherit from parent if labels not set + if (this.defaultLabelExpression == null && parent != null + && this.accessibleLabels.containsAll( + parent.getAccessibleNodeLabels())) { + this.defaultLabelExpression = parent.getDefaultNodeLabelExpression(); + } - // Update metrics - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, null); - - // Check if labels of this queue is a subset of parent queue, only do this - // when we not root - if (parent != null && parent.getParent() != null) { - if (parent.getAccessibleNodeLabels() != null - && !parent.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { - // if parent isn't "*", 
child shouldn't be "*" too - if (this.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { - throw new IOException("Parent's accessible queue is not ANY(*), " - + "but child's accessible queue is *"); - } else { - Set diff = - Sets.difference(this.getAccessibleNodeLabels(), - parent.getAccessibleNodeLabels()); - if (!diff.isEmpty()) { - throw new IOException("Some labels of child queue is not a subset " - + "of parent queue, these labels=[" - + StringUtils.join(diff, ",") + "]"); + // After we setup labels, we can setup capacities + setupConfigurableCapacities(); + + this.maximumAllocation = + csContext.getConfiguration().getMaximumAllocationPerQueue( + getQueuePath()); + + authorizer = YarnAuthorizationProvider.getInstance(csContext.getConf()); + + this.state = csContext.getConfiguration().getState(getQueuePath()); + this.acls = csContext.getConfiguration().getAcls(getQueuePath()); + + // Update metrics + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, null); + + // Check if the labels of this queue are a subset of the parent queue's; + // only do this when we are not root + if (parent != null && parent.getParent() != null) { + if (parent.getAccessibleNodeLabels() != null && !parent + .getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { + // if parent isn't "*", child shouldn't be "*" too + if (this.getAccessibleNodeLabels().contains( + RMNodeLabelsManager.ANY)) { + throw new IOException("Parent's accessible queue is not ANY(*), " + + "but child's accessible queue is *"); + } else { + Set diff = Sets.difference(this.getAccessibleNodeLabels(), + parent.getAccessibleNodeLabels()); + if (!diff.isEmpty()) { + throw new IOException( + "Some labels of child queue are not a subset " + + "of parent queue, these labels=[" + StringUtils + .join(diff, ",") + "]"); + } } } } - } - this.reservationsContinueLooking = csContext.getConfiguration() - .getReservationContinueLook(); + this.reservationsContinueLooking = + csContext.getConfiguration().getReservationContinueLook(); - this.preemptionDisabled = isQueueHierarchyPreemptionDisabled(this); + this.preemptionDisabled = isQueueHierarchyPreemptionDisabled(this); + } finally { + writeLock.unlock(); + } } protected QueueInfo getQueueInfo() { - QueueInfo queueInfo = recordFactory.newRecordInstance(QueueInfo.class); - queueInfo.setQueueName(queueName); - queueInfo.setAccessibleNodeLabels(accessibleLabels); - queueInfo.setCapacity(queueCapacities.getCapacity()); - queueInfo.setMaximumCapacity(queueCapacities.getMaximumCapacity()); - queueInfo.setQueueState(state); - queueInfo.setDefaultNodeLabelExpression(defaultLabelExpression); - queueInfo.setCurrentCapacity(getUsedCapacity()); - queueInfo.setQueueStatistics(getQueueStatistics()); - queueInfo.setPreemptionDisabled(preemptionDisabled); - return queueInfo; + try { + readLock.lock(); + QueueInfo queueInfo = recordFactory.newRecordInstance(QueueInfo.class); + queueInfo.setQueueName(queueName); + queueInfo.setAccessibleNodeLabels(accessibleLabels); + queueInfo.setCapacity(queueCapacities.getCapacity()); + queueInfo.setMaximumCapacity(queueCapacities.getMaximumCapacity()); + queueInfo.setQueueState(state); + queueInfo.setDefaultNodeLabelExpression(defaultLabelExpression); + queueInfo.setCurrentCapacity(getUsedCapacity()); + queueInfo.setQueueStatistics(getQueueStatistics()); + queueInfo.setPreemptionDisabled(preemptionDisabled); + return queueInfo; + } finally { + readLock.unlock(); + } } public QueueStatistics getQueueStatistics() { -
QueueStatistics stats = - recordFactory.newRecordInstance(QueueStatistics.class); - stats.setNumAppsSubmitted(getMetrics().getAppsSubmitted()); - stats.setNumAppsRunning(getMetrics().getAppsRunning()); - stats.setNumAppsPending(getMetrics().getAppsPending()); - stats.setNumAppsCompleted(getMetrics().getAppsCompleted()); - stats.setNumAppsKilled(getMetrics().getAppsKilled()); - stats.setNumAppsFailed(getMetrics().getAppsFailed()); - stats.setNumActiveUsers(getMetrics().getActiveUsers()); - stats.setAvailableMemoryMB(getMetrics().getAvailableMB()); - stats.setAllocatedMemoryMB(getMetrics().getAllocatedMB()); - stats.setPendingMemoryMB(getMetrics().getPendingMB()); - stats.setReservedMemoryMB(getMetrics().getReservedMB()); - stats.setAvailableVCores(getMetrics().getAvailableVirtualCores()); - stats.setAllocatedVCores(getMetrics().getAllocatedVirtualCores()); - stats.setPendingVCores(getMetrics().getPendingVirtualCores()); - stats.setReservedVCores(getMetrics().getReservedVirtualCores()); - stats.setPendingContainers(getMetrics().getPendingContainers()); - stats.setAllocatedContainers(getMetrics().getAllocatedContainers()); - stats.setReservedContainers(getMetrics().getReservedContainers()); - return stats; + try { + readLock.lock(); + QueueStatistics stats = recordFactory.newRecordInstance( + QueueStatistics.class); + stats.setNumAppsSubmitted(getMetrics().getAppsSubmitted()); + stats.setNumAppsRunning(getMetrics().getAppsRunning()); + stats.setNumAppsPending(getMetrics().getAppsPending()); + stats.setNumAppsCompleted(getMetrics().getAppsCompleted()); + stats.setNumAppsKilled(getMetrics().getAppsKilled()); + stats.setNumAppsFailed(getMetrics().getAppsFailed()); + stats.setNumActiveUsers(getMetrics().getActiveUsers()); + stats.setAvailableMemoryMB(getMetrics().getAvailableMB()); + stats.setAllocatedMemoryMB(getMetrics().getAllocatedMB()); + stats.setPendingMemoryMB(getMetrics().getPendingMB()); + stats.setReservedMemoryMB(getMetrics().getReservedMB()); + stats.setAvailableVCores(getMetrics().getAvailableVirtualCores()); + stats.setAllocatedVCores(getMetrics().getAllocatedVirtualCores()); + stats.setPendingVCores(getMetrics().getPendingVirtualCores()); + stats.setReservedVCores(getMetrics().getReservedVirtualCores()); + stats.setPendingContainers(getMetrics().getPendingContainers()); + stats.setAllocatedContainers(getMetrics().getAllocatedContainers()); + stats.setReservedContainers(getMetrics().getReservedContainers()); + return stats; + } finally { + readLock.unlock(); + } } @Private @@ -351,26 +390,37 @@ public Resource getMinimumAllocation() { return minimumAllocation; } - synchronized void allocateResource(Resource clusterResource, + void allocateResource(Resource clusterResource, Resource resource, String nodePartition, boolean changeContainerResource) { - queueUsage.incUsed(nodePartition, resource); + try { + writeLock.lock(); + queueUsage.incUsed(nodePartition, resource); - if (!changeContainerResource) { - ++numContainers; + if (!changeContainerResource) { + ++numContainers; + } + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, nodePartition); + } finally { + writeLock.unlock(); } - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, nodePartition); } - protected synchronized void releaseResource(Resource clusterResource, + protected void releaseResource(Resource clusterResource, Resource resource, String nodePartition, boolean changeContainerResource) { - 
queueUsage.decUsed(nodePartition, resource); + try { + writeLock.lock(); - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, nodePartition); + queueUsage.decUsed(nodePartition, resource); - if (!changeContainerResource) { - --numContainers; + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, nodePartition); + + if (!changeContainerResource) { + --numContainers; + } + } finally { + writeLock.unlock(); } } @@ -464,86 +514,88 @@ Resource getQueueMaxResource(String nodePartition, Resource clusterResource) { minimumAllocation); } - synchronized boolean canAssignToThisQueue(Resource clusterResource, + boolean canAssignToThisQueue(Resource clusterResource, String nodePartition, ResourceLimits currentResourceLimits, Resource resourceCouldBeUnreserved, SchedulingMode schedulingMode) { - // Get current limited resource: - // - When doing RESPECT_PARTITION_EXCLUSIVITY allocation, we will respect - // queues' max capacity. - // - When doing IGNORE_PARTITION_EXCLUSIVITY allocation, we will not respect - // queue's max capacity, queue's max capacity on the partition will be - // considered to be 100%. Which is a queue can use all resource in the - // partition. - // Doing this because: for non-exclusive allocation, we make sure there's - // idle resource on the partition, to avoid wastage, such resource will be - // leveraged as much as we can, and preemption policy will reclaim it back - // when partitoned-resource-request comes back. - Resource currentLimitResource = - getCurrentLimitResource(nodePartition, clusterResource, - currentResourceLimits, schedulingMode); - - Resource nowTotalUsed = queueUsage.getUsed(nodePartition); - - // Set headroom for currentResourceLimits: - // When queue is a parent queue: Headroom = limit - used + killable - // When queue is a leaf queue: Headroom = limit - used (leaf queue cannot preempt itself) - Resource usedExceptKillable = nowTotalUsed; - if (null != getChildQueues() && !getChildQueues().isEmpty()) { - usedExceptKillable = Resources.subtract(nowTotalUsed, - getTotalKillableResource(nodePartition)); - } - currentResourceLimits.setHeadroom( - Resources.subtract(currentLimitResource, usedExceptKillable)); - - if (Resources.greaterThanOrEqual(resourceCalculator, clusterResource, - usedExceptKillable, currentLimitResource)) { - - // if reservation continous looking enabled, check to see if could we - // potentially use this node instead of a reserved node if the application - // has reserved containers. 
- // TODO, now only consider reservation cases when the node has no label - if (this.reservationsContinueLooking - && nodePartition.equals(RMNodeLabelsManager.NO_LABEL) - && Resources.greaterThan(resourceCalculator, clusterResource, - resourceCouldBeUnreserved, Resources.none())) { - // resource-without-reserved = used - reserved - Resource newTotalWithoutReservedResource = - Resources.subtract(usedExceptKillable, resourceCouldBeUnreserved); - - // when total-used-without-reserved-resource < currentLimit, we still - // have chance to allocate on this node by unreserving some containers - if (Resources.lessThan(resourceCalculator, clusterResource, - newTotalWithoutReservedResource, currentLimitResource)) { - if (LOG.isDebugEnabled()) { - LOG.debug("try to use reserved: " + getQueueName() - + " usedResources: " + queueUsage.getUsed() - + ", clusterResources: " + clusterResource - + ", reservedResources: " + resourceCouldBeUnreserved - + ", capacity-without-reserved: " - + newTotalWithoutReservedResource + ", maxLimitCapacity: " - + currentLimitResource); + try { + + readLock.lock(); + // Get current limited resource: + // - When doing RESPECT_PARTITION_EXCLUSIVITY allocation, we will respect + // queues' max capacity. + // - When doing IGNORE_PARTITION_EXCLUSIVITY allocation, we will not respect + // queue's max capacity, queue's max capacity on the partition will be + // considered to be 100%, i.e. a queue can use all resources in the + // partition. + // Doing this because: for non-exclusive allocation, we make sure there's + // idle resource on the partition, to avoid wastage, such resource will be + // leveraged as much as we can, and preemption policy will reclaim it back + // when partitioned-resource-request comes back. + Resource currentLimitResource = getCurrentLimitResource(nodePartition, + clusterResource, currentResourceLimits, schedulingMode); + + Resource nowTotalUsed = queueUsage.getUsed(nodePartition); + + // Set headroom for currentResourceLimits: + // When queue is a parent queue: Headroom = limit - used + killable + // When queue is a leaf queue: Headroom = limit - used (leaf queue cannot preempt itself) + Resource usedExceptKillable = nowTotalUsed; + if (null != getChildQueues() && !getChildQueues().isEmpty()) { + usedExceptKillable = Resources.subtract(nowTotalUsed, + getTotalKillableResource(nodePartition)); + } + currentResourceLimits.setHeadroom( + Resources.subtract(currentLimitResource, usedExceptKillable)); + + if (Resources.greaterThanOrEqual(resourceCalculator, clusterResource, + usedExceptKillable, currentLimitResource)) { + + // if reservation continue-looking is enabled, check to see if we could + // potentially use this node instead of a reserved node if the application + // has reserved containers.
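The branch that begins here asks whether unreserving already-reserved containers would bring queue usage back under its current limit. A standalone sketch of that arithmetic, with a simplified two-dimensional stand-in for YARN's Resource type (illustrative only):

    // Simplified model of the continue-looking test: would usage minus the
    // resource we could unreserve fit under the current limit?
    final class ContinueLookingCheck {

      static final class Res {
        final long memory;
        final int vcores;

        Res(long memory, int vcores) {
          this.memory = memory;
          this.vcores = vcores;
        }
      }

      // Mirrors: newTotalWithoutReservedResource = usedExceptKillable -
      // resourceCouldBeUnreserved, followed by the lessThan comparison
      // against currentLimitResource in the hunk above.
      static boolean couldAllocateAfterUnreserving(Res used, Res reservable,
          Res limit) {
        long memWithoutReserved = used.memory - reservable.memory;
        int coresWithoutReserved = used.vcores - reservable.vcores;
        return memWithoutReserved < limit.memory
            && coresWithoutReserved < limit.vcores;
      }
    }

YARN's Resources.lessThan delegates the comparison to a ResourceCalculator, which may be single- or multi-dimensional; comparing both dimensions directly is a simplification here.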
+ // TODO, now only consider reservation cases when the node has no label + if (this.reservationsContinueLooking && nodePartition.equals( + RMNodeLabelsManager.NO_LABEL) && Resources.greaterThan( + resourceCalculator, clusterResource, resourceCouldBeUnreserved, + Resources.none())) { + // resource-without-reserved = used - reserved + Resource newTotalWithoutReservedResource = Resources.subtract( + usedExceptKillable, resourceCouldBeUnreserved); + + // when total-used-without-reserved-resource < currentLimit, we still + // have a chance to allocate on this node by unreserving some containers + if (Resources.lessThan(resourceCalculator, clusterResource, + newTotalWithoutReservedResource, currentLimitResource)) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "try to use reserved: " + getQueueName() + " usedResources: " + + queueUsage.getUsed() + ", clusterResources: " + + clusterResource + ", reservedResources: " + + resourceCouldBeUnreserved + + ", capacity-without-reserved: " + + newTotalWithoutReservedResource + ", maxLimitCapacity: " + + currentLimitResource); + } + return true; + } } + if (LOG.isDebugEnabled()) { + LOG.debug(getQueueName() + " Check assign to queue, nodePartition=" + + nodePartition + " usedResources: " + queueUsage + .getUsed(nodePartition) + " clusterResources: " + clusterResource + + " currentUsedCapacity " + Resources + .divide(resourceCalculator, clusterResource, + queueUsage.getUsed(nodePartition), labelManager + .getResourceByLabel(nodePartition, clusterResource)) + + " max-capacity: " + queueCapacities + .getAbsoluteMaximumCapacity(nodePartition) + ")"); + } + return false; } - if (LOG.isDebugEnabled()) { - LOG.debug(getQueueName() - + "Check assign to queue, nodePartition=" - + nodePartition - + " usedResources: " - + queueUsage.getUsed(nodePartition) - + " clusterResources: " - + clusterResource - + " currentUsedCapacity " - + Resources.divide(resourceCalculator, clusterResource, - queueUsage.getUsed(nodePartition), - labelManager.getResourceByLabel(nodePartition, clusterResource)) - + " max-capacity: " - + queueCapacities.getAbsoluteMaximumCapacity(nodePartition) + ")"); - } - return false; + return true; + } finally { + readLock.unlock(); } - return true; } @Override @@ -637,22 +689,26 @@ boolean hasPendingResourceRequest(String nodePartition, } public boolean accessibleToPartition(String nodePartition) { - // if queue's label is *, it can access any node - if (accessibleLabels != null - && accessibleLabels.contains(RMNodeLabelsManager.ANY)) { - return true; - } - // any queue can access to a node without label - if (nodePartition == null - || nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { - return true; - } - // a queue can access to a node only if it contains any label of the node - if (accessibleLabels != null && accessibleLabels.contains(nodePartition)) { - return true; + try { + readLock.lock(); + // if queue's label is *, it can access any node + if (accessibleLabels != null && accessibleLabels.contains( + RMNodeLabelsManager.ANY)) { + return true; + } + // any queue can access a node without a label + if (nodePartition == null || nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { + return true; + } + // a queue can access a node only if it contains any label of the node + if (accessibleLabels != null && accessibleLabels.contains(nodePartition)) { + return true; + } + // sorry, you cannot access + return false; + } finally { + readLock.unlock(); } - // sorry, you cannot access - return false; } @Override @@ -663,24 +719,30 @@
public Priority getDefaultApplicationPriority() { @Override public Set getNodeLabelsForQueue() { - // if queue's label is *, queue can access any labels. Instead of - // considering all labels in cluster, only those labels which are - // use some resource of this queue can be considered. - Set nodeLabels = new HashSet(); - if (this.getAccessibleNodeLabels() != null && this.getAccessibleNodeLabels() - .contains(RMNodeLabelsManager.ANY)) { - nodeLabels.addAll(Sets.union(this.getQueueCapacities().getNodePartitionsSet(), - this.getQueueResourceUsage().getNodePartitionsSet())); - } else { - nodeLabels.addAll(this.getAccessibleNodeLabels()); - } + try { + readLock.lock(); + // if queue's label is *, queue can access any label. Instead of + // considering all labels in the cluster, only those labels which + // use some resource of this queue need to be considered. + Set nodeLabels = new HashSet(); + if (this.getAccessibleNodeLabels() != null && this + .getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { + nodeLabels.addAll( + Sets.union(this.getQueueCapacities().getNodePartitionsSet(), + this.getQueueResourceUsage().getNodePartitionsSet())); + } else { + nodeLabels.addAll(this.getAccessibleNodeLabels()); + } - // Add NO_LABEL also to this list as NO_LABEL also can be granted with - // resource in many general cases. - if (!nodeLabels.contains(RMNodeLabelsManager.NO_LABEL)) { - nodeLabels.add(RMNodeLabelsManager.NO_LABEL); + // Add NO_LABEL also to this list as NO_LABEL also can be granted with + // resource in many general cases. + if (!nodeLabels.contains(RMNodeLabelsManager.NO_LABEL)) { + nodeLabels.add(RMNodeLabelsManager.NO_LABEL); + } + return nodeLabels; + } finally { + readLock.unlock(); } - return nodeLabels; } public Resource getTotalKillableResource(String partition) { @@ -692,4 +754,65 @@ public Resource getTotalKillableResource(String partition) { return csContext.getPreemptionManager().getKillableContainers(queueName, partition); } + + // Only for testing + @VisibleForTesting + public CSAssignment assignContainers(Resource clusterResource, + FiCaSchedulerNode node, ResourceLimits currentResourceLimits, + SchedulingMode schedulingMode) { + try { + writeLock.lock(); + return assignContainers(clusterResource, + new PlacementSet(node, ImmutableMap.of(node.getNodeID(), node), + node.getPartition()), currentResourceLimits, schedulingMode); + } finally { + writeLock.unlock(); + } + } + + public boolean acceptCSAssignment(Resource cluster, + ResourceCommitRequest request) { + // If we allocated something + if (request.anythingAllocatedOrReserved()) { + ContainerAllocationContext + allocation = request.getFirstAllocatedOrReservedContainer(); + SchedulerContainer + schedulerContainer = allocation.getAllocatedOrReservedContainer(); + + // Do not check when allocating new container from a reserved container + if (allocation.getAllocateFromReservedContainer() == null) { + Resource required = allocation.getAllocatedOrReservedResource(); + Resource netAllocated = Resources.subtract(required, + request.getTotalReleasedResource()); + + try { + readLock.lock(); + + String partition = schedulerContainer.getNodePartition(); + Resource maxResourceLimit; + if (allocation.getSchedulingMode() + == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY) { + maxResourceLimit = getQueueMaxResource(partition, cluster); + } else { + maxResourceLimit = labelManager.getResourceByLabel( + schedulerContainer.getNodePartition(), cluster); + } + if (!Resources.fitsIn(resourceCalculator, cluster, +
Resources.add(queueUsage.getUsed(partition), netAllocated), + maxResourceLimit)) { + return false; + } + } + finally { + readLock.unlock(); + } + } + } + + if (parent != null) { + return ((AbstractCSQueue) parent).acceptCSAssignment(cluster, request); + } + + return true; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSAssignment.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSAssignment.java index 7bea9af..de11b34 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSAssignment.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSAssignment.java @@ -22,8 +22,10 @@ import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerRequestKey; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.AssignmentInformation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.Resources; import java.util.List; @@ -53,6 +55,12 @@ } private boolean fulfilledReservation; + + // TODO, set it. 
Set when fulfilledReservation = true + private RMContainer fulfilledReservedContainer; + + private SchedulingMode schedulingMode; + + private final AssignmentInformation assignmentInformation; private boolean increaseAllocation; private List containersToKill; @@ -173,4 +181,21 @@ public void setContainersToKill(List containersToKill) { public List getContainersToKill() { return containersToKill; } + + public RMContainer getFulfilledReservedContainer() { + return fulfilledReservedContainer; + } + + public void setFulfilledReservedContainer( + RMContainer fulfilledReservedContainer) { + this.fulfilledReservedContainer = fulfilledReservedContainer; + } + + public SchedulingMode getSchedulingMode() { + return schedulingMode; + } + + public void setSchedulingMode(SchedulingMode schedulingMode) { + this.schedulingMode = schedulingMode; + } } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java index daf7790..a03fccc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Set; +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceStability.Stable; import org.apache.hadoop.security.AccessControlException; @@ -42,6 +43,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedContainerChangeRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; @@ -195,13 +197,18 @@ public void finishApplicationAttempt(FiCaSchedulerApp application, /** * Assign containers to applications in the queue or it's children (if any). * @param clusterResource the resource of the cluster. - * @param node node on which resources are available - * @param resourceLimits how much overall resource of this queue can use. - * @param schedulingMode Type of exclusive check when assign container on a + * @param placementSet the set of candidate nodes on which resources are available + * @param resourceLimits how much overall resource of this queue can use. + * @param schedulingMode Type of exclusive check when assign container on a * NodeManager, see {@link SchedulingMode}.
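The placementSet parameter replaces the single node argument; the call sites in this patch build one from a single node via new PlacementSet<>(node, ImmutableMap.of(node.getNodeID(), node), node.getPartition()). A sketch of the shape such a wrapper plausibly has, inferred from those call sites only (the real PlacementSet class is not shown in this diff):

    import java.util.Collections;
    import java.util.Map;

    // Hypothetical simplified PlacementSet: a preferred node, the candidate
    // nodes keyed by node id, and the partition the candidates belong to.
    final class SingleNodePlacement<N> {
      private final N preferredNode;
      private final Map<String, N> candidates;
      private final String partition;

      SingleNodePlacement(N node, String nodeId, String partition) {
        this.preferredNode = node;
        this.candidates = Collections.singletonMap(nodeId, node);
        this.partition = partition;
      }

      N getPreferredNode() {
        return preferredNode;
      }

      Map<String, N> getCandidates() {
        return candidates;
      }

      String getPartition() {
        return partition;
      }
    }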
* @return the assignment */ public CSAssignment assignContainers(Resource clusterResource, + PlacementSet placementSet, ResourceLimits resourceLimits, + SchedulingMode schedulingMode); + + @VisibleForTesting + public CSAssignment assignContainers(Resource clusterResource, FiCaSchedulerNode node, ResourceLimits resourceLimits, SchedulingMode schedulingMode); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index 3b8599e..daa140d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -18,23 +18,9 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.EnumSet; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Random; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicBoolean; - +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -115,8 +101,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.AllocationState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.KillableContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ContainerAllocationContext; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ResourceCommitRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ResourceCommitterHandler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.AssignmentInformation; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; @@ -138,8 +129,22 @@ import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; +import java.io.IOException; 
+import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicBoolean; @LimitedPrivate("yarn") @Evolving @@ -150,7 +155,7 @@ private static final Log LOG = LogFactory.getLog(CapacityScheduler.class); private YarnAuthorizationProvider authorizer; - + private CSQueue root; // timeout to join when we stop this service protected final long THREAD_JOIN_TIMEOUT_MS = 1000; @@ -172,15 +177,18 @@ public int compare(CSQueue q1, CSQueue q2) { return q1.getQueuePath().compareTo(q2.getQueuePath()); } }; - + static final PartitionedQueueComparator partitionedQueueComparator = new PartitionedQueueComparator(); + private ResourceCommitterHandler + resourceCommitRequestHandler; + @Override public void setConf(Configuration conf) { yarnConf = conf; } - + private void validateConf(Configuration conf) { // validate scheduler memory allocation setting int minMem = conf.getInt( @@ -234,7 +242,11 @@ public Configuration getConf() { private boolean scheduleAsynchronously; private AsyncScheduleThread asyncSchedulerThread; - private RMNodeLabelsManager labelManager; + + private boolean scheduleGlobally; + private GlobalSchedulingThread globalSchedulingThread; + + volatile private RMNodeLabelsManager labelManager; private SchedulerHealth schedulerHealth = new SchedulerHealth(); volatile long lastNodeUpdateTime; @@ -259,14 +271,14 @@ public QueueMetrics getRootQueueMetrics() { public CSQueue getRootQueue() { return root; } - + @Override public CapacitySchedulerConfiguration getConfiguration() { return conf; } @Override - public synchronized RMContainerTokenSecretManager + public synchronized RMContainerTokenSecretManager getContainerTokenSecretManager() { return this.rmContext.getContainerTokenSecretManager(); } @@ -280,7 +292,7 @@ public ResourceCalculator getResourceCalculator() { public Comparator getNonPartitionedQueueComparator() { return nonPartitionedQueueComparator; } - + @Override public PartitionedQueueComparator getPartitionedQueueComparator() { return partitionedQueueComparator; @@ -317,6 +329,7 @@ private synchronized void initScheduler(Configuration configuration) throws initializeQueues(this.conf); this.isLazyPreemptionEnabled = conf.getLazyPreemptionEnabled(); + // Initialize async scheduling parameters scheduleAsynchronously = this.conf.getScheduleAynschronously(); asyncScheduleInterval = this.conf.getLong(ASYNC_SCHEDULER_INTERVAL, @@ -325,12 +338,21 @@ private synchronized void initScheduler(Configuration configuration) throws asyncSchedulerThread = new AsyncScheduleThread(this); } + // Initialize global scheduling parameters + scheduleGlobally = this.conf.getBoolean( + CapacitySchedulerConfiguration.SCHEDULE_GLOBALLY_ENABLE, false); + if (scheduleGlobally) { + globalSchedulingThread = new GlobalSchedulingThread(this); + } + + LOG.info("Initialized CapacityScheduler with " + "calculator=" + getResourceCalculator().getClass() + ", " + "minimumAllocation=<" + getMinimumResourceCapability() + ">, " + "maximumAllocation=<" + getMaximumResourceCapability() + ">, " + "asynchronousScheduling=" + scheduleAsynchronously + ", " + - "asyncScheduleInterval=" + asyncScheduleInterval + "ms"); + "asyncScheduleInterval=" + 
asyncScheduleInterval + "ms, " + + "globalScheduling=" + scheduleGlobally); } private synchronized void startSchedulerThreads() { @@ -339,6 +361,12 @@ private synchronized void startSchedulerThreads() { "asyncSchedulerThread is null"); asyncSchedulerThread.start(); } + + if (scheduleGlobally) { + Preconditions.checkNotNull(globalSchedulingThread, + "globalSchedulerThread is null"); + globalSchedulingThread.start(); + } } @Override @@ -346,12 +374,15 @@ public void serviceInit(Configuration conf) throws Exception { Configuration configuration = new Configuration(conf); super.serviceInit(conf); initScheduler(configuration); + resourceCommitRequestHandler = new ResourceCommitterHandler<>(this); + resourceCommitRequestHandler.init(conf); } @Override public void serviceStart() throws Exception { startSchedulerThreads(); activitiesManager.start(); + resourceCommitRequestHandler.start(); super.serviceStart(); } @@ -363,6 +394,7 @@ public void serviceStop() throws Exception { asyncSchedulerThread.join(THREAD_JOIN_TIMEOUT_MS); } } + resourceCommitRequestHandler.stop(); super.serviceStop(); } @@ -386,40 +418,17 @@ public void serviceStop() throws Exception { // update lazy preemption this.isLazyPreemptionEnabled = this.conf.getLazyPreemptionEnabled(); } - + long getAsyncScheduleInterval() { return asyncScheduleInterval; } private final static Random random = new Random(System.currentTimeMillis()); - - /** - * Schedule on all nodes by starting at a random point. - * @param cs - */ - static void schedule(CapacityScheduler cs) { - // First randomize the start point - int current = 0; - Collection nodes = cs.nodeTracker.getAllNodes(); - int start = random.nextInt(nodes.size()); - for (FiCaSchedulerNode node : nodes) { - if (current++ >= start) { - cs.allocateContainersToNode(node); - } - } - // Now, just get everyone to be safe - for (FiCaSchedulerNode node : nodes) { - cs.allocateContainersToNode(node); - } - try { - Thread.sleep(cs.getAsyncScheduleInterval()); - } catch (InterruptedException e) {} - } - + static class AsyncScheduleThread extends Thread { - private final CapacityScheduler cs; - private AtomicBoolean runSchedules = new AtomicBoolean(false); + final CapacityScheduler cs; + AtomicBoolean runSchedules = new AtomicBoolean(false); public AsyncScheduleThread(CapacityScheduler cs) { this.cs = cs; @@ -434,11 +443,34 @@ public void run() { Thread.sleep(100); } catch (InterruptedException ie) {} } else { - schedule(cs); + asyncSchedule(cs); } } } + /** + * Schedule on all nodes by starting at a random point. 
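The javadoc here describes the loop that schedule() becomes after its move into the thread class as asyncSchedule(): pick a random offset, schedule from there to the end of the node list, then make one full pass over all nodes. A standalone sketch of that iteration order, where the List and Consumer stand in for the scheduler's node collection and allocation call:

    import java.util.List;
    import java.util.Random;
    import java.util.function.Consumer;

    final class RandomStartSweep {
      private static final Random RANDOM = new Random();

      // Nodes at positions >= start are visited twice per round and the rest
      // once, so every node is visited at least once and the start rotates.
      static <N> void sweep(List<N> nodes, Consumer<N> allocate) {
        if (nodes.isEmpty()) {
          return; // Random.nextInt(0) would throw IllegalArgumentException
        }
        int start = RANDOM.nextInt(nodes.size());
        for (int i = start; i < nodes.size(); i++) {
          allocate.accept(nodes.get(i));
        }
        for (N node : nodes) {
          allocate.accept(node);
        }
      }
    }

Unlike this sketch, the moved code calls random.nextInt(nodes.size()) unguarded, so an empty node collection would throw; that behavior pre-exists the move and may be worth a follow-up.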
+ * @param cs + */ + static void asyncSchedule(CapacityScheduler cs) { + // First randomize the start point + int current = 0; + Collection nodes = cs.nodeTracker.getAllNodes(); + int start = random.nextInt(nodes.size()); + for (FiCaSchedulerNode node : nodes) { + if (current++ >= start) { + cs.allocateContainersToNode(node); + } + } + // Now, just get everyone to be safe + for (FiCaSchedulerNode node : nodes) { + cs.allocateContainersToNode(node); + } + try { + Thread.sleep(cs.getAsyncScheduleInterval()); + } catch (InterruptedException e) {} + } + public void beginSchedule() { runSchedules.set(true); } @@ -448,9 +480,9 @@ public void suspendSchedule() { } } - + @Private - public static final String ROOT_QUEUE = + public static final String ROOT_QUEUE = CapacitySchedulerConfiguration.PREFIX + CapacitySchedulerConfiguration.ROOT; static class QueueHook { @@ -495,16 +527,20 @@ public CSQueue hook(CSQueue queue) { return null; } + public boolean globalSchedulingEnabled() { + return scheduleGlobally; + } + private void updatePlacementRules() throws IOException { List placementRules = new ArrayList<>(); - + // Initialize UserGroupMappingPlacementRule // TODO, need make this defineable by configuration. UserGroupMappingPlacementRule ugRule = getUserGroupMappingPlacementRule(); if (null != ugRule) { placementRules.add(ugRule); } - + rmContext.getQueuePlacementManager().updateRules(placementRules); } @@ -512,8 +548,8 @@ private void updatePlacementRules() throws IOException { private void initializeQueues(CapacitySchedulerConfiguration conf) throws IOException { - root = - parseQueue(this, conf, null, CapacitySchedulerConfiguration.ROOT, + root = + parseQueue(this, conf, null, CapacitySchedulerConfiguration.ROOT, queues, queues, noop); labelManager.reinitializeQueueLabels(getQueueToLabels()); LOG.info("Initialized root queue " + root); @@ -525,20 +561,21 @@ private void initializeQueues(CapacitySchedulerConfiguration conf) } @Lock(CapacityScheduler.class) - private void reinitializeQueues(CapacitySchedulerConfiguration conf) + private void reinitializeQueues(CapacitySchedulerConfiguration conf) throws IOException { // Parse new queues Map newQueues = new HashMap(); + CSQueue newRoot = parseQueue(this, conf, null, CapacitySchedulerConfiguration.ROOT, newQueues, queues, noop); - + // Ensure all existing queues are still present validateExistingQueues(queues, newQueues); // Add new queues addNewQueues(queues, newQueues); - + // Re-configure queues root.reinitialize(newRoot, getClusterResource()); updatePlacementRules(); @@ -582,14 +619,14 @@ public static void setQueueAcls(YarnAuthorizationProvider authorizer, */ @Lock(CapacityScheduler.class) private void validateExistingQueues( - Map queues, Map newQueues) + Map queues, Map newQueues) throws IOException { // check that all static queues are included in the newQueues list for (Map.Entry e : queues.entrySet()) { if (!(e.getValue() instanceof ReservationQueue)) { String queueName = e.getKey(); CSQueue oldQueue = e.getValue(); - CSQueue newQueue = newQueues.get(queueName); + CSQueue newQueue = newQueues.get(queueName); if (null == newQueue) { throw new IOException(queueName + " cannot be found during refresh!"); } else if (!oldQueue.getQueuePath().equals(newQueue.getQueuePath())) { @@ -609,7 +646,7 @@ private void validateExistingQueues( */ @Lock(CapacityScheduler.class) private void addNewQueues( - Map queues, Map newQueues) + Map queues, Map newQueues) { for (Map.Entry e : newQueues.entrySet()) { String queueName = e.getKey(); @@ -619,19 +656,19 @@ 
private void addNewQueues( } } } - + @Lock(CapacityScheduler.class) static CSQueue parseQueue( CapacitySchedulerContext csContext, - CapacitySchedulerConfiguration conf, + CapacitySchedulerConfiguration conf, CSQueue parent, String queueName, Map queues, - Map oldQueues, + Map oldQueues, QueueHook hook) throws IOException { CSQueue queue; String fullQueueName = (parent == null) ? queueName : (parent.getQueuePath() + "." + queueName); - String[] childQueueNames = + String[] childQueueNames = conf.getQueues(fullQueueName); boolean isReservableQueue = conf.isReservable(fullQueueName); if (childQueueNames == null || childQueueNames.length == 0) { @@ -666,8 +703,8 @@ static CSQueue parseQueue( List childQueues = new ArrayList(); for (String childQueueName : childQueueNames) { - CSQueue childQueue = - parseQueue(csContext, conf, queue, childQueueName, + CSQueue childQueue = + parseQueue(csContext, conf, queue, childQueueName, queues, oldQueues, hook); childQueues.add(childQueue); } @@ -766,7 +803,7 @@ private synchronized void addApplication(ApplicationId applicationId, return; } if (!(queue instanceof LeafQueue)) { - String message = "Application " + applicationId + + String message = "Application " + applicationId + " submitted by user " + user + " to non-leaf queue: " + queueName; this.rmContext.getDispatcher().getEventHandler() .handle(new RMAppEvent(applicationId, @@ -870,7 +907,7 @@ private synchronized void doneApplicationAttempt( RMAppAttemptState rmAppAttemptFinalState, boolean keepContainers) { LOG.info("Application Attempt " + applicationAttemptId + " is done." + " finalState=" + rmAppAttemptFinalState); - + FiCaSchedulerApp attempt = getApplicationAttempt(applicationAttemptId); SchedulerApplication application = applications.get(applicationAttemptId.getApplicationId()); @@ -906,7 +943,7 @@ private synchronized void doneApplicationAttempt( } // Clean up pending requests, metrics etc. - attempt.stop(rmAppAttemptFinalState); + attempt.stop(); // Inform the queue String queueName = attempt.getQueue().getQueueName(); @@ -1022,8 +1059,8 @@ public Allocation allocate(ApplicationAttemptId applicationAttemptId, @Override @Lock(Lock.NoLock.class) - public QueueInfo getQueueInfo(String queueName, - boolean includeChildQueues, boolean recursive) + public QueueInfo getQueueInfo(String queueName, + boolean includeChildQueues, boolean recursive) throws IOException { CSQueue queue = null; queue = this.queues.get(queueName); @@ -1056,7 +1093,7 @@ private synchronized void nodeUpdate(RMNode nm) { Resource releaseResources = Resource.newInstance(0, 0); FiCaSchedulerNode node = getNode(nm.getNodeID()); - + List containerInfoList = nm.pullContainerUpdates(); List newlyLaunchedContainers = new ArrayList(); List completedContainers = new ArrayList(); @@ -1064,12 +1101,12 @@ private synchronized void nodeUpdate(RMNode nm) { newlyLaunchedContainers.addAll(containerInfo.getNewlyLaunchedContainers()); completedContainers.addAll(containerInfo.getCompletedContainers()); } - + // Processing the newly launched containers for (ContainerStatus launchedContainer : newlyLaunchedContainers) { containerLaunchedOnNode(launchedContainer.getContainerId(), node); } - + // Processing the newly increased containers List newlyIncreasedContainers = nm.pullNewlyIncreasedContainers(); @@ -1126,18 +1163,18 @@ private synchronized void nodeUpdate(RMNode nm) { " availableResource: " + node.getUnallocatedResource()); } } - + /** * Process resource update on a node. 
*/ - private synchronized void updateNodeAndQueueResource(RMNode nm, + private synchronized void updateNodeAndQueueResource(RMNode nm, ResourceOption resourceOption) { updateNodeResource(nm, resourceOption); Resource clusterResource = getClusterResource(); root.updateClusterResource(clusterResource, new ResourceLimits( clusterResource)); } - + /** * Process node labels update on a node. */ @@ -1147,7 +1184,7 @@ private synchronized void updateLabelsOnNode(NodeId nodeId, if (null == node) { return; } - + // Get new partition, we have only one partition per node String newPartition; if (newLabels.isEmpty()) { @@ -1174,13 +1211,13 @@ private synchronized void updateLabelsOnNode(NodeId nodeId, continue; } } - + // Unreserve container on this node RMContainer reservedContainer = node.getReservedContainer(); if (null != reservedContainer) { killReservedContainer(reservedContainer); } - + // Update node labels after we've done this node.updateLabels(newLabels); } @@ -1194,56 +1231,34 @@ private void updateSchedulerHealth(long now, FiCaSchedulerNode node, List reservations = assignment.getAssignmentInformation().getReservationDetails(); if (!allocations.isEmpty()) { - ContainerId allocatedContainerId = - allocations.get(allocations.size() - 1).containerId; + ContainerId allocatedContainerId = allocations.get(allocations.size() - 1).containerId; String allocatedQueue = allocations.get(allocations.size() - 1).queue; schedulerHealth.updateAllocation(now, nodeId, allocatedContainerId, - allocatedQueue); + allocatedQueue); } if (!reservations.isEmpty()) { - ContainerId reservedContainerId = - reservations.get(reservations.size() - 1).containerId; + ContainerId reservedContainerId = reservations.get(reservations.size() - 1).containerId; String reservedQueue = reservations.get(reservations.size() - 1).queue; schedulerHealth.updateReservation(now, nodeId, reservedContainerId, - reservedQueue); - } - schedulerHealth.updateSchedulerReservationCounts(assignment - .getAssignmentInformation().getNumReservations()); - schedulerHealth.updateSchedulerAllocationCounts(assignment - .getAssignmentInformation().getNumAllocations()); - schedulerHealth.updateSchedulerRunDetails(now, assignment - .getAssignmentInformation().getAllocated(), assignment - .getAssignmentInformation().getReserved()); - } - - @VisibleForTesting - public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { - if (rmContext.isWorkPreservingRecoveryEnabled() - && !rmContext.isSchedulerReadyForAllocatingContainers()) { - return; - } - - if (!nodeTracker.exists(node.getNodeID())) { - LOG.info("Skipping scheduling as the node " + node.getNodeID() + - " has been removed"); - return; + reservedQueue); } + schedulerHealth.updateSchedulerReservationCounts( + assignment.getAssignmentInformation().getNumReservations()); + schedulerHealth.updateSchedulerAllocationCounts( + assignment.getAssignmentInformation().getNumAllocations()); + schedulerHealth.updateSchedulerRunDetails(now, + assignment.getAssignmentInformation().getAllocated(), + assignment.getAssignmentInformation().getReserved()); + } - // reset allocation and reservation stats before we start doing any work - updateSchedulerHealth(lastNodeUpdateTime, node, - new CSAssignment(Resources.none(), NodeType.NODE_LOCAL)); - - CSAssignment assignment; - - // Assign new containers... - // 1. Check for reserved applications - // 2. 
Schedule if there are no reservations - + protected synchronized CSAssignment allocateOnReservedNode( + FiCaSchedulerNode node) { RMContainer reservedContainer = node.getReservedContainer(); if (reservedContainer != null) { + CSAssignment assignment; FiCaSchedulerApp reservedApplication = - getCurrentAttemptForContainer(reservedContainer.getContainerId()); + getApplicationAttempt(reservedContainer.getApplicationAttemptId()); // Try to fulfill the reservation LOG.info("Trying to fulfill reservation for application " @@ -1254,12 +1269,16 @@ public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { assignment = queue.assignContainers( getClusterResource(), - node, + new PlacementSet<>(node, null, node.getPartition()), // TODO, now we only consider limits for parent for non-labeled // resources, should consider labeled resources as well. new ResourceLimits(labelManager.getResourceByLabel( RMNodeLabelsManager.NO_LABEL, getClusterResource())), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + + submitResourceCommitRequest(getClusterResource(), assignment); + + // If the reservation was fulfilled, record the allocation in scheduler health if (assignment.isFulfilledReservation()) { CSAssignment tmp = new CSAssignment(reservedContainer.getReservedResource(), @@ -1267,7 +1286,7 @@ public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { Resources.addTo(assignment.getAssignmentInformation().getAllocated(), reservedContainer.getReservedResource()); tmp.getAssignmentInformation().addAllocationDetails( - reservedContainer.getContainerId(), queue.getQueuePath()); + reservedContainer, queue.getQueuePath()); tmp.getAssignmentInformation().incrAllocations(); updateSchedulerHealth(lastNodeUpdateTime, node, tmp); schedulerHealth.updateSchedulerFulfilledReservationCounts(1); @@ -1285,10 +1304,41 @@ public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager, node, reservedContainer.getContainerId(), AllocationState.SKIPPED); } +
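allocateOnReservedNode now produces a proposal with assignContainers and immediately routes it through submitResourceCommitRequest; the allocation only takes effect once the commit request is accepted and applied. A rough sketch of that propose/commit split, with a hypothetical Proposal type standing in for the real CSAssignment plumbing:

// Illustrative sketch only: scheduling produces a proposal, and a
// separate commit step re-validates it against current cluster state
// before applying it. Proposal is a hypothetical stand-in type.
interface Proposal {
  boolean accept();  // like FiCaSchedulerApp#acceptResourceCommitRequest
  void apply();      // like FiCaSchedulerApp#applyResourceCommitRequest
}

static void proposeAndCommit(java.util.function.Supplier<Proposal> schedulingPass) {
  Proposal proposal = schedulingPass.get(); // like queue.assignContainers(...)
  if (proposal != null && proposal.accept()) {
    proposal.apply();                       // bookkeeping happens only here
  }
}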
Schedule if there are no reservations + if (node.getReservedContainer() != null) { + allocateOnReservedNode(node); } // Try to schedule more if there are no reservations to fulfill if (node.getReservedContainer() == null) { + CSAssignment assignment; + if (calculator.computeAvailableContainers(Resources .add(node.getUnallocatedResource(), node.getTotalKillableResources()), minimumAllocation) > 0) { @@ -1300,22 +1350,28 @@ public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { assignment = root.assignContainers( getClusterResource(), - node, + new PlacementSet<>(node, ImmutableMap.of(node.getNodeID(), node), + node.getPartition()), new ResourceLimits(labelManager.getResourceByLabel( node.getPartition(), getClusterResource())), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + assignment.setSchedulingMode( + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + + submitResourceCommitRequest(getClusterResource(), assignment); + if (Resources.greaterThan(calculator, getClusterResource(), assignment.getResource(), Resources.none())) { updateSchedulerHealth(lastNodeUpdateTime, node, assignment); return; } - + // Only do non-exclusive allocation when node has node-labels. if (StringUtils.equals(node.getPartition(), RMNodeLabelsManager.NO_LABEL)) { return; } - + // Only do non-exclusive allocation when the node-label supports that try { if (rmContext.getNodeLabelManager().isExclusiveNodeLabel( @@ -1327,24 +1383,31 @@ public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { + node.getPartition(), e); return; } - + // Try to use NON_EXCLUSIVE assignment = root.assignContainers( getClusterResource(), - node, + new PlacementSet<>(node, ImmutableMap.of(node.getNodeID(), node), + node.getPartition()), // TODO, now we only consider limits for parent for non-labeled // resources, should consider labeled resources as well. 
new ResourceLimits(labelManager.getResourceByLabel( RMNodeLabelsManager.NO_LABEL, getClusterResource())), SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY); + + assignment.setSchedulingMode( + SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY); + + submitResourceCommitRequest(getClusterResource(), assignment); + updateSchedulerHealth(lastNodeUpdateTime, node, assignment); } } else { LOG.info("Skipping scheduling since node " + node.getNodeID() + " is reserved by application " - + node.getReservedContainer().getContainerId() - .getApplicationAttemptId()); + + node.getReservedContainer().getApplicationAttemptId()); } } @@ -1367,7 +1430,7 @@ public void handle(SchedulerEvent event) { break; case NODE_RESOURCE_UPDATE: { - NodeResourceUpdateSchedulerEvent nodeResourceUpdatedEvent = + NodeResourceUpdateSchedulerEvent nodeResourceUpdatedEvent = (NodeResourceUpdateSchedulerEvent)event; updateNodeAndQueueResource(nodeResourceUpdatedEvent.getRMNode(), nodeResourceUpdatedEvent.getResourceOption()); @@ -1377,7 +1440,7 @@ public void handle(SchedulerEvent event) { { NodeLabelsUpdateSchedulerEvent labelUpdateEvent = (NodeLabelsUpdateSchedulerEvent) event; - + for (Entry<NodeId, Set<String>> entry : labelUpdateEvent .getUpdatedNodeToLabels().entrySet()) { NodeId id = entry.getKey(); @@ -1392,7 +1455,8 @@ public void handle(SchedulerEvent event) { RMNode node = nodeUpdatedEvent.getRMNode(); setLastNodeUpdateTime(Time.now()); nodeUpdate(node); - if (!scheduleAsynchronously) { + + if (!scheduleAsynchronously && !scheduleGlobally) { ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager, node.getNodeID()); allocateContainersToNode(getNode(node.getNodeID())); @@ -1445,7 +1509,7 @@ public void handle(SchedulerEvent event) { break; case CONTAINER_EXPIRED: { - ContainerExpiredSchedulerEvent containerExpiredEvent = + ContainerExpiredSchedulerEvent containerExpiredEvent = (ContainerExpiredSchedulerEvent) event; ContainerId containerId = containerExpiredEvent.getContainerId(); if (containerExpiredEvent.isIncrease()) { @@ -1512,7 +1576,7 @@ private synchronized void addNode(RMNode nodeManager) { root.updateClusterResource(clusterResource, new ResourceLimits( clusterResource)); - LOG.info("Added node " + nodeManager.getNodeAddress() + + LOG.info("Added node " + nodeManager.getNodeAddress() + " clusterResource: " + clusterResource); if (scheduleAsynchronously && getNumClusterNodes() == 1) { @@ -1538,18 +1602,18 @@ private synchronized void removeNode(RMNode nodeInfo) { for (RMContainer container : runningContainers) { super.completedContainer(container, SchedulerUtils.createAbnormalContainerStatus( - container.getContainerId(), - SchedulerUtils.LOST_CONTAINER), + container.getContainerId(), + SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); } - + // Remove reservations, if any RMContainer reservedContainer = node.getReservedContainer(); if (reservedContainer != null) { super.completedContainer(reservedContainer, SchedulerUtils.createAbnormalContainerStatus( - reservedContainer.getContainerId(), - SchedulerUtils.LOST_CONTAINER), + reservedContainer.getContainerId(), + SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); } @@ -1563,7 +1627,7 @@ private synchronized void removeNode(RMNode nodeInfo) { asyncSchedulerThread.suspendSchedule(); } - LOG.info("Removed node " + nodeInfo.getNodeAddress() + + LOG.info("Removed node " + nodeInfo.getNodeAddress() + " clusterResource: " + getClusterResource()); } @@ -1599,7 +1663,7 @@ protected void completedContainerInternal(
RMContainerEventType event) { Container container = rmContainer.getContainer(); ContainerId containerId = container.getId(); - + // Get the application for the finished container FiCaSchedulerApp application = getCurrentAttemptForContainer(container.getId()); @@ -1610,16 +1674,16 @@ protected void completedContainerInternal( + appId + " completed with event " + event); return; } - + // Get the node on which the container was allocated FiCaSchedulerNode node = getNode(container.getNodeId()); - + // Inform the queue LeafQueue queue = (LeafQueue)application.getQueue(); queue.completedContainer(getClusterResource(), application, node, rmContainer, containerStatus, event, null, true); } - + @Override protected void decreaseContainer(SchedContainerChangeRequest decreaseRequest, SchedulerApplicationAttempt attempt) { @@ -1663,7 +1727,7 @@ public FiCaSchedulerNode getNode(NodeId nodeId) { public List<FiCaSchedulerNode> getAllNodes() { return nodeTracker.getAllNodes(); } - + @Override @Lock(Lock.NoLock.class) public void recover(RMState state) throws Exception { @@ -2000,7 +2064,7 @@ private LeafQueue getAndCheckLeafQueue(String queue) throws YarnException { } return EnumSet.of(SchedulerResourceTypes.MEMORY, SchedulerResourceTypes.CPU); } - + @Override public Resource getMaximumResourceCapability(String queueName) { CSQueue queue = getQueue(queueName); @@ -2160,4 +2224,152 @@ public PreemptionManager getPreemptionManager() { public ResourceUsage getClusterResourceUsage() { return root.getQueueResourceUsage(); } + + RMNodeLabelsManager getNodeLabelsManager() { + return labelManager; + } + + private SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> getSchedulerContainer( + RMContainer rmContainer) { + return getSchedulerContainer(rmContainer, false); + } + + private SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> getSchedulerContainer( + RMContainer rmContainer, boolean allocated) { + if (null == rmContainer) { return null; } + + FiCaSchedulerApp app = getApplicationAttempt( + rmContainer.getApplicationAttemptId()); + if (null == app) { return null; } + + NodeId nodeId; + if (rmContainer.getState() != RMContainerState.NEW) { + allocated = rmContainer.getState() != RMContainerState.RESERVED; + nodeId = allocated ? + rmContainer.getAllocatedNode() : + rmContainer.getReservedNode(); + } else { + nodeId = rmContainer.getNodeId(); + } + + FiCaSchedulerNode node = getNode(nodeId); + if (null == node) { return null; } + return new SchedulerContainer<>(app, node, rmContainer, + // TODO, node partition should come from CSAssignment + node.getPartition(), allocated); + } + + private List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> + getSchedulerContainersToRelease(CSAssignment csAssignment) { + List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> list = null; + + if (csAssignment.getContainersToKill() != null && !csAssignment + .getContainersToKill().isEmpty()) { + list = new ArrayList<>(); + for (RMContainer rmContainer : csAssignment.getContainersToKill()) { + list.add(getSchedulerContainer(rmContainer)); + } + } + + if (csAssignment.getExcessReservation() != null) { + if (null == list) { + list = new ArrayList<>(); + } + list.add(getSchedulerContainer(csAssignment.getExcessReservation())); + } + + return list == null ?
Collections.emptyList() : list; + } + + public void submitResourceCommitRequest( + Resource clusterResource, CSAssignment csAssignment) { + ContainerAllocationContext<FiCaSchedulerApp, FiCaSchedulerNode> allocated = + null; + ContainerAllocationContext<FiCaSchedulerApp, FiCaSchedulerNode> reserved = + null; + List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> released = + null; + + if (Resources.greaterThan(calculator, clusterResource, + csAssignment.getResource(), Resources.none())) { + // Allocated something + List<AssignmentInformation.AssignmentDetails> allocations = + csAssignment.getAssignmentInformation().getAllocationDetails(); + if (!allocations.isEmpty()) { + RMContainer rmContainer = allocations.get(0).rmContainer; + allocated = new ContainerAllocationContext<>( + getSchedulerContainer(rmContainer, true), + getSchedulerContainersToRelease(csAssignment), + getSchedulerContainer(csAssignment.getFulfilledReservedContainer()), + csAssignment.isIncreasedAllocation(), csAssignment.getType(), + csAssignment.getSchedulingMode(), csAssignment.getResource()); + } + + // Reserved something + List<AssignmentInformation.AssignmentDetails> reservation = + csAssignment.getAssignmentInformation().getReservationDetails(); + if (!reservation.isEmpty()) { + RMContainer rmContainer = reservation.get(0).rmContainer; + reserved = new ContainerAllocationContext<>( + getSchedulerContainer(rmContainer, false), + getSchedulerContainersToRelease(csAssignment), + getSchedulerContainer(csAssignment.getFulfilledReservedContainer()), + csAssignment.isIncreasedAllocation(), csAssignment.getType(), + csAssignment.getSchedulingMode(), csAssignment.getResource()); + } + } + + if (null == allocated && null == reserved) { + released = getSchedulerContainersToRelease(csAssignment); + } + + if (null != allocated || null != reserved || null != released) { + List<ContainerAllocationContext<FiCaSchedulerApp, FiCaSchedulerNode>> allocationsList = null; + if (allocated != null) { + allocationsList = new ArrayList<>(); + allocationsList.add(allocated); + } + + List<ContainerAllocationContext<FiCaSchedulerApp, FiCaSchedulerNode>> reservationsList = null; + if (reserved != null) { + reservationsList = new ArrayList<>(); + reservationsList.add(reserved); + } + + resourceCommitRequestHandler.handle( + new ResourceCommitRequest<>( + allocationsList, reservationsList, released)); + } + } + + public synchronized void processResourceCommitRequest( + ResourceCommitRequest r) { + ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request = + (ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>) r; + + ApplicationAttemptId attemptId = null; + + // find the application to accept and apply the ResourceCommitRequest + if (request.anythingAllocatedOrReserved()) { + ContainerAllocationContext<FiCaSchedulerApp, FiCaSchedulerNode> c = + request.getFirstAllocatedOrReservedContainer(); + attemptId = + c.getAllocatedOrReservedContainer().getSchedulerApplicationAttempt() + .getApplicationAttemptId(); + } else { + if (!request.getContainersToRelease().isEmpty()) { + attemptId = request.getContainersToRelease().get(0) + .getSchedulerApplicationAttempt().getApplicationAttemptId(); + } + } + + if (attemptId != null) { + FiCaSchedulerApp app = getApplicationAttempt(attemptId); + if (app != null) { + if (app.acceptResourceCommitRequest(getClusterResource(), request)) { + app.applyResourceCommitRequest(getClusterResource(), request); + } + } + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java index d5d1374..f476d8d 100644 ---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java @@ -202,6 +202,14 @@ SCHEDULE_ASYNCHRONOUSLY_PREFIX + ".enable"; @Private + public static final String SCHEDULE_GLOBALLY_PREFIX = + PREFIX + "schedule-globally"; + + @Private + public static final String SCHEDULE_GLOBALLY_ENABLE = + SCHEDULE_GLOBALLY_PREFIX + ".enable"; + + @Private public static final boolean DEFAULT_SCHEDULE_ASYNCHRONOUSLY_ENABLE = false; @Private diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java index c41a7bf..0688e6d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java @@ -83,4 +83,6 @@ ResourceUsage getClusterResourceUsage(); ActivitiesManager getActivitiesManager(); + + boolean globalSchedulingEnabled(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/GlobalSchedulingThread.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/GlobalSchedulingThread.java new file mode 100644 index 0000000..afdea37 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/GlobalSchedulingThread.java @@ -0,0 +1,153 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.yarn.api.records.NodeId; +import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; +import org.apache.hadoop.yarn.util.resource.Resources; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +public class GlobalSchedulingThread extends Thread { + private static final Log LOG = + LogFactory.getLog(GlobalSchedulingThread.class); + + // Schedulable nodes exclude nodes without available resources + // as well as nodes holding a reservation + private Map<NodeId, FiCaSchedulerNode> schedulableNodes; + private Map<NodeId, FiCaSchedulerNode> reservedNodes; + private Map<NodeId, FiCaSchedulerNode> exhaustedNodes; + + private long lastSyncNodeListVersion = -1; + private long lastRefreshNodeTS = -1; +
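Assuming PREFIX is the usual "yarn.scheduler.capacity." prefix, the new key resolves to yarn.scheduler.capacity.schedule-globally.enable. The hunk does not show the reading side, so the getter below, with a false default, is an assumption rather than the patch's code:

// Sketch of enabling the new flag; the false default is assumed since
// the reading code is not part of this hunk.
CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
conf.setBoolean(CapacitySchedulerConfiguration.SCHEDULE_GLOBALLY_ENABLE, true);
boolean scheduleGlobally = conf.getBoolean(
    CapacitySchedulerConfiguration.SCHEDULE_GLOBALLY_ENABLE, false);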
+ boolean scheduleOnReservedNodes = false; + int numContinuousNonReservedNodeScheduled = 0; + + private final CapacityScheduler cs; + + public GlobalSchedulingThread(CapacityScheduler cs) { + setName("GlobalSchedulingThread"); + this.cs = cs; + setDaemon(true); + + schedulableNodes = new ConcurrentHashMap<>(); + reservedNodes = new ConcurrentHashMap<>(); + exhaustedNodes = new ConcurrentHashMap<>(); + } + + @Override + public void run() { + // Refresh the cached node sets when needed, then do one scheduling pass + while (!Thread.currentThread().isInterrupted()) { + java.util.concurrent.locks.Lock readLock = + cs.getNodeTracker().getNodeListReadLock(); + try { + // Lock the node list to prevent modification while scheduling + readLock.lock(); + + // Refresh and sync from scheduler when necessary + refreshNodesWhenNecessary(); + + // Do scheduling on cached nodes + if (!schedulableNodes.isEmpty() || !reservedNodes.isEmpty()) { + schedule(); + } + } finally { + readLock.unlock(); + } + + sleep(); + } + } + + private void sleep() { + int sleepTime = 100; + if (cs.getNumClusterNodes() > 0) { + sleepTime = 1000 / cs.getNumClusterNodes(); + sleepTime = Math.max(sleepTime, 1); + } + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + // Restore the interrupt flag so the run() loop can exit + Thread.currentThread().interrupt(); + } + }
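The sleep() pacing above targets roughly one sweep over the cluster per second: with N nodes the thread pauses about 1000/N ms between passes, clamped to at least 1 ms, and falls back to 100 ms while no nodes are registered. For example:

// Worked example of the pacing rule in sleep() above.
static int pacingMillis(int numClusterNodes) {
  int sleepTime = 100;                 // default while no nodes are registered
  if (numClusterNodes > 0) {
    sleepTime = Math.max(1000 / numClusterNodes, 1);
  }
  return sleepTime;                    // 1 node -> 1000ms, 100 -> 10ms, 5000 -> 1ms
}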
+ + private void refreshNodesWhenNecessary() { + long now = System.currentTimeMillis(); + if (lastSyncNodeListVersion != cs.getNodeTracker() + .getNodeListVersion() || now - lastRefreshNodeTS >= 3000) { + // Force refresh nodes + schedulableNodes.clear(); + exhaustedNodes.clear(); + reservedNodes.clear(); + + for (FiCaSchedulerNode node : cs.getAllNodes()) { + if (Resources.lessThanOrEqual(cs.getResourceCalculator(), + cs.getClusterResource(), node.getUnallocatedResource(), + Resources.none())) { + // Exhausted nodes + exhaustedNodes.put(node.getNodeID(), node); + } else if (node.getReservedContainer() != null) { + // Reserved nodes + reservedNodes.put(node.getNodeID(), node); + } else { + // Schedulable nodes (have available resources and no reservation) + schedulableNodes.put(node.getNodeID(), node); + } + } + + LOG.info( + "Refreshed nodes, schedulable nodes = " + schedulableNodes.size()); + + lastSyncNodeListVersion = cs.getNodeTracker().getNodeListVersion(); + lastRefreshNodeTS = System.currentTimeMillis(); + } + } + + private void schedule() { + if (scheduleOnReservedNodes) { + for (FiCaSchedulerNode node : reservedNodes.values()) { + if (node.getReservedContainer() != null) { + cs.allocateOnReservedNode(node); + } + } + scheduleOnReservedNodes = false; + } else if (!schedulableNodes.isEmpty()) { + CSAssignment assignment = cs.getRootQueue().assignContainers( + cs.getClusterResource(), + new PlacementSet<>(null, schedulableNodes, + RMNodeLabelsManager.NO_LABEL), new ResourceLimits( + cs.getNodeLabelsManager() + .getResourceByLabel(RMNodeLabelsManager.NO_LABEL, + cs.getClusterResource())), + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + assignment.setSchedulingMode(SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + cs.submitResourceCommitRequest(cs.getClusterResource(), assignment); + + for (String partition : cs.getRMContext().getNodeLabelManager() + .getClusterNodeLabelNames()) { + assignment = cs.getRootQueue().assignContainers(cs.getClusterResource(), + new PlacementSet<>(null, schedulableNodes, partition), + new ResourceLimits(cs.getNodeLabelsManager() + .getResourceByLabel(partition, cs.getClusterResource())), + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + assignment.setSchedulingMode(SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + cs.submitResourceCommitRequest(cs.getClusterResource(), assignment); + } + + // Decide whether to schedule on reserved nodes in the next round + numContinuousNonReservedNodeScheduled++; + + if (numContinuousNonReservedNodeScheduled >= schedulableNodes.size()) { + scheduleOnReservedNodes = true; + numContinuousNonReservedNodeScheduled = 0; + } + } + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index 6bbe85e..474685f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -18,9 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; -import java.io.IOException; -import java.util.*; - +import com.google.common.annotations.VisibleForTesting; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -52,17 +51,22 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager; - import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedContainerChangeRequest; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.*; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityDiagnosticConstant; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt.AMState; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesLogger; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityDiagnosticConstant; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityState; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt.AMState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.KillableContainer; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ContainerAllocationContext; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ResourceCommitRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; +import
org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.FifoOrderingPolicyForPendingApps; @@ -72,7 +76,18 @@ import org.apache.hadoop.yarn.util.SystemClock; import org.apache.hadoop.yarn.util.resource.Resources; -import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; @Private @Unstable @@ -80,11 +95,11 @@ private static final Log LOG = LogFactory.getLog(LeafQueue.class); private float absoluteUsedCapacity = 0.0f; - private int userLimit; - private float userLimitFactor; + private volatile int userLimit; + private volatile float userLimitFactor; protected int maxApplications; - protected int maxApplicationsPerUser; + protected volatile int maxApplicationsPerUser; private float maxAMResourcePerQueuePercent; @@ -92,15 +107,15 @@ private volatile boolean rackLocalityFullReset; Map applicationAttemptMap = - new HashMap(); + new ConcurrentHashMap<>(); private Priority defaultAppPriorityPerQueue; - private OrderingPolicy pendingOrderingPolicy = null; + private volatile OrderingPolicy pendingOrderingPolicy = null; private volatile float minimumAllocationFactor; - private Map users = new HashMap(); + private ConcurrentMap users = new ConcurrentHashMap<>(); private final RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null); @@ -117,7 +132,7 @@ private volatile ResourceLimits cachedResourceLimitsForHeadroom = null; - private OrderingPolicy orderingPolicy = null; + private volatile OrderingPolicy orderingPolicy = null; // record all ignore partition exclusivityRMContainer, this will be used to do // preemption, key is the partition of the RMContainer allocated on @@ -143,125 +158,125 @@ public LeafQueue(CapacitySchedulerContext cs, setupQueueConfigs(cs.getClusterResource()); } - protected synchronized void setupQueueConfigs(Resource clusterResource) + protected void setupQueueConfigs(Resource clusterResource) throws IOException { - super.setupQueueConfigs(clusterResource); - - this.lastClusterResource = clusterResource; - - this.cachedResourceLimitsForHeadroom = new ResourceLimits(clusterResource); - - // Initialize headroom info, also used for calculating application - // master resource limits. 
Since this happens during queue initialization - // and all queues may not be realized yet, we'll use (optimistic) - // absoluteMaxCapacity (it will be replaced with the more accurate - // absoluteMaxAvailCapacity during headroom/userlimit/allocation events) - setQueueResourceLimitsInfo(clusterResource); + try { + writeLock.lock(); + super.setupQueueConfigs(clusterResource); - CapacitySchedulerConfiguration conf = csContext.getConfiguration(); - - setOrderingPolicy(conf.getOrderingPolicy(getQueuePath())); + this.lastClusterResource = clusterResource; - userLimit = conf.getUserLimit(getQueuePath()); - userLimitFactor = conf.getUserLimitFactor(getQueuePath()); + this.cachedResourceLimitsForHeadroom = new ResourceLimits( + clusterResource); - maxApplications = conf.getMaximumApplicationsPerQueue(getQueuePath()); - if (maxApplications < 0) { - int maxSystemApps = conf.getMaximumSystemApplications(); - maxApplications = - (int) (maxSystemApps * queueCapacities.getAbsoluteCapacity()); - } - maxApplicationsPerUser = Math.min(maxApplications, - (int)(maxApplications * (userLimit / 100.0f) * userLimitFactor)); - - maxAMResourcePerQueuePercent = - conf.getMaximumApplicationMasterResourcePerQueuePercent(getQueuePath()); - - if (!SchedulerUtils.checkQueueLabelExpression( - this.accessibleLabels, this.defaultLabelExpression, null)) { - throw new IOException("Invalid default label expression of " - + " queue=" - + getQueueName() - + " doesn't have permission to access all labels " - + "in default label expression. labelExpression of resource request=" - + (this.defaultLabelExpression == null ? "" - : this.defaultLabelExpression) - + ". Queue labels=" - + (getAccessibleNodeLabels() == null ? "" : StringUtils.join( - getAccessibleNodeLabels().iterator(), ','))); - } - - nodeLocalityDelay = conf.getNodeLocalityDelay(); - rackLocalityFullReset = conf.getRackLocalityFullReset(); + // Initialize headroom info, also used for calculating application + // master resource limits. 
Since this happens during queue initialization + // and all queues may not be realized yet, we'll use (optimistic) + // absoluteMaxCapacity (it will be replaced with the more accurate + // absoluteMaxAvailCapacity during headroom/userlimit/allocation events) + setQueueResourceLimitsInfo(clusterResource); - // re-init this since max allocation could have changed - this.minimumAllocationFactor = - Resources.ratio(resourceCalculator, - Resources.subtract(maximumAllocation, minimumAllocation), - maximumAllocation); + CapacitySchedulerConfiguration conf = csContext.getConfiguration(); - StringBuilder aclsString = new StringBuilder(); - for (Map.Entry e : acls.entrySet()) { - aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); - } + setOrderingPolicy( + conf.getOrderingPolicy(getQueuePath())); - StringBuilder labelStrBuilder = new StringBuilder(); - if (accessibleLabels != null) { - for (String s : accessibleLabels) { - labelStrBuilder.append(s); - labelStrBuilder.append(","); + userLimit = conf.getUserLimit(getQueuePath()); + userLimitFactor = conf.getUserLimitFactor(getQueuePath()); + + maxApplications = conf.getMaximumApplicationsPerQueue(getQueuePath()); + if (maxApplications < 0) { + int maxSystemApps = conf.getMaximumSystemApplications(); + maxApplications = + (int) (maxSystemApps * queueCapacities.getAbsoluteCapacity()); + } + maxApplicationsPerUser = Math.min(maxApplications, + (int) (maxApplications * (userLimit / 100.0f) * userLimitFactor)); + + maxAMResourcePerQueuePercent = + conf.getMaximumApplicationMasterResourcePerQueuePercent( + getQueuePath()); + + if (!SchedulerUtils.checkQueueLabelExpression(this.accessibleLabels, + this.defaultLabelExpression, null)) { + throw new IOException( + "Invalid default label expression of " + " queue=" + getQueueName() + + " doesn't have permission to access all labels " + + "in default label expression. labelExpression of resource request=" + + (this.defaultLabelExpression == null ? + "" : + this.defaultLabelExpression) + ". Queue labels=" + ( + getAccessibleNodeLabels() == null ? 
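The cap just computed is maxApplicationsPerUser = min(maxApplications, maxApplications * (userLimit / 100) * userLimitFactor). As a worked example (values are illustrative, not defaults):

// Worked example of the per-user application cap computed above
// (values are illustrative only).
int maxApplications = 10000;
int userLimit = 25;            // percent, as in the config
float userLimitFactor = 2.0f;
int maxApplicationsPerUser = Math.min(maxApplications,
    (int) (maxApplications * (userLimit / 100.0f) * userLimitFactor));
// 10000 * 0.25 * 2 = 5000, so maxApplicationsPerUser == 5000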
+ "" : + StringUtils + .join(getAccessibleNodeLabels().iterator(), ','))); + } + + nodeLocalityDelay = conf.getNodeLocalityDelay(); + rackLocalityFullReset = conf.getRackLocalityFullReset(); + + // re-init this since max allocation could have changed + this.minimumAllocationFactor = Resources.ratio(resourceCalculator, + Resources.subtract(maximumAllocation, minimumAllocation), + maximumAllocation); + + StringBuilder aclsString = new StringBuilder(); + for (Map.Entry e : acls.entrySet()) { + aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); + } + + StringBuilder labelStrBuilder = new StringBuilder(); + if (accessibleLabels != null) { + for (String s : accessibleLabels) { + labelStrBuilder.append(s); + labelStrBuilder.append(","); + } } - } - defaultAppPriorityPerQueue = Priority.newInstance(conf - .getDefaultApplicationPriorityConfPerQueue(getQueuePath())); - - LOG.info("Initializing " + queueName + "\n" + - "capacity = " + queueCapacities.getCapacity() + - " [= (float) configuredCapacity / 100 ]" + "\n" + - "asboluteCapacity = " + queueCapacities.getAbsoluteCapacity() + - " [= parentAbsoluteCapacity * capacity ]" + "\n" + - "maxCapacity = " + queueCapacities.getMaximumCapacity() + - " [= configuredMaxCapacity ]" + "\n" + - "absoluteMaxCapacity = " + queueCapacities.getAbsoluteMaximumCapacity() + - " [= 1.0 maximumCapacity undefined, " + - "(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + - "\n" + - "userLimit = " + userLimit + - " [= configuredUserLimit ]" + "\n" + - "userLimitFactor = " + userLimitFactor + - " [= configuredUserLimitFactor ]" + "\n" + - "maxApplications = " + maxApplications + - " [= configuredMaximumSystemApplicationsPerQueue or" + - " (int)(configuredMaximumSystemApplications * absoluteCapacity)]" + - "\n" + - "maxApplicationsPerUser = " + maxApplicationsPerUser + - " [= (int)(maxApplications * (userLimit / 100.0f) * " + - "userLimitFactor) ]" + "\n" + - "usedCapacity = " + queueCapacities.getUsedCapacity() + - " [= usedResourcesMemory / " + - "(clusterResourceMemory * absoluteCapacity)]" + "\n" + - "absoluteUsedCapacity = " + absoluteUsedCapacity + - " [= usedResourcesMemory / clusterResourceMemory]" + "\n" + - "maxAMResourcePerQueuePercent = " + maxAMResourcePerQueuePercent + - " [= configuredMaximumAMResourcePercent ]" + "\n" + - "minimumAllocationFactor = " + minimumAllocationFactor + - " [= (float)(maximumAllocationMemory - minimumAllocationMemory) / " + - "maximumAllocationMemory ]" + "\n" + - "maximumAllocation = " + maximumAllocation + - " [= configuredMaxAllocation ]" + "\n" + - "numContainers = " + numContainers + - " [= currentNumContainers ]" + "\n" + - "state = " + state + - " [= configuredState ]" + "\n" + - "acls = " + aclsString + - " [= configuredAcls ]" + "\n" + - "nodeLocalityDelay = " + nodeLocalityDelay + "\n" + - "labels=" + labelStrBuilder.toString() + "\n" + - "reservationsContinueLooking = " + - reservationsContinueLooking + "\n" + - "preemptionDisabled = " + getPreemptionDisabled() + "\n" + - "defaultAppPriorityPerQueue = " + defaultAppPriorityPerQueue); + defaultAppPriorityPerQueue = Priority.newInstance( + conf.getDefaultApplicationPriorityConfPerQueue(getQueuePath())); + + LOG.info( + "Initializing " + queueName + "\n" + "capacity = " + queueCapacities + .getCapacity() + " [= (float) configuredCapacity / 100 ]" + "\n" + + "asboluteCapacity = " + queueCapacities.getAbsoluteCapacity() + + " [= parentAbsoluteCapacity * capacity ]" + "\n" + + "maxCapacity = " + queueCapacities.getMaximumCapacity() + + " [= 
configuredMaxCapacity ]" + "\n" + "absoluteMaxCapacity = " + + queueCapacities.getAbsoluteMaximumCapacity() + + " [= 1.0 maximumCapacity undefined, " + + "(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + + "\n" + "userLimit = " + userLimit + " [= configuredUserLimit ]" + + "\n" + "userLimitFactor = " + userLimitFactor + + " [= configuredUserLimitFactor ]" + "\n" + "maxApplications = " + + maxApplications + + " [= configuredMaximumSystemApplicationsPerQueue or" + + " (int)(configuredMaximumSystemApplications * absoluteCapacity)]" + + "\n" + "maxApplicationsPerUser = " + maxApplicationsPerUser + + " [= (int)(maxApplications * (userLimit / 100.0f) * " + + "userLimitFactor) ]" + "\n" + "usedCapacity = " + + queueCapacities.getUsedCapacity() + " [= usedResourcesMemory / " + + "(clusterResourceMemory * absoluteCapacity)]" + "\n" + + "absoluteUsedCapacity = " + absoluteUsedCapacity + + " [= usedResourcesMemory / clusterResourceMemory]" + "\n" + + "maxAMResourcePerQueuePercent = " + maxAMResourcePerQueuePercent + + " [= configuredMaximumAMResourcePercent ]" + "\n" + + "minimumAllocationFactor = " + minimumAllocationFactor + + " [= (float)(maximumAllocationMemory - minimumAllocationMemory) / " + + "maximumAllocationMemory ]" + "\n" + "maximumAllocation = " + + maximumAllocation + " [= configuredMaxAllocation ]" + "\n" + + "numContainers = " + numContainers + + " [= currentNumContainers ]" + "\n" + "state = " + state + + " [= configuredState ]" + "\n" + "acls = " + aclsString + + " [= configuredAcls ]" + "\n" + "nodeLocalityDelay = " + + nodeLocalityDelay + "\n" + "labels=" + labelStrBuilder + .toString() + "\n" + "reservationsContinueLooking = " + + reservationsContinueLooking + "\n" + "preemptionDisabled = " + + getPreemptionDisabled() + "\n" + "defaultAppPriorityPerQueue = " + + defaultAppPriorityPerQueue); + } finally { + writeLock.unlock(); + } } @Override @@ -289,7 +304,7 @@ public int getMaxApplications() { return maxApplications; } - public synchronized int getMaxApplicationsPerUser() { + public int getMaxApplicationsPerUser() { return maxApplicationsPerUser; } @@ -307,7 +322,7 @@ public ActiveUsersManager getActiveUsersManager() { * Set user limit - used only for testing. * @param userLimit new user limit */ - synchronized void setUserLimit(int userLimit) { + void setUserLimit(int userLimit) { this.userLimit = userLimit; } @@ -315,50 +330,60 @@ synchronized void setUserLimit(int userLimit) { * Set user limit factor - used only for testing. 
* @param userLimitFactor new user limit factor */ - synchronized void setUserLimitFactor(float userLimitFactor) { + void setUserLimitFactor(float userLimitFactor) { this.userLimitFactor = userLimitFactor; } @Override - public synchronized int getNumApplications() { - return getNumPendingApplications() + getNumActiveApplications(); - } - - public synchronized int getNumPendingApplications() { - return pendingOrderingPolicy.getNumSchedulableEntities(); + public int getNumApplications() { + try { + readLock.lock(); + return getNumPendingApplications() + getNumActiveApplications(); + } finally { + readLock.unlock(); + } } - public synchronized int getNumActiveApplications() { - return orderingPolicy.getNumSchedulableEntities(); + public int getNumPendingApplications() { + try { + readLock.lock(); + return pendingOrderingPolicy.getNumSchedulableEntities(); + } finally { + readLock.unlock(); + } } - @Private - public synchronized int getNumApplications(String user) { - return getUser(user).getTotalApplications(); + public int getNumActiveApplications() { + try { + readLock.lock(); + return orderingPolicy.getNumSchedulableEntities(); + } finally { + readLock.unlock(); + } } @Private - public synchronized int getNumPendingApplications(String user) { - return getUser(user).getPendingApplications(); + public int getNumPendingApplications(String user) { + return getOrDefault(user).getPendingApplications(); } @Private - public synchronized int getNumActiveApplications(String user) { - return getUser(user).getActiveApplications(); + public int getNumActiveApplications(String user) { + return getOrDefault(user).getActiveApplications(); } @Override - public synchronized QueueState getState() { + public QueueState getState() { return state; } @Private - public synchronized int getUserLimit() { + public int getUserLimit() { return userLimit; } @Private - public synchronized float getUserLimitFactor() { + public float getUserLimitFactor() { return userLimitFactor; } @@ -370,20 +395,24 @@ public QueueInfo getQueueInfo( } @Override - public synchronized List - getQueueUserAclInfo(UserGroupInformation user) { - QueueUserACLInfo userAclInfo = - recordFactory.newRecordInstance(QueueUserACLInfo.class); - List operations = new ArrayList(); - for (QueueACL operation : QueueACL.values()) { - if (hasAccess(operation, user)) { - operations.add(operation); + public List getQueueUserAclInfo(UserGroupInformation user) { + try { + readLock.lock(); + QueueUserACLInfo userAclInfo = recordFactory.newRecordInstance( + QueueUserACLInfo.class); + List operations = new ArrayList(); + for (QueueACL operation : QueueACL.values()) { + if (hasAccess(operation, user)) { + operations.add(operation); + } } - } - userAclInfo.setQueueName(getQueueName()); - userAclInfo.setUserAcls(operations); - return Collections.singletonList(userAclInfo); + userAclInfo.setQueueName(getQueueName()); + userAclInfo.setUserAcls(operations); + return Collections.singletonList(userAclInfo); + } finally { + readLock.unlock(); + } } public String toString() { @@ -393,83 +422,87 @@ public String toString() { "usedResources=" + queueUsage.getUsed() + ", " + "usedCapacity=" + getUsedCapacity() + ", " + "absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + ", " + - "numApps=" + getNumApplications() + ", " + - "numContainers=" + getNumContainers(); + "numContainers=" + getNumContainers(); } @VisibleForTesting - public synchronized void setNodeLabelManager(RMNodeLabelsManager mgr) { + public void setNodeLabelManager(RMNodeLabelsManager mgr) { this.labelManager = 
mgr; } @VisibleForTesting - public synchronized User getUser(String userName) { - User user = users.get(userName); - if (user == null) { - user = new User(); - users.put(userName, user); - } - return user; + public User getOrDefault(String userName) { + // Create and register the user on first access so later updates stick + return users.computeIfAbsent(userName, k -> new User()); } /** * @return an ArrayList of UserInfo objects who are active in this queue */ - public synchronized ArrayList getUsers() { - ArrayList usersToReturn = new ArrayList(); - for (Map.Entry entry : users.entrySet()) { - User user = entry.getValue(); - usersToReturn.add(new UserInfo(entry.getKey(), Resources.clone(user - .getAllUsed()), user.getActiveApplications(), user - .getPendingApplications(), Resources.clone(user - .getConsumedAMResources()), Resources.clone(user - .getUserResourceLimit()), user.getResourceUsage())); + public ArrayList getUsers() { + try { + readLock.lock(); + ArrayList usersToReturn = new ArrayList(); + for (Map.Entry entry : users.entrySet()) { + User user = entry.getValue(); + usersToReturn.add(new UserInfo(entry.getKey(), Resources.clone(user.getAllUsed()), + user.getActiveApplications(), user.getPendingApplications(), + Resources.clone(user.getConsumedAMResources()), Resources.clone(user.getUserResourceLimit()), + user.getResourceUsage())); + } + return usersToReturn; + } finally { + readLock.unlock(); } - return usersToReturn; } @Override - public synchronized void reinitialize( - CSQueue newlyParsedQueue, Resource clusterResource) - throws IOException { - // Sanity check - if (!(newlyParsedQueue instanceof LeafQueue) || - !newlyParsedQueue.getQueuePath().equals(getQueuePath())) { - throw new IOException("Trying to reinitialize " + getQueuePath() + - " from " + newlyParsedQueue.getQueuePath()); - } - - LeafQueue newlyParsedLeafQueue = (LeafQueue)newlyParsedQueue; - - // don't allow the maximum allocation to be decreased in size - // since we have already told running AM's the size - Resource oldMax = getMaximumAllocation(); - Resource newMax = newlyParsedLeafQueue.getMaximumAllocation(); - if (newMax.getMemorySize() < oldMax.getMemorySize() - || newMax.getVirtualCores() < oldMax.getVirtualCores()) { - throw new IOException( - "Trying to reinitialize " - + getQueuePath() - + " the maximum allocation size can not be decreased!" - + " Current setting: " + oldMax - + ", trying to set it to: " + newMax); - } - - setupQueueConfigs(clusterResource); - - // queue metrics are updated, more resource may be available - // activate the pending applications if possible - activateApplications(); + public void reinitialize( + CSQueue newlyParsedQueue, Resource clusterResource) throws IOException { + try { + writeLock.lock(); + // Sanity check + if (!(newlyParsedQueue instanceof LeafQueue) || !newlyParsedQueue + .getQueuePath().equals(getQueuePath())) { + throw new IOException( + "Trying to reinitialize " + getQueuePath() + " from " + + newlyParsedQueue.getQueuePath()); + } + + LeafQueue newlyParsedLeafQueue = (LeafQueue) newlyParsedQueue; + + // don't allow the maximum allocation to be decreased in size + // since we have already told running AM's the size + Resource oldMax = getMaximumAllocation(); + Resource newMax = newlyParsedLeafQueue.getMaximumAllocation(); + if (newMax.getMemorySize() < oldMax.getMemorySize() + || newMax.getVirtualCores() < oldMax.getVirtualCores()) { + throw new IOException("Trying to reinitialize " + getQueuePath() + + " the maximum allocation size can not be decreased!"
+ + " Current setting: " + oldMax + ", trying to set it to: " + + newMax); + } + + setupQueueConfigs(clusterResource); + + // queue metrics are updated, more resource may be available + // activate the pending applications if possible + activateApplications(); + } finally { + writeLock.unlock(); + } } @Override public void submitApplicationAttempt(FiCaSchedulerApp application, String userName) { // Careful! Locking order is important! - synchronized (this) { - User user = getUser(userName); + try { + writeLock.lock(); + User user = getOrDefault(userName); // Add the attempt to our data-structures addApplicationAttempt(application, user); + } finally { + writeLock.unlock(); } // We don't want to update metrics for move app @@ -484,9 +517,8 @@ public void submitApplication(ApplicationId applicationId, String userName, String queue) throws AccessControlException { // Careful! Locking order is important! - User user = null; - synchronized (this) { - + try { + writeLock.lock(); // Check if the queue is accepting jobs if (getState() != QueueState.RUNNING) { String msg = "Queue " + getQueuePath() + @@ -505,7 +537,11 @@ public void submitApplication(ApplicationId applicationId, String userName, } // Check submission limits for the user on this queue - user = getUser(userName); + User user = users.get(userName); + if (null == user) { + user = new User(); + users.put(userName, user); + } if (user.getTotalApplications() >= getMaxApplicationsPerUser()) { String msg = "Queue " + getQueuePath() + " already has " + user.getTotalApplications() + @@ -514,6 +550,8 @@ public void submitApplication(ApplicationId applicationId, String userName, LOG.info(msg); throw new AccessControlException(msg); } + } finally { + writeLock.unlock(); } // Inform the parent queue @@ -535,17 +573,27 @@ public Resource getAMResourceLimitPerPartition(String nodePartition) { return queueUsage.getAMLimit(nodePartition); } - public synchronized Resource calculateAndGetAMResourceLimit() { - return calculateAndGetAMResourceLimitPerPartition( - RMNodeLabelsManager.NO_LABEL); + public Resource calculateAndGetAMResourceLimit() { + try { + readLock.lock(); + return calculateAndGetAMResourceLimitPerPartition( + RMNodeLabelsManager.NO_LABEL); + } finally { + readLock.unlock(); + } } @VisibleForTesting - public synchronized Resource getUserAMResourceLimit() { - return getUserAMResourceLimitPerPartition(RMNodeLabelsManager.NO_LABEL); + public Resource getUserAMResourceLimit() { + try { + readLock.lock(); + return getUserAMResourceLimitPerPartition(RMNodeLabelsManager.NO_LABEL); + } finally { + readLock.unlock(); + } } - public synchronized Resource getUserAMResourceLimitPerPartition( + public Resource getUserAMResourceLimitPerPartition( String nodePartition) { /* * The user am resource limit is based on the same approach as the user @@ -571,7 +619,7 @@ public synchronized Resource getUserAMResourceLimitPerPartition( : getAMResourceLimitPerPartition(nodePartition); } - public synchronized Resource calculateAndGetAMResourceLimitPerPartition( + public Resource calculateAndGetAMResourceLimitPerPartition( String nodePartition) { /* * For non-labeled partition, get the max value from resources currently @@ -612,7 +660,7 @@ public synchronized Resource calculateAndGetAMResourceLimitPerPartition( return amResouceLimit; } - private synchronized void activateApplications() { + private void activateApplications() { // limit of allowed resource usage for application masters Map userAmPartitionLimit = new HashMap(); @@ -674,7 +722,7 @@ private 
synchronized void activateApplications() { } // Check user am resource limit - User user = getUser(application.getUser()); + User user = getOrDefault(application.getUser()); Resource userAMLimit = userAmPartitionLimit.get(partitionName); // Verify whether we already calculated user-am-limit for this label. @@ -724,7 +772,7 @@ private synchronized void activateApplications() { } } - private synchronized void addApplicationAttempt(FiCaSchedulerApp application, + private void addApplicationAttempt(FiCaSchedulerApp application, User user) { // Accept user.submitApplication(); @@ -756,13 +804,16 @@ public void finishApplication(ApplicationId application, String user) { @Override public void finishApplicationAttempt(FiCaSchedulerApp application, String queue) { // Careful! Locking order is important! - synchronized (this) { - removeApplicationAttempt(application, getUser(application.getUser())); + try { + writeLock.lock(); + removeApplicationAttempt(application, getOrDefault(application.getUser())); + } finally { + writeLock.unlock(); } getParent().finishApplicationAttempt(application, queue); } - public synchronized void removeApplicationAttempt( + private void removeApplicationAttempt( FiCaSchedulerApp application, User user) { String partitionName = application.getAppAMNodePartitionName(); boolean wasActive = @@ -797,15 +848,18 @@ public synchronized void removeApplicationAttempt( ); } - private synchronized FiCaSchedulerApp getApplication( + private FiCaSchedulerApp getApplication( ApplicationAttemptId applicationAttemptId) { return applicationAttemptMap.get(applicationAttemptId); } private void handleExcessReservedContainer(Resource clusterResource, - CSAssignment assignment, FiCaSchedulerNode node, FiCaSchedulerApp app) { + CSAssignment assignment) { if (assignment.getExcessReservation() != null) { RMContainer excessReservedContainer = assignment.getExcessReservation(); + FiCaSchedulerNode node = csContext.getNode( + excessReservedContainer.getReservedNode()); + FiCaSchedulerApp app = assignment.getApplication(); if (excessReservedContainer.hasIncreaseReservation()) { unreserveIncreasedContainer(clusterResource, @@ -825,7 +879,6 @@ private void handleExcessReservedContainer(Resource clusterResource, } private void killToPreemptContainers(Resource clusterResource, - FiCaSchedulerNode node, CSAssignment assignment) { if (assignment.getContainersToKill() != null) { StringBuilder sb = new StringBuilder("Killing containers: ["); @@ -834,6 +887,7 @@ private void killToPreemptContainers(Resource clusterResource, FiCaSchedulerApp application = csContext.getApplicationAttempt( c.getApplicationAttemptId()); LeafQueue q = application.getCSLeafQueue(); + FiCaSchedulerNode node = csContext.getNode(c.getAllocatedNode()); q.completedContainer(clusterResource, application, node, c, SchedulerUtils .createPreemptedContainerStatus(c.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL, @@ -858,42 +912,75 @@ private void setPreemptionAllowed(ResourceLimits limits, String nodePartition) { limits.setIsAllowPreemption(usedCapacity < guaranteedCapacity); } - @Override - public synchronized CSAssignment assignContainers(Resource clusterResource, - FiCaSchedulerNode node, ResourceLimits currentResourceLimits, - SchedulingMode schedulingMode) { - updateCurrentResourceLimits(currentResourceLimits, clusterResource); - - if (LOG.isDebugEnabled()) { - LOG.debug("assignContainers: node=" + node.getNodeName() - + " #applications=" + orderingPolicy.getNumSchedulableEntities()); + private 
CSAssignment handleReservedContainer( + Resource clusterResource, PlacementSet placementSet, + ResourceLimits currentResourceLimits, SchedulingMode schedulingMode) { + FiCaSchedulerNode node = + (FiCaSchedulerNode) placementSet.getNextAvailable(); + if (null == node) { + return null; } - setPreemptionAllowed(currentResourceLimits, node.getPartition()); - - // Check for reserved resources RMContainer reservedContainer = node.getReservedContainer(); if (reservedContainer != null) { - FiCaSchedulerApp application = - getApplication(reservedContainer.getApplicationAttemptId()); + FiCaSchedulerApp application = getApplication( + reservedContainer.getApplicationAttemptId()); ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager, - node.getNodeID(), SystemClock.getInstance().getTime(), application); - + node, SystemClock.getInstance().getTime(), application); synchronized (application) { - CSAssignment assignment = - application.assignContainers(clusterResource, node, - currentResourceLimits, schedulingMode, reservedContainer); - handleExcessReservedContainer(clusterResource, assignment, node, - application); - killToPreemptContainers(clusterResource, node, assignment); + CSAssignment assignment = application.assignContainers(clusterResource, + placementSet, currentResourceLimits, schedulingMode, + reservedContainer); + /* Handled by applyCommitRequest + handleExcessReservedContainer(clusterResource, assignment); + killToPreemptContainers(clusterResource, assignment); + */ return assignment; } } + return null; + } + + @Override + public CSAssignment assignContainers(Resource clusterResource, + PlacementSet placementSet, ResourceLimits currentResourceLimits, + SchedulingMode schedulingMode) { + try { + readLock.lock(); + return internalAssignContainers(clusterResource, placementSet, + currentResourceLimits, schedulingMode); + } finally { + readLock.unlock(); + } + } + + private CSAssignment internalAssignContainers(Resource clusterResource, + PlacementSet placementSet, ResourceLimits currentResourceLimits, + SchedulingMode schedulingMode) { + updateCurrentResourceLimits(currentResourceLimits, clusterResource); + SchedulerNode node = placementSet.getNextAvailable(); + + if (LOG.isDebugEnabled()) { + LOG.debug("assignContainers: nodePartition=" + placementSet + .getPartition() + " #applications=" + orderingPolicy + .getNumSchedulableEntities()); + } + + setPreemptionAllowed(currentResourceLimits, + placementSet.getPartition()); + + // Check for reserved resources= + CSAssignment assignment = handleReservedContainer(clusterResource, + placementSet, currentResourceLimits, schedulingMode); + if (null != assignment) { + return assignment; + } + // if our queue cannot access this node, just return if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY - && !accessibleToPartition(node.getPartition())) { + && !accessibleToPartition(placementSet.getPartition())) { ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, getParent().getQueueName(), getQueueName(), ActivityState.REJECTED, ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + node @@ -903,12 +990,12 @@ public synchronized CSAssignment assignContainers(Resource clusterResource, // Check if this queue need more resource, simply skip allocation if this // queue doesn't need more resources. 
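internalAssignContainers applies cheap gates in order before attempting any real allocation: partition accessibility for the queue, pending demand for the partition, then per-application queue max-capacity and user-limit checks inside the ordering-policy loop. An outline of that gate ordering, with hypothetical Gate/app types standing in for the real checks:

// Sketch of the gate ordering in internalAssignContainers; Gate and the
// String app ids are hypothetical stand-ins, not real scheduler types.
interface Gate { boolean passes(String app); }

static String firstSchedulableApp(java.util.List<String> apps,
    Gate queueMaxCapacity, Gate userLimit) {
  for (String app : apps) {              // like the orderingPolicy iterator
    if (!queueMaxCapacity.passes(app)) { // like canAssignToThisQueue(...)
      continue;
    }
    if (!userLimit.passes(app)) {        // like canAssignToUser(...)
      continue;
    }
    return app;                          // would call app.assignContainers(...)
  }
  return null;
}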
- if (!hasPendingResourceRequest(node.getPartition(), clusterResource, - schedulingMode)) { + if (!hasPendingResourceRequest(placementSet.getPartition(), + clusterResource, schedulingMode)) { if (LOG.isDebugEnabled()) { LOG.debug("Skip this queue=" + getQueuePath() + ", because it doesn't need more resource, schedulingMode=" - + schedulingMode.name() + " node-partition=" + node.getPartition()); + + schedulingMode.name() + " node-partition=" + placementSet.getPartition()); } ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, @@ -916,15 +1003,16 @@ public synchronized CSAssignment assignContainers(Resource clusterResource, return CSAssignment.NULL_ASSIGNMENT; } + // TODO: ordering policy cannot work properly under multi threading env for (Iterator assignmentIterator = orderingPolicy.getAssignmentIterator(); assignmentIterator.hasNext();) { FiCaSchedulerApp application = assignmentIterator.next(); ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager, - node.getNodeID(), SystemClock.getInstance().getTime(), application); + node, SystemClock.getInstance().getTime(), application); // Check queue max-capacity limit - if (!super.canAssignToThisQueue(clusterResource, node.getPartition(), + if (!super.canAssignToThisQueue(clusterResource, placementSet.getPartition(), currentResourceLimits, application.getCurrentReservation(), schedulingMode)) { ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue( @@ -939,11 +1027,11 @@ public synchronized CSAssignment assignContainers(Resource clusterResource, Resource userLimit = computeUserLimitAndSetHeadroom(application, clusterResource, - node.getPartition(), schedulingMode); + placementSet.getPartition(), schedulingMode); // Check user limit if (!canAssignToUser(clusterResource, application.getUser(), userLimit, - application, node.getPartition(), currentResourceLimits)) { + application, placementSet.getPartition(), currentResourceLimits)) { application.updateAMContainerDiagnostics(AMState.ACTIVATED, "User capacity has reached its maximum limit."); ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue( @@ -954,9 +1042,7 @@ public synchronized CSAssignment assignContainers(Resource clusterResource, } // Try to schedule - CSAssignment assignment = - application.assignContainers(clusterResource, node, - currentResourceLimits, schedulingMode, null); + assignment = application.assignContainers(clusterResource, placementSet, currentResourceLimits, schedulingMode, null); if (LOG.isDebugEnabled()) { LOG.debug("post-assignContainers for application " @@ -964,15 +1050,18 @@ public synchronized CSAssignment assignContainers(Resource clusterResource, application.showRequests(); } + // Did we schedule or reserve a container? 
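The blocks commented out as "Handled by applyCommitRequest" mark bookkeeping that moves from the scheduling pass into the commit phase: under global scheduling, assignContainers runs under a read lock against a possibly stale node snapshot, so usage accounting must wait until the proposal is accepted. A schematic of that read-then-commit split, with hypothetical types rather than the real queue classes:

// Hypothetical illustration: reads happen in the (concurrent) schedule
// phase, mutations only in the (serialized) commit phase.
final class QueueUsage {
  private long used;                         // mutated only during commit
  synchronized void commit(long assigned) { used += assigned; }
  synchronized long snapshot() { return used; }
}

static void schedulePhase(QueueUsage usage, long demand,
    java.util.Queue<Long> commitQueue) {
  if (usage.snapshot() + demand <= 100) {    // read-only feasibility check
    commitQueue.add(demand);                 // defer the mutation
  }
}

static void commitPhase(QueueUsage usage, java.util.Queue<Long> commitQueue) {
  Long assigned;
  while ((assigned = commitQueue.poll()) != null) {
    usage.commit(assigned);                  // bookkeeping applied here
  }
}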
Resource assigned = assignment.getResource(); - - handleExcessReservedContainer(clusterResource, assignment, node, - application); - killToPreemptContainers(clusterResource, node, assignment); + + /* Handled by applyCommitRequest + handleExcessReservedContainer(clusterResource, assignment); + killToPreemptContainers(clusterResource, assignment); + */ if (Resources.greaterThan(resourceCalculator, clusterResource, assigned, Resources.none())) { + /* Handled by applyCommitRequest // Get reserved or allocated container from application RMContainer reservedOrAllocatedRMContainer = application.getRMContainer(assignment.getAssignmentInformation() @@ -981,14 +1070,14 @@ public synchronized CSAssignment assignContainers(Resource clusterResource, // Book-keeping // Note: Update headroom to account for current allocation too... allocateResource(clusterResource, application, assigned, - node.getPartition(), reservedOrAllocatedRMContainer, + placementSet.getPartition(), reservedOrAllocatedRMContainer, assignment.isIncreasedAllocation()); // Update reserved metrics Resource reservedRes = assignment.getAssignmentInformation() .getReserved(); if (reservedRes != null && !reservedRes.equals(Resources.none())) { - incReservedResource(node.getPartition(), reservedRes); + incReservedResource(placementSet.getPartition(), reservedRes); } ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, @@ -996,13 +1085,14 @@ public synchronized CSAssignment assignContainers(Resource clusterResource, ActivityDiagnosticConstant.EMPTY); // Done + */ return assignment; } else if (assignment.getSkippedType() == CSAssignment.SkippedType.OTHER) { ActivitiesLogger.APP.finishSkippedAppAllocationRecording( activitiesManager, application.getApplicationId(), ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY); - application.updateNodeInfoForAMDiagnostics(node); + application.updateNodeInfoForAMDiagnostics(placementSet); } else if(assignment.getSkippedType() == CSAssignment.SkippedType.QUEUE_LIMIT) { return assignment; @@ -1102,7 +1192,7 @@ Resource computeUserLimitAndSetHeadroom(FiCaSchedulerApp application, Resource clusterResource, String nodePartition, SchedulingMode schedulingMode) { String user = application.getUser(); - User queueUser = getUser(user); + User queueUser = getOrDefault(user); // Compute user limit respect requested labels, // TODO, need consider headroom respect labels also @@ -1247,51 +1337,49 @@ private Resource computeUserLimit(FiCaSchedulerApp application, } @Private - protected synchronized boolean canAssignToUser(Resource clusterResource, + protected boolean canAssignToUser(Resource clusterResource, String userName, Resource limit, FiCaSchedulerApp application, String nodePartition, ResourceLimits currentResourceLimits) { - User user = getUser(userName); - - currentResourceLimits.setAmountNeededUnreserve(Resources.none()); - - // Note: We aren't considering the current request since there is a fixed - // overhead of the AM, but it's a > check, not a >= check, so... 
- if (Resources - .greaterThan(resourceCalculator, clusterResource, - user.getUsed(nodePartition), - limit)) { - // if enabled, check to see if could we potentially use this node instead - // of a reserved node if the application has reserved containers - if (this.reservationsContinueLooking && - nodePartition.equals(CommonNodeLabelsManager.NO_LABEL)) { - if (Resources.lessThanOrEqual( - resourceCalculator, - clusterResource, - Resources.subtract(user.getUsed(), - application.getCurrentReservation()), limit)) { - - if (LOG.isDebugEnabled()) { - LOG.debug("User " + userName + " in queue " + getQueueName() - + " will exceed limit based on reservations - " + " consumed: " - + user.getUsed() + " reserved: " - + application.getCurrentReservation() + " limit: " + limit); + try { + readLock.lock(); + User user = getOrDefault(userName); + + currentResourceLimits.setAmountNeededUnreserve(Resources.none()); + + // Note: We aren't considering the current request since there is a fixed + // overhead of the AM, but it's a > check, not a >= check, so... + if (Resources.greaterThan(resourceCalculator, clusterResource, user.getUsed(nodePartition), + limit)) { + // if enabled, check to see if we could potentially use this node instead + // of a reserved node if the application has reserved containers + if (this.reservationsContinueLooking && nodePartition.equals( + CommonNodeLabelsManager.NO_LABEL)) { + if (Resources.lessThanOrEqual(resourceCalculator, clusterResource, + Resources.subtract(user.getUsed(), application.getCurrentReservation()), limit)) { + + if (LOG.isDebugEnabled()) { + LOG.debug("User " + userName + " in queue " + getQueueName() + " will exceed limit based on reservations - " + + " consumed: " + user.getUsed() + " reserved: " + application + .getCurrentReservation() + " limit: " + limit); + } + Resource amountNeededToUnreserve = Resources.subtract(user.getUsed(nodePartition), limit); + // we can only acquire a new container if we unreserve first to + // respect user-limit + currentResourceLimits.setAmountNeededUnreserve( + amountNeededToUnreserve); + return true; } - Resource amountNeededToUnreserve = - Resources.subtract(user.getUsed(nodePartition), limit); - // we can only acquire a new container if we unreserve first to - // respect user-limit - currentResourceLimits.setAmountNeededUnreserve(amountNeededToUnreserve); - return true; } + if (LOG.isDebugEnabled()) { + LOG.debug("User " + userName + " in queue " + getQueueName() + " will exceed limit - " + " consumed: " + + user.getUsed(nodePartition) + " limit: " + limit); + } + return false; } - if (LOG.isDebugEnabled()) { - LOG.debug("User " + userName + " in queue " + getQueueName() - + " will exceed limit - " + " consumed: " - + user.getUsed(nodePartition) + " limit: " + limit); - } - return false; + return true; + } finally { + readLock.unlock(); } - return true; } @Override @@ -1300,7 +1388,8 @@ public void unreserveIncreasedContainer(Resource clusterResource, boolean removed = false; Priority priority = null; - synchronized (this) { + try { + writeLock.lock(); if (rmContainer.getContainer() != null) { priority = rmContainer.getContainer().getPriority(); } @@ -1318,6 +1407,8 @@ public void unreserveIncreasedContainer(Resource clusterResource, releaseResource(clusterResource, app, rmContainer.getReservedResource(), node.getPartition(), rmContainer, true); } + } finally { + writeLock.unlock(); } if (removed) { @@ -1368,8 +1459,8 @@ public void completedContainer(Resource clusterResource, boolean removed = false; // Careful!
Locking order is important! - synchronized (this) { - + try { + writeLock.lock(); Container container = rmContainer.getContainer(); // Inform the application & the node @@ -1396,6 +1487,8 @@ public void completedContainer(Resource clusterResource, releaseResource(clusterResource, application, container.getResource(), node.getPartition(), rmContainer, false); } + } finally { + writeLock.unlock(); } if (removed) { @@ -1410,75 +1503,84 @@ public void completedContainer(Resource clusterResource, new KillableContainer(rmContainer, node.getPartition(), queueName)); } - synchronized void allocateResource(Resource clusterResource, + void allocateResource(Resource clusterResource, SchedulerApplicationAttempt application, Resource resource, String nodePartition, RMContainer rmContainer, boolean isIncreasedAllocation) { - super.allocateResource(clusterResource, resource, nodePartition, - isIncreasedAllocation); - - // handle ignore exclusivity container - if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( - RMNodeLabelsManager.NO_LABEL) - && !nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { - TreeSet rmContainers = null; - if (null == (rmContainers = - ignorePartitionExclusivityRMContainers.get(nodePartition))) { - rmContainers = new TreeSet<>(); - ignorePartitionExclusivityRMContainers.put(nodePartition, rmContainers); + try { + writeLock.lock(); + super.allocateResource(clusterResource, resource, nodePartition, + isIncreasedAllocation); + + // handle ignore exclusivity container + if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( + RMNodeLabelsManager.NO_LABEL) && !nodePartition.equals( + RMNodeLabelsManager.NO_LABEL)) { + TreeSet rmContainers = null; + if (null == (rmContainers = ignorePartitionExclusivityRMContainers.get( + nodePartition))) { + rmContainers = new TreeSet<>(); + ignorePartitionExclusivityRMContainers.put(nodePartition, + rmContainers); + } + rmContainers.add(rmContainer); } - rmContainers.add(rmContainer); - } - // Update user metrics - String userName = application.getUser(); - User user = getUser(userName); - user.assignContainer(resource, nodePartition); - // Note this is a bit unconventional since it gets the object and modifies - // it here, rather then using set routine - Resources.subtractFrom(application.getHeadroom(), resource); // headroom - metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); - - if (LOG.isDebugEnabled()) { - LOG.debug(getQueueName() + - " user=" + userName + - " used=" + queueUsage.getUsed() + " numContainers=" + numContainers + - " headroom = " + application.getHeadroom() + - " user-resources=" + user.getUsed() - ); + // Update user metrics + String userName = application.getUser(); + User user = getOrDefault(userName); + user.assignContainer(resource, nodePartition); + // Note this is a bit unconventional since it gets the object and modifies + // it here, rather than using set routine + Resources.subtractFrom(application.getHeadroom(), resource); // headroom + metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); + + if (LOG.isDebugEnabled()) { + LOG.debug(getQueueName() + " user=" + userName + " used=" + queueUsage .getUsed() + " numContainers=" + numContainers + " headroom = " + application.getHeadroom() + " user-resources=" + user.getUsed()); + } + } finally { + writeLock.unlock(); } } - synchronized void releaseResource(Resource clusterResource, + void releaseResource(Resource clusterResource, FiCaSchedulerApp application, Resource resource, String
nodePartition, RMContainer rmContainer, boolean isChangeResource) { - super.releaseResource(clusterResource, resource, nodePartition, - isChangeResource); - - // handle ignore exclusivity container - if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( - RMNodeLabelsManager.NO_LABEL) - && !nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { - if (ignorePartitionExclusivityRMContainers.containsKey(nodePartition)) { - Set rmContainers = - ignorePartitionExclusivityRMContainers.get(nodePartition); - rmContainers.remove(rmContainer); - if (rmContainers.isEmpty()) { - ignorePartitionExclusivityRMContainers.remove(nodePartition); + try { + writeLock.lock(); + super.releaseResource(clusterResource, resource, nodePartition, + isChangeResource); + + // handle ignore exclusivity container + if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( + RMNodeLabelsManager.NO_LABEL) && !nodePartition.equals( + RMNodeLabelsManager.NO_LABEL)) { + if (ignorePartitionExclusivityRMContainers.containsKey(nodePartition)) { + Set rmContainers = + ignorePartitionExclusivityRMContainers.get(nodePartition); + rmContainers.remove(rmContainer); + if (rmContainers.isEmpty()) { + ignorePartitionExclusivityRMContainers.remove(nodePartition); + } } } - } - // Update user metrics - String userName = application.getUser(); - User user = getUser(userName); - user.releaseContainer(resource, nodePartition); - metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); + // Update user metrics + String userName = application.getUser(); + User user = getOrDefault(userName); + user.releaseContainer(resource, nodePartition); + metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); - if (LOG.isDebugEnabled()) { - LOG.debug(getQueueName() + - " used=" + queueUsage.getUsed() + " numContainers=" + numContainers + - " user=" + userName + " user-resources=" + user.getUsed()); + if (LOG.isDebugEnabled()) { + LOG.debug( + getQueueName() + " used=" + queueUsage.getUsed() + " numContainers=" + + numContainers + " user=" + userName + " user-resources=" + + user.getUsed()); + } + } finally { + writeLock.unlock(); } } @@ -1503,39 +1605,44 @@ private void updateCurrentResourceLimits( } @Override - public synchronized void updateClusterResource(Resource clusterResource, + public void updateClusterResource(Resource clusterResource, ResourceLimits currentResourceLimits) { - updateCurrentResourceLimits(currentResourceLimits, clusterResource); - lastClusterResource = clusterResource; - - // Update headroom info based on new cluster resource value - // absoluteMaxCapacity now, will be replaced with absoluteMaxAvailCapacity - // during allocation - setQueueResourceLimitsInfo(clusterResource); - - // Update metrics - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, null); - - // queue metrics are updated, more resource may be available - // activate the pending applications if possible - activateApplications(); - - // Update application properties - for (FiCaSchedulerApp application : - orderingPolicy.getSchedulableEntities()) { - synchronized (application) { - computeUserLimitAndSetHeadroom(application, clusterResource, - RMNodeLabelsManager.NO_LABEL, - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + try { + writeLock.lock(); + updateCurrentResourceLimits(currentResourceLimits, clusterResource); + lastClusterResource = clusterResource; + + // Update headroom info based on new cluster resource value + // 
absoluteMaxCapacity now, will be replaced with absoluteMaxAvailCapacity + // during allocation + setQueueResourceLimitsInfo(clusterResource); + + // Update metrics + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, null); + + // queue metrics are updated, more resource may be available + // activate the pending applications if possible + activateApplications(); + + // Update application properties + for (FiCaSchedulerApp application : orderingPolicy + .getSchedulableEntities()) { + synchronized (application) { + computeUserLimitAndSetHeadroom(application, clusterResource, + RMNodeLabelsManager.NO_LABEL, + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + } } + } finally { + writeLock.unlock(); } } @Override public void incUsedResource(String nodeLabel, Resource resourceToInc, SchedulerApplicationAttempt application) { - getUser(application.getUser()).getResourceUsage().incUsed(nodeLabel, + getOrDefault(application.getUser()).getResourceUsage().incUsed(nodeLabel, resourceToInc); super.incUsedResource(nodeLabel, resourceToInc, application); } @@ -1543,14 +1650,14 @@ public void incUsedResource(String nodeLabel, Resource resourceToInc, @Override public void decUsedResource(String nodeLabel, Resource resourceToDec, SchedulerApplicationAttempt application) { - getUser(application.getUser()).getResourceUsage().decUsed(nodeLabel, + getOrDefault(application.getUser()).getResourceUsage().decUsed(nodeLabel, resourceToDec); super.decUsedResource(nodeLabel, resourceToDec, application); } public void incAMUsedResource(String nodeLabel, Resource resourceToInc, SchedulerApplicationAttempt application) { - getUser(application.getUser()).getResourceUsage().incAMUsed(nodeLabel, + getOrDefault(application.getUser()).getResourceUsage().incAMUsed(nodeLabel, resourceToInc); // ResourceUsage has its own lock, no addition lock needs here. queueUsage.incAMUsed(nodeLabel, resourceToInc); @@ -1558,7 +1665,7 @@ public void incAMUsedResource(String nodeLabel, Resource resourceToInc, public void decAMUsedResource(String nodeLabel, Resource resourceToDec, SchedulerApplicationAttempt application) { - getUser(application.getUser()).getResourceUsage().decAMUsed(nodeLabel, + getOrDefault(application.getUser()).getResourceUsage().decAMUsed(nodeLabel, resourceToDec); // ResourceUsage has its own lock, no addition lock needs here. queueUsage.decAMUsed(nodeLabel, resourceToDec); @@ -1568,8 +1675,8 @@ public void decAMUsedResource(String nodeLabel, Resource resourceToDec, public static class User { ResourceUsage userResourceUsage = new ResourceUsage(); volatile Resource userResourceLimit = Resource.newInstance(0, 0); - int pendingApplications = 0; - int activeApplications = 0; + volatile int pendingApplications = 0; + volatile int activeApplications = 0; public ResourceUsage getResourceUsage() { return userResourceUsage; @@ -1607,7 +1714,7 @@ public int getTotalApplications() { return getPendingApplications() + getActiveApplications(); } - public synchronized void submitApplication() { + public void submitApplication() { ++pendingApplications; } @@ -1649,11 +1756,14 @@ public void recoverContainer(Resource clusterResource, return; } // Careful! Locking order is important! 
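Editor's note: the reservations-continue-looking branch in canAssignToUser earlier in this hunk boils down to simple arithmetic — a user over their limit may still be assignable if releasing their reserved containers would bring them back under it, in which case the shortfall is recorded as the amount to unreserve. A worked sketch with plain numbers (memory-only, method names hypothetical):

    public class UnreserveCheck {
      /**
       * Returns the MB that must be unreserved before a new container can be
       * granted, or -1 if the user cannot be assigned at all.
       */
      static long amountNeededToUnreserveMb(long usedMb, long reservedMb,
          long limitMb) {
        if (usedMb <= limitMb) {
          return 0; // under the limit, nothing to unreserve
        }
        if (usedMb - reservedMb <= limitMb) {
          // Over the limit, but only because of reservations: allocation is
          // allowed once (usedMb - limitMb) MB of reservations are released.
          return usedMb - limitMb;
        }
        return -1; // over the limit even ignoring reservations
      }

      public static void main(String[] args) {
        // used=10GB, reserved=4GB, limit=8GB -> allowed after unreserving 2GB
        System.out.println(amountNeededToUnreserveMb(10240, 4096, 8192)); // 2048
        // used=10GB, reserved=1GB, limit=8GB -> not assignable
        System.out.println(amountNeededToUnreserveMb(10240, 1024, 8192)); // -1
      }
    }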
- synchronized (this) { + try { + writeLock.lock(); FiCaSchedulerNode node = scheduler.getNode(rmContainer.getContainer().getNodeId()); allocateResource(clusterResource, attempt, rmContainer.getContainer() .getResource(), node.getPartition(), rmContainer, false); + } finally { + writeLock.unlock(); } getParent().recoverContainer(clusterResource, attempt, rmContainer); } @@ -1678,43 +1788,55 @@ public void recoverContainer(Resource clusterResource, // Total pending for the queue = // sum(for each user(min((user's headroom), sum(user's pending requests)))) // NOTE: Used for calculating pending resources in the preemption monitor. - public synchronized Resource getTotalPendingResourcesConsideringUserLimit( + public Resource getTotalPendingResourcesConsideringUserLimit( Resource resources, String partition) { - Map userNameToHeadroom = new HashMap(); - Resource pendingConsideringUserLimit = Resource.newInstance(0, 0); - for (FiCaSchedulerApp app : getApplications()) { - String userName = app.getUser(); - if (!userNameToHeadroom.containsKey(userName)) { - User user = getUser(userName); - Resource headroom = Resources.subtract( - computeUserLimit(app, resources, user, partition, - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), - user.getUsed(partition)); - // Make sure headroom is not negative. - headroom = Resources.componentwiseMax(headroom, Resources.none()); - userNameToHeadroom.put(userName, headroom); + try { + readLock.lock(); + Map userNameToHeadroom = + new HashMap(); + Resource pendingConsideringUserLimit = Resource.newInstance(0, 0); + for (FiCaSchedulerApp app : getApplications()) { + String userName = app.getUser(); + if (!userNameToHeadroom.containsKey(userName)) { + User user = getOrDefault(userName); + Resource headroom = Resources.subtract( + computeUserLimit(app, resources, user, partition, + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), + user.getUsed(partition)); + // Make sure headroom is not negative.
+ headroom = Resources.componentwiseMax(headroom, Resources.none()); + userNameToHeadroom.put(userName, headroom); + } + Resource minpendingConsideringUserLimit = Resources.componentwiseMin( + userNameToHeadroom.get(userName), + app.getAppAttemptResourceUsage().getPending(partition)); + Resources.addTo(pendingConsideringUserLimit, + minpendingConsideringUserLimit); + Resources.subtractFrom(userNameToHeadroom.get(userName), + minpendingConsideringUserLimit); } - Resource minpendingConsideringUserLimit = - Resources.componentwiseMin(userNameToHeadroom.get(userName), - app.getAppAttemptResourceUsage().getPending(partition)); - Resources.addTo(pendingConsideringUserLimit, - minpendingConsideringUserLimit); - Resources.subtractFrom( - userNameToHeadroom.get(userName), minpendingConsideringUserLimit); + return pendingConsideringUserLimit; + } finally { + readLock.unlock(); } - return pendingConsideringUserLimit; } @Override - public synchronized void collectSchedulerApplications( + public void collectSchedulerApplications( Collection apps) { - for (FiCaSchedulerApp pendingApp : pendingOrderingPolicy - .getSchedulableEntities()) { - apps.add(pendingApp.getApplicationAttemptId()); + try { + readLock.lock(); + + for (FiCaSchedulerApp pendingApp : pendingOrderingPolicy + .getSchedulableEntities()) { + apps.add(pendingApp.getApplicationAttemptId()); + } + for (FiCaSchedulerApp app : orderingPolicy.getSchedulableEntities()) { + apps.add(app.getApplicationAttemptId()); + } } - for (FiCaSchedulerApp app : - orderingPolicy.getSchedulableEntities()) { - apps.add(app.getApplicationAttemptId()); + finally { + readLock.unlock(); } } @@ -1756,13 +1878,23 @@ public void detachContainer(Resource clusterResource, /** * return all ignored partition exclusivity RMContainers in the LeafQueue, this - * will be used by preemption policy, and use of return - * ignorePartitionExclusivityRMContainer should protected by LeafQueue - * synchronized lock + * will be used by preemption policy */ - public synchronized Map> - getIgnoreExclusivityRMContainers() { - return ignorePartitionExclusivityRMContainers; + public Map> getCopyOfIgnoreExclusivityRMContainers() { + try { + readLock.lock(); + + Map> clonedMap = new HashMap<>(); + + for (Map.Entry> entry : + ignorePartitionExclusivityRMContainers.entrySet()) { + clonedMap.put(entry.getKey(), new TreeSet<>(entry.getValue())); + } + + return clonedMap; + } finally { + readLock.unlock(); + } } public void setCapacity(float capacity) { @@ -1777,18 +1909,23 @@ public void setMaxApplications(int maxApplications) { this.maxApplications = maxApplications; } - public synchronized OrderingPolicy + public OrderingPolicy getOrderingPolicy() { return orderingPolicy; } - public synchronized void setOrderingPolicy( + public void setOrderingPolicy( OrderingPolicy orderingPolicy) { - if (null != this.orderingPolicy) { - orderingPolicy.addAllSchedulableEntities(this.orderingPolicy - .getSchedulableEntities()); + try { + writeLock.lock(); + if (null != this.orderingPolicy) { + orderingPolicy.addAllSchedulableEntities(this.orderingPolicy + .getSchedulableEntities()); + } + this.orderingPolicy = orderingPolicy; + } finally { + writeLock.unlock(); } - this.orderingPolicy = orderingPolicy; } @Override @@ -1816,7 +1953,9 @@ public void decreaseContainer(Resource clusterResource, boolean resourceDecreased = false; Resource resourceBeforeDecrease; // Grab queue lock to avoid race condition when getting container resource - synchronized (this) { + try { + writeLock.lock(); + // Make sure the decrease 
request is valid in terms of current resource // and target resource. This must be done under the leaf queue lock. // Throws exception if the check fails. @@ -1861,6 +2000,8 @@ public void decreaseContainer(Resource clusterResource, .decreaseContainer(decreaseRequest.getContainerId(), absDelta); resourceDecreased = true; } + } finally { + writeLock.unlock(); } if (resourceDecreased) { @@ -1873,7 +2014,138 @@ public void decreaseContainer(Resource clusterResource, } } - public synchronized OrderingPolicy + @Override + public boolean acceptCSAssignment(Resource cluster, + ResourceCommitRequest request) { + // If we allocated something + if (request.anythingAllocatedOrReserved()) { + ContainerAllocationContext + allocation = request.getFirstAllocatedOrReservedContainer(); + SchedulerContainer + schedulerContainer = allocation.getAllocatedOrReservedContainer(); + + // Do not check limits when allocation from a reserved container + if (allocation.getAllocateFromReservedContainer() == null) { + try { + readLock.lock(); + FiCaSchedulerApp app = schedulerContainer.getSchedulerApplicationAttempt(); + String username = app.getUser(); + String p = schedulerContainer.getNodePartition(); + + // check user-limit + Resource userLimit = computeUserLimitAndSetHeadroom(app, cluster, p, + allocation.getSchedulingMode()); + + // Deduct resources that we can release + Resource usedResource = Resources.clone(getOrDefault(username).getUsed(p)); + Resources.subtractFrom(usedResource, request.getTotalReleasedResource()); + + if (Resources.greaterThan(resourceCalculator, cluster, usedResource, + userLimit)) { + return false; + } + } + finally { + readLock.unlock(); + } + } + } + + return super.acceptCSAssignment(cluster, request); + } + + private void releaseContainers(Resource clusterResource, + ResourceCommitRequest request) { + ApplicationAttemptId allocatedApplicationAttemptId = null; + if (request.anythingAllocatedOrReserved()) { + allocatedApplicationAttemptId = + request.getFirstAllocatedOrReservedContainer() + .getAllocatedOrReservedContainer().getSchedulerApplicationAttempt() + .getApplicationAttemptId(); + } + + for (SchedulerContainer c : request + .getContainersToRelease()) { + RMContainer rmContainer = c.getRmContainer(); + + if (rmContainer.hasIncreaseReservation()) { + // Increased container reservation + unreserveIncreasedContainer(clusterResource, + c.getSchedulerApplicationAttempt(), c.getSchedulerNode(), + rmContainer); + continue; + } else if (rmContainer.getState() == RMContainerState.RESERVED) { + // For other reserved containers + // This is a reservation exchange, complete previous reserved container + completedContainer(clusterResource, c.getSchedulerApplicationAttempt(), + c.getSchedulerNode(), rmContainer, SchedulerUtils + .createAbnormalContainerStatus(rmContainer.getContainerId(), + SchedulerUtils.UNRESERVED_CONTAINER), + RMContainerEventType.RELEASED, null, false); + continue; + } + + // This is a container preemption, TODO + /* + LeafQueue targetLeafQueue = + c.getSchedulerApplicationAttempt().getCSLeafQueue(); + targetLeafQueue.completedContainer(clusterResource, + c.getSchedulerApplicationAttempt(), c.getSchedulerNode(), c, + SchedulerUtils + .createPreemptedContainerStatus(rmContainer.getContainerId(), + SchedulerUtils.PREEMPTED_CONTAINER), + RMContainerEventType.KILL, null, false); + */ + } + } + + public void applyResourceCommitRequest(Resource cluster, + ResourceCommitRequest request) { + try { + writeLock.lock(); + + releaseContainers(cluster, request); + + if 
(request.anythingAllocatedOrReserved()) { + ContainerAllocationContext + allocation = request.getFirstAllocatedOrReservedContainer(); + SchedulerContainer + schedulerContainer = allocation.getAllocatedOrReservedContainer(); + + // Do not modify queue when allocation from reserved container + if (allocation.getAllocateFromReservedContainer() == null) { + // Book-keeping + // Note: Update headroom to account for current allocation too... + allocateResource(cluster, + schedulerContainer.getSchedulerApplicationAttempt(), + allocation.getAllocatedOrReservedResource(), + schedulerContainer.getNodePartition(), + schedulerContainer.getRmContainer(), + allocation.isIncreasedAllocation()); + } + + // Update reserved resource + if (Resources.greaterThan(resourceCalculator, cluster, + request.getTotalReservedResource(), Resources.none())) { + incReservedResource(schedulerContainer.getNodePartition(), + request.getTotalReservedResource()); + } + + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, + schedulerContainer.getSchedulerNode(), getParent().getQueueName(), + getQueueName(), ActivityState.ACCEPTED, + ActivityDiagnosticConstant.EMPTY); + } + } finally { + writeLock.unlock(); + } + + if (parent != null) { + ((ParentQueue) parent).applyResourceCommitRequest(cluster, request); + } + } + + public OrderingPolicy getPendingAppsOrderingPolicy() { return pendingOrderingPolicy; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java index a245e3b..86e0403 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; +import com.google.common.collect.Sets; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -42,11 +43,21 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.*; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityDiagnosticConstant; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedContainerChangeRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import 
org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesLogger; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityDiagnosticConstant; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.AllocationState; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ContainerAllocationContext; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ResourceCommitRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.Resources; @@ -62,6 +73,7 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; @Private @Evolving @@ -69,7 +81,7 @@ private static final Log LOG = LogFactory.getLog(ParentQueue.class); - protected final Set childQueues; + protected final Set childQueues; private final boolean rootQueue; final Comparator nonPartitionedQueueComparator; final PartitionedQueueComparator partitionQueueComparator; @@ -98,8 +110,9 @@ public ParentQueue(CapacitySchedulerContext cs, ". Must be " + CapacitySchedulerConfiguration.MAXIMUM_CAPACITY_VALUE); } - this.childQueues = new TreeSet(nonPartitionedQueueComparator); - + Map map = new ConcurrentHashMap<>(); + this.childQueues = Sets.newSetFromMap(map); + setupQueueConfigs(cs.getClusterResource()); LOG.info("Initialized parent-queue " + queueName + @@ -107,68 +120,76 @@ public ParentQueue(CapacitySchedulerContext cs, ", fullname=" + getQueuePath()); } - synchronized void setupQueueConfigs(Resource clusterResource) + void setupQueueConfigs(Resource clusterResource) throws IOException { - super.setupQueueConfigs(clusterResource); - StringBuilder aclsString = new StringBuilder(); - for (Map.Entry e : acls.entrySet()) { - aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); - } + try { - StringBuilder labelStrBuilder = new StringBuilder(); - if (accessibleLabels != null) { - for (String s : accessibleLabels) { - labelStrBuilder.append(s); - labelStrBuilder.append(","); + writeLock.lock(); + super.setupQueueConfigs(clusterResource); + StringBuilder aclsString = new StringBuilder(); + for (Map.Entry e : acls.entrySet()) { + aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); } - } - LOG.info(queueName + - ", capacity=" + this.queueCapacities.getCapacity() + - ", asboluteCapacity=" + this.queueCapacities.getAbsoluteCapacity() + - ", maxCapacity=" + this.queueCapacities.getMaximumCapacity() + - ", asboluteMaxCapacity=" + this.queueCapacities.getAbsoluteMaximumCapacity() + - ", state=" + state + - ", acls=" + aclsString + - ", labels=" + labelStrBuilder.toString() + "\n" + - ", reservationsContinueLooking=" + reservationsContinueLooking); + StringBuilder labelStrBuilder = new StringBuilder(); + if (accessibleLabels != null) { + for (String s : accessibleLabels) { + labelStrBuilder.append(s); + labelStrBuilder.append(","); + } + } + + LOG.info(queueName + ", capacity=" + this.queueCapacities.getCapacity() + + ", asboluteCapacity=" + this.queueCapacities.getAbsoluteCapacity() + + ", maxCapacity=" + 
this.queueCapacities.getMaximumCapacity() + + ", asboluteMaxCapacity=" + this.queueCapacities + .getAbsoluteMaximumCapacity() + ", state=" + state + ", acls=" + + aclsString + ", labels=" + labelStrBuilder.toString() + "\n" + + ", reservationsContinueLooking=" + reservationsContinueLooking); + } finally { + writeLock.unlock(); + } } private static float PRECISION = 0.0005f; // 0.05% precision - synchronized void setChildQueues(Collection childQueues) { - // Validate - float childCapacities = 0; - for (CSQueue queue : childQueues) { - childCapacities += queue.getCapacity(); - } - float delta = Math.abs(1.0f - childCapacities); // crude way to check - // allow capacities being set to 0, and enforce child 0 if parent is 0 - if (((queueCapacities.getCapacity() > 0) && (delta > PRECISION)) || - ((queueCapacities.getCapacity() == 0) && (childCapacities > 0))) { - throw new IllegalArgumentException("Illegal" + - " capacity of " + childCapacities + - " for children of queue " + queueName); - } - // check label capacities - for (String nodeLabel : queueCapacities.getExistingNodeLabels()) { - float capacityByLabel = queueCapacities.getCapacity(nodeLabel); - // check children's labels - float sum = 0; + void setChildQueues(Collection childQueues) { + try { + writeLock.lock(); + // Validate + float childCapacities = 0; for (CSQueue queue : childQueues) { - sum += queue.getQueueCapacities().getCapacity(nodeLabel); + childCapacities += queue.getCapacity(); } - if ((capacityByLabel > 0 && Math.abs(1.0f - sum) > PRECISION) - || (capacityByLabel == 0) && (sum > 0)) { - throw new IllegalArgumentException("Illegal" + " capacity of " - + sum + " for children of queue " + queueName - + " for label=" + nodeLabel); + float delta = Math.abs(1.0f - childCapacities); // crude way to check + // allow capacities being set to 0, and enforce child 0 if parent is 0 + if (((queueCapacities.getCapacity() > 0) && (delta > PRECISION)) || ( + (queueCapacities.getCapacity() == 0) && (childCapacities > 0))) { + throw new IllegalArgumentException("Illegal" + " capacity of " + childCapacities + + " for children of queue " + queueName); } - } - - this.childQueues.clear(); - this.childQueues.addAll(childQueues); - if (LOG.isDebugEnabled()) { - LOG.debug("setChildQueues: " + getChildQueuesToPrint()); + // check label capacities + for (String nodeLabel : queueCapacities.getExistingNodeLabels()) { + float capacityByLabel = queueCapacities.getCapacity(nodeLabel); + // check children's labels + float sum = 0; + for (CSQueue queue : childQueues) { + sum += queue.getQueueCapacities().getCapacity(nodeLabel); + } + if ((capacityByLabel > 0 && Math.abs(1.0f - sum) > PRECISION) + || (capacityByLabel == 0) && (sum > 0)) { + throw new IllegalArgumentException( + "Illegal" + " capacity of " + sum + " for children of queue " + queueName + + " for label=" + nodeLabel); + } + } + + this.childQueues.clear(); + this.childQueues.addAll(childQueues); + if (LOG.isDebugEnabled()) { + LOG.debug("setChildQueues: " + getChildQueuesToPrint()); + } + } finally { + writeLock.unlock(); } } @@ -179,53 +200,67 @@ public String getQueuePath() { } @Override - public synchronized QueueInfo getQueueInfo( + public QueueInfo getQueueInfo( boolean includeChildQueues, boolean recursive) { - QueueInfo queueInfo = getQueueInfo(); - - List childQueuesInfo = new ArrayList(); - if (includeChildQueues) { - for (CSQueue child : childQueues) { - // Get queue information recursively? 
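Editor's note: setChildQueues above validates that child capacities sum to 100% of the parent within PRECISION (0.0005, i.e. 0.05%), with a carve-out forcing children to 0 when the parent is 0. The check is easiest to see with concrete numbers; a runnable sketch of the same validation, simplified to a float array:

    public class CapacityCheck {
      private static final float PRECISION = 0.0005f; // 0.05% precision

      static void validate(float parentCapacity, float[] childCapacities) {
        float sum = 0f;
        for (float c : childCapacities) {
          sum += c;
        }
        float delta = Math.abs(1.0f - sum); // crude way to check
        // allow capacities being set to 0, and enforce child 0 if parent is 0
        if ((parentCapacity > 0 && delta > PRECISION)
            || (parentCapacity == 0 && sum > 0)) {
          throw new IllegalArgumentException("Illegal capacity of " + sum);
        }
      }

      public static void main(String[] args) {
        validate(1.0f, new float[] {0.4f, 0.3f, 0.3f}); // ok: sums to 1.0
        validate(0.0f, new float[] {0f, 0f});           // ok: zero parent, zero children
        validate(1.0f, new float[] {0.5f, 0.3f});       // throws: sums to 0.8
      }
    }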
- childQueuesInfo.add( - child.getQueueInfo(recursive, recursive)); + try { + readLock.lock(); + QueueInfo queueInfo = getQueueInfo(); + + List childQueuesInfo = new ArrayList(); + if (includeChildQueues) { + for (CSQueue child : childQueues) { + // Get queue information recursively? + childQueuesInfo.add(child.getQueueInfo(recursive, recursive)); + } } + queueInfo.setChildQueues(childQueuesInfo); + + return queueInfo; + } finally { + readLock.unlock(); } - queueInfo.setChildQueues(childQueuesInfo); - - return queueInfo; } - private synchronized QueueUserACLInfo getUserAclInfo( + private QueueUserACLInfo getUserAclInfo( UserGroupInformation user) { - QueueUserACLInfo userAclInfo = - recordFactory.newRecordInstance(QueueUserACLInfo.class); - List operations = new ArrayList(); - for (QueueACL operation : QueueACL.values()) { - if (hasAccess(operation, user)) { - operations.add(operation); - } - } + try { + readLock.lock(); + QueueUserACLInfo userAclInfo = recordFactory.newRecordInstance( + QueueUserACLInfo.class); + List operations = new ArrayList(); + for (QueueACL operation : QueueACL.values()) { + if (hasAccess(operation, user)) { + operations.add(operation); + } + } - userAclInfo.setQueueName(getQueueName()); - userAclInfo.setUserAcls(operations); - return userAclInfo; + userAclInfo.setQueueName(getQueueName()); + userAclInfo.setUserAcls(operations); + return userAclInfo; + } finally { + readLock.unlock(); + } } @Override - public synchronized List getQueueUserAclInfo( + public List getQueueUserAclInfo( UserGroupInformation user) { - List userAcls = new ArrayList(); - - // Add parent queue acls - userAcls.add(getUserAclInfo(user)); - - // Add children queue acls - for (CSQueue child : childQueues) { - userAcls.addAll(child.getQueueUserAclInfo(user)); + try { + readLock.lock(); + List userAcls = new ArrayList(); + + // Add parent queue acls + userAcls.add(getUserAclInfo(user)); + + // Add children queue acls + for (CSQueue child : childQueues) { + userAcls.addAll(child.getQueueUserAclInfo(user)); + } + + return userAcls; + } finally { + readLock.unlock(); } - - return userAcls; } public String toString() { @@ -240,55 +275,58 @@ public String toString() { } @Override - public synchronized void reinitialize(CSQueue newlyParsedQueue, + public void reinitialize(CSQueue newlyParsedQueue, Resource clusterResource) throws IOException { - // Sanity check - if (!(newlyParsedQueue instanceof ParentQueue) || - !newlyParsedQueue.getQueuePath().equals(getQueuePath())) { - throw new IOException("Trying to reinitialize " + getQueuePath() + - " from " + newlyParsedQueue.getQueuePath()); - } + try { + writeLock.lock(); + // Sanity check + if (!(newlyParsedQueue instanceof ParentQueue) || !newlyParsedQueue.getQueuePath().equals(getQueuePath())) { + throw new IOException( + "Trying to reinitialize " + getQueuePath() + " from " + newlyParsedQueue.getQueuePath()); + } - ParentQueue newlyParsedParentQueue = (ParentQueue)newlyParsedQueue; + ParentQueue newlyParsedParentQueue = (ParentQueue) newlyParsedQueue; - // Set new configs - setupQueueConfigs(clusterResource); + // Set new configs + setupQueueConfigs(clusterResource); - // Re-configure existing child queues and add new ones - // The CS has already checked to ensure all existing child queues are present! 
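Editor's note: the childQueues field rebuilt earlier in this file is now a concurrent set backed by a ConcurrentHashMap (via Guava's Sets.newSetFromMap), replacing the comparator-ordered TreeSet. That makes membership changes and iteration safe without holding the queue lock, at the cost of losing a maintained ordering — which is presumably why allocation now sorts a snapshot on every pass. A sketch using only the JDK equivalent:

    import java.util.Collections;
    import java.util.Set;
    import java.util.concurrent.ConcurrentHashMap;

    public class ConcurrentChildSet {
      // Weakly consistent iteration, thread-safe add/remove, no ordering.
      private final Set<String> childQueues =
          Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

      public void replaceChildren(Set<String> newChildren) {
        // Note: clear()+addAll() is not atomic as a pair; the patch performs
        // this under the queue's write lock for exactly that reason.
        childQueues.clear();
        childQueues.addAll(newChildren);
      }
    }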
- Map currentChildQueues = getQueues(childQueues); - Map newChildQueues = - getQueues(newlyParsedParentQueue.childQueues); - for (Map.Entry e : newChildQueues.entrySet()) { - String newChildQueueName = e.getKey(); - CSQueue newChildQueue = e.getValue(); + // Re-configure existing child queues and add new ones + // The CS has already checked to ensure all existing child queues are present! + Map currentChildQueues = getQueues(childQueues); + Map newChildQueues = getQueues(newlyParsedParentQueue.childQueues); + for (Map.Entry e : newChildQueues.entrySet()) { + String newChildQueueName = e.getKey(); + CSQueue newChildQueue = e.getValue(); - CSQueue childQueue = currentChildQueues.get(newChildQueueName); - - // Check if the child-queue already exists - if (childQueue != null) { - // Re-init existing child queues - childQueue.reinitialize(newChildQueue, clusterResource); - LOG.info(getQueueName() + ": re-configured queue: " + childQueue); - } else { - // New child queue, do not re-init - - // Set parent to 'this' - newChildQueue.setParent(this); - - // Save in list of current child queues - currentChildQueues.put(newChildQueueName, newChildQueue); - - LOG.info(getQueueName() + ": added new child queue: " + newChildQueue); + CSQueue childQueue = currentChildQueues.get(newChildQueueName); + + // Check if the child-queue already exists + if (childQueue != null) { + // Re-init existing child queues + childQueue.reinitialize(newChildQueue, clusterResource); + LOG.info(getQueueName() + ": re-configured queue: " + childQueue); + } else { + // New child queue, do not re-init + + // Set parent to 'this' + newChildQueue.setParent(this); + + // Save in list of current child queues + currentChildQueues.put(newChildQueueName, newChildQueue); + + LOG.info(getQueueName() + ": added new child queue: " + newChildQueue); + } } - } - // Re-sort all queues - childQueues.clear(); - childQueues.addAll(currentChildQueues.values()); + // Re-sort all queues + childQueues.clear(); + childQueues.addAll(currentChildQueues.values()); + } finally { + writeLock.unlock(); + } } - Map getQueues(Set queues) { + private Map getQueues(Set queues) { Map queuesMap = new HashMap(); for (CSQueue queue : queues) { queuesMap.put(queue.getQueueName(), queue); @@ -300,7 +338,8 @@ public synchronized void reinitialize(CSQueue newlyParsedQueue, public void submitApplication(ApplicationId applicationId, String user, String queue) throws AccessControlException { - synchronized (this) { + try { + writeLock.lock(); // Sanity check if (queue.equals(queueName)) { throw new AccessControlException("Cannot submit application " + @@ -314,6 +353,8 @@ public void submitApplication(ApplicationId applicationId, String user, } addApplication(applicationId, user); + } finally { + writeLock.unlock(); } // Inform the parent queue @@ -342,7 +383,7 @@ public void finishApplicationAttempt(FiCaSchedulerApp application, // finish attempt logic. 
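Editor's note: the reinitialize logic above merges a freshly parsed hierarchy into the live one — existing children are reinitialized in place, unknown names are adopted as new children, and the live set is then replaced wholesale under the write lock. A stripped-down sketch of that merge (types reduced to maps of names; the patch does this for CSQueue objects):

    import java.util.Map;
    import java.util.Set;

    public class QueueReinit {
      /** Reuse and re-init existing children, adopt new ones, then swap. */
      static void reinitialize(Set<String> liveChildren,
          Map<String, String> current, Map<String, String> newlyParsed) {
        for (Map.Entry<String, String> e : newlyParsed.entrySet()) {
          if (current.containsKey(e.getKey())) {
            // Existing child queue: reinitialized in place in the patch.
          } else {
            // New child queue: adopt it (the patch also sets its parent).
            current.put(e.getKey(), e.getValue());
          }
        }
        // Replace the live children wholesale; done under the write lock.
        liveChildren.clear();
        liveChildren.addAll(current.keySet());
      }
    }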
} - private synchronized void addApplication(ApplicationId applicationId, + private void addApplication(ApplicationId applicationId, String user) { ++numApplications; @@ -356,10 +397,7 @@ private synchronized void addApplication(ApplicationId applicationId, @Override public void finishApplication(ApplicationId application, String user) { - - synchronized (this) { - removeApplication(application, user); - } + removeApplication(application, user); // Inform the parent queue if (parent != null) { @@ -367,16 +405,16 @@ public void finishApplication(ApplicationId application, String user) { } } - private synchronized void removeApplication(ApplicationId applicationId, + private void removeApplication(ApplicationId applicationId, String user) { - --numApplications; + int num = --numApplications; LOG.info("Application removed -" + " appId: " + applicationId + " user: " + user + " leaf-queue of parent: " + getQueueName() + - " #applications: " + getNumApplications()); + " #applications: " + num); } private String getParentName() { @@ -384,186 +422,215 @@ private String getParentName() { } @Override - public synchronized CSAssignment assignContainers(Resource clusterResource, - FiCaSchedulerNode node, ResourceLimits resourceLimits, + public CSAssignment assignContainers(Resource clusterResource, + PlacementSet placementSet, ResourceLimits resourceLimits, SchedulingMode schedulingMode) { - // if our queue cannot access this node, just return - if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY - && !accessibleToPartition(node.getPartition())) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skip this queue=" + getQueuePath() - + ", because it is not able to access partition=" + node - .getPartition()); - } - - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.REJECTED, - ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + node - .getPartition()); - if (rootQueue) { - ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, - node); - } + try { + writeLock.lock(); + String partition = placementSet.getPartition(); + SchedulerNode node = placementSet.getNextAvailable(); + + // if our queue cannot access this node, just return + if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY + && !accessibleToPartition(placementSet.getPartition())) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skip this queue=" + getQueuePath() + + ", because it is not able to access partition=" + partition); + } - return CSAssignment.NULL_ASSIGNMENT; - } - - // Check if this queue need more resource, simply skip allocation if this - // queue doesn't need more resources. 
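Editor's note: two counter changes in this region are worth calling out. pendingApplications/activeApplications become volatile but are still mutated with ++/--, and removeApplication now captures the decremented value into a local so the log line reports a consistent number. Since ++ on a volatile int is a read-modify-write and is not atomic, AtomicInteger would be the usual safe replacement if these methods can race — a sketch of that alternative (an assumption on my part, not what the patch does):

    import java.util.concurrent.atomic.AtomicInteger;

    public class AppCounter {
      private final AtomicInteger numApplications = new AtomicInteger();

      public void addApplication() {
        numApplications.incrementAndGet();
      }

      public void removeApplication(String appId) {
        // Capture the post-decrement value once, so concurrent updates cannot
        // change what this log line reports.
        int num = numApplications.decrementAndGet();
        System.out.println("Application removed - appId: " + appId
            + " #applications: " + num);
      }
    }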
- if (!super.hasPendingResourceRequest(node.getPartition(), - clusterResource, schedulingMode)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skip this queue=" + getQueuePath() - + ", because it doesn't need more resource, schedulingMode=" - + schedulingMode.name() + " node-partition=" + node.getPartition()); - } + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.REJECTED, + ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + node + .getPartition()); + if (rootQueue) { + ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, + node); + } - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE); - if (rootQueue) { - ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, - node); + return CSAssignment.NULL_ASSIGNMENT; } - return CSAssignment.NULL_ASSIGNMENT; - } - - CSAssignment assignment = - new CSAssignment(Resources.createResource(0, 0), NodeType.NODE_LOCAL); - - while (canAssign(clusterResource, node)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Trying to assign containers to child-queue of " - + getQueueName()); - } - - // Are we over maximum-capacity for this queue? - // This will also consider parent's limits and also continuous reservation - // looking - if (!super.canAssignToThisQueue(clusterResource, node.getPartition(), - resourceLimits, Resources.createResource( - getMetrics().getReservedMB(), getMetrics() - .getReservedVirtualCores()), schedulingMode)) { + // Check if this queue need more resource, simply skip allocation if this + // queue doesn't need more resources. + if (!super.hasPendingResourceRequest(partition, clusterResource, + schedulingMode)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skip this queue=" + getQueuePath() + + ", because it doesn't need more resource, schedulingMode=" + + schedulingMode.name() + " node-partition=" + partition); + } ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, getParentName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT); + ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE); if (rootQueue) { ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, node); } - break; + return CSAssignment.NULL_ASSIGNMENT; } - // Schedule - CSAssignment assignedToChild = - assignContainersToChildQueues(clusterResource, node, resourceLimits, - schedulingMode); - assignment.setType(assignedToChild.getType()); - - // Done if no child-queue assigned anything - if (Resources.greaterThan( - resourceCalculator, clusterResource, - assignedToChild.getResource(), Resources.none())) { + CSAssignment assignment = new CSAssignment(Resources.createResource(0, 0), + NodeType.NODE_LOCAL); - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.ACCEPTED, - ActivityDiagnosticConstant.EMPTY); + while (canAssign(clusterResource, + (FiCaSchedulerNode) placementSet.getNextAvailable())) { + if (LOG.isDebugEnabled()) { + LOG.debug("Trying to assign containers to child-queue of " + + getQueueName()); + } - if (node.getReservedContainer() == null) { - if (rootQueue) { - ActivitiesLogger.NODE.finishAllocatedNodeAllocation( - activitiesManager, node, - assignedToChild.getAssignmentInformation() - .getFirstAllocatedOrReservedContainerId(), - AllocationState.ALLOCATED); - } - } else { + // Are we over 
maximum-capacity for this queue? + // This will also consider parent's limits and also continuous reservation + // looking + if (!super.canAssignToThisQueue(clusterResource, partition, + resourceLimits, Resources + .createResource(getMetrics().getReservedMB(), + getMetrics().getReservedVirtualCores()), schedulingMode)) { + + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT); if (rootQueue) { - ActivitiesLogger.NODE.finishAllocatedNodeAllocation( - activitiesManager, node, - assignedToChild.getAssignmentInformation() - .getFirstAllocatedOrReservedContainerId(), - AllocationState.RESERVED); + ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, + node); } + + break; } - // Track resource utilization for the parent-queue - allocateResource(clusterResource, assignedToChild.getResource(), - node.getPartition(), assignedToChild.isIncreasedAllocation()); - - // Track resource utilization in this pass of the scheduler - Resources - .addTo(assignment.getResource(), assignedToChild.getResource()); - Resources.addTo(assignment.getAssignmentInformation().getAllocated(), - assignedToChild.getAssignmentInformation().getAllocated()); - Resources.addTo(assignment.getAssignmentInformation().getReserved(), - assignedToChild.getAssignmentInformation().getReserved()); - assignment.getAssignmentInformation().incrAllocations( - assignedToChild.getAssignmentInformation().getNumAllocations()); - assignment.getAssignmentInformation().incrReservations( - assignedToChild.getAssignmentInformation().getNumReservations()); - assignment - .getAssignmentInformation() - .getAllocationDetails() - .addAll( - assignedToChild.getAssignmentInformation().getAllocationDetails()); - assignment - .getAssignmentInformation() - .getReservationDetails() - .addAll( + // Schedule + CSAssignment assignedToChild = assignContainersToChildQueues( + clusterResource, placementSet, resourceLimits, schedulingMode); + assignment.setType(assignedToChild.getType()); + assignment.setExcessReservation(assignedToChild.getExcessReservation()); + + // Done if no child-queue assigned anything + if (Resources.greaterThan(resourceCalculator, clusterResource, + assignedToChild.getResource(), Resources.none())) { + assignment.setFulfilledReservedContainer( + assignedToChild.getFulfilledReservedContainer()); + assignment.setFulfilledReservation( + assignedToChild.isFulfilledReservation()); + + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.ACCEPTED, + ActivityDiagnosticConstant.EMPTY); + + // FIXME: global scheduling + if (node != null) { + if (node.getReservedContainer() == null) { + if (rootQueue) { + ActivitiesLogger.NODE.finishAllocatedNodeAllocation( + activitiesManager, node, + assignedToChild.getAssignmentInformation() + .getFirstAllocatedOrReservedContainerId(), + AllocationState.ALLOCATED); + } + } else { + if (rootQueue) { + ActivitiesLogger.NODE.finishAllocatedNodeAllocation( + activitiesManager, node, + assignedToChild.getAssignmentInformation() + .getFirstAllocatedOrReservedContainerId(), + AllocationState.RESERVED); + } + } + } + + // Track resource utilization for the parent-queue + /* Handled by apply commit + allocateResource(clusterResource, assignedToChild.getResource(), + partition, assignedToChild.isIncreasedAllocation()); + */ + + // Track resource utilization in this pass of the scheduler + 
Resources.addTo(assignment.getResource(), + assignedToChild.getResource()); + Resources.addTo(assignment.getAssignmentInformation().getAllocated(), + assignedToChild.getAssignmentInformation().getAllocated()); + Resources.addTo(assignment.getAssignmentInformation().getReserved(), + assignedToChild.getAssignmentInformation().getReserved()); + assignment.getAssignmentInformation().incrAllocations( + assignedToChild.getAssignmentInformation().getNumAllocations()); + assignment.getAssignmentInformation().incrReservations( + assignedToChild.getAssignmentInformation().getNumReservations()); + assignment.getAssignmentInformation().getAllocationDetails().addAll( + assignedToChild.getAssignmentInformation() + .getAllocationDetails()); + assignment.getAssignmentInformation().getReservationDetails().addAll( assignedToChild.getAssignmentInformation() .getReservationDetails()); - assignment.setIncreasedAllocation(assignedToChild - .isIncreasedAllocation()); - - LOG.info("assignedContainer" + - " queue=" + getQueueName() + - " usedCapacity=" + getUsedCapacity() + - " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + - " used=" + queueUsage.getUsed() + - " cluster=" + clusterResource); - - } else { - assignment.setSkippedType(assignedToChild.getSkippedType()); + assignment.setIncreasedAllocation( + assignedToChild.isIncreasedAllocation()); - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.EMPTY); - if (rootQueue) { - ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, - node); - } + LOG.info("assignedContainer" + " queue=" + getQueueName() + + " usedCapacity=" + getUsedCapacity() + " absoluteUsedCapacity=" + + getAbsoluteUsedCapacity() + " used=" + queueUsage.getUsed() + + " cluster=" + clusterResource); - break; - } + } else { + assignment.setSkippedType(assignedToChild.getSkippedType()); - if (LOG.isDebugEnabled()) { - LOG.debug("ParentQ=" + getQueueName() - + " assignedSoFarInThisIteration=" + assignment.getResource() - + " usedCapacity=" + getUsedCapacity() - + " absoluteUsedCapacity=" + getAbsoluteUsedCapacity()); - } + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.EMPTY); + if (rootQueue) { + ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, + node); + } + + break; + } + + LOG.info( + "assignedContainer" + " queue=" + getQueueName() + " usedCapacity=" + + getUsedCapacity() + " absoluteUsedCapacity=" + + getAbsoluteUsedCapacity() + " used=" + queueUsage.getUsed() + + " cluster=" + clusterResource); - // Do not assign more than one container if this isn't the root queue - // or if we've already assigned an off-switch container - if (!rootQueue || assignment.getType() == NodeType.OFF_SWITCH) { if (LOG.isDebugEnabled()) { - if (rootQueue && assignment.getType() == NodeType.OFF_SWITCH) { - LOG.debug("Not assigning more than one off-switch container," + - " assignments so far: " + assignment); + LOG.debug( + "ParentQ=" + getQueueName() + " assignedSoFarInThisIteration=" + + assignment.getResource() + " usedCapacity=" + + getUsedCapacity() + " absoluteUsedCapacity=" + + getAbsoluteUsedCapacity()); + } + + /* Do not try to allocate more than one container for each allocation + Let top scheduler make the decision + // Do not assign more than one container if this isn't the root queue + // or if we've already assigned an off-switch container + if (!rootQueue 
|| assignment.getType() == NodeType.OFF_SWITCH) { + if (LOG.isDebugEnabled()) { + if (rootQueue && assignment.getType() == NodeType.OFF_SWITCH) { + LOG.debug("Not assigning more than one off-switch container," + + " assignments so far: " + assignment); + } } + break; } + */ break; } - } - - return assignment; + + return assignment; + } + finally { + writeLock.unlock(); + } } + // FIXME: + // Only check next-node.available resource only at root queue, and check queue's + // available resource for partition private boolean canAssign(Resource clusterResource, FiCaSchedulerNode node) { + // Always return true when global scheduling enabled; + if (csContext.globalSchedulingEnabled()) { + return true; + } + // Two conditions need to meet when trying to allocate: // 1) Node doesn't have reserved container // 2) Node's available-resource + killable-resource should > 0 @@ -609,36 +676,27 @@ private ResourceLimits getResourceLimitsOfChild(CSQueue child, return new ResourceLimits(childLimit); } - private Iterator sortAndGetChildrenAllocationIterator(FiCaSchedulerNode node) { - if (node.getPartition().equals(RMNodeLabelsManager.NO_LABEL)) { - if (needToResortQueuesAtNextAllocation) { - // If we skipped resort queues last time, we need to re-sort queue - // before allocation - List childrenList = new ArrayList<>(childQueues); - childQueues.clear(); - childQueues.addAll(childrenList); - needToResortQueuesAtNextAllocation = false; - } - return childQueues.iterator(); - } - - partitionQueueComparator.setPartitionToLookAt(node.getPartition()); + private Iterator sortAndGetChildrenAllocationIterator(String partition) { + partitionQueueComparator.setPartitionToLookAt(partition); List childrenList = new ArrayList<>(childQueues); Collections.sort(childrenList, partitionQueueComparator); return childrenList.iterator(); } - + private synchronized CSAssignment assignContainersToChildQueues( - Resource cluster, FiCaSchedulerNode node, ResourceLimits limits, + Resource cluster, PlacementSet placementSet, ResourceLimits limits, SchedulingMode schedulingMode) { CSAssignment assignment = CSAssignment.NULL_ASSIGNMENT; + String partition = placementSet.getPartition(); + Resource parentLimits = limits.getLimit(); + printChildQueues(); // Try to assign to most 'under-served' sub-queue - for (Iterator iter = sortAndGetChildrenAllocationIterator(node); iter - .hasNext();) { + for (Iterator iter = sortAndGetChildrenAllocationIterator( + partition); iter.hasNext(); ) { CSQueue childQueue = iter.next(); if(LOG.isDebugEnabled()) { LOG.debug("Trying to assign to queue: " + childQueue.getQueuePath() @@ -646,12 +704,11 @@ private synchronized CSAssignment assignContainersToChildQueues( } // Get ResourceLimits of child queue before assign containers - ResourceLimits childLimits = - getResourceLimitsOfChild(childQueue, cluster, parentLimits, - node.getPartition()); - - CSAssignment childAssignment = childQueue.assignContainers(cluster, node, - childLimits, schedulingMode); + ResourceLimits childLimits = getResourceLimitsOfChild(childQueue, cluster, + parentLimits, partition); + + CSAssignment childAssignment = childQueue.assignContainers(cluster, + placementSet, childLimits, schedulingMode); if(LOG.isDebugEnabled()) { LOG.debug("Assigned to queue: " + childQueue.getQueuePath() + " stats: " + childQueue + " --> " + @@ -664,7 +721,8 @@ private synchronized CSAssignment assignContainersToChildQueues( childAssignment.getResource(), Resources.none())) { // Only update childQueues when we doing non-partitioned node // allocation. 
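Editor's note: sortAndGetChildrenAllocationIterator above no longer mutates the shared childQueues set; it copies the set into a list and sorts the copy with the partition-aware comparator on every allocation. That trades some per-allocation sorting cost for lock-free reads of the live set. A self-contained sketch of the snapshot-and-sort idiom:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Set;

    public class SnapshotSort {
      /** Sort a point-in-time copy; the live set is never reordered or locked. */
      static <Q> Iterator<Q> sortedChildren(Set<Q> childQueues,
          Comparator<Q> partitionComparator) {
        List<Q> snapshot = new ArrayList<>(childQueues);
        Collections.sort(snapshot, partitionComparator);
        return snapshot.iterator();
      }
    }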
- if (RMNodeLabelsManager.NO_LABEL.equals(node.getPartition())) { + /* We now sort queue everytime + if (RMNodeLabelsManager.NO_LABEL.equals(partition)) { // Remove and re-insert to sort iter.remove(); LOG.info("Re-sorting assigned queue: " + childQueue.getQueuePath() @@ -674,6 +732,7 @@ private synchronized CSAssignment assignContainersToChildQueues( printChildQueues(); } } + */ assignment = childAssignment; break; } else if (childAssignment.getSkippedType() == @@ -716,40 +775,51 @@ private void printChildQueues() { + " child-queues: " + getChildQueuesToPrint()); } } - - private synchronized void internalReleaseResource(Resource clusterResource, - FiCaSchedulerNode node, Resource releasedResource, boolean changeResource, - CSQueue completedChildQueue, boolean sortQueues) { - super.releaseResource(clusterResource, - releasedResource, node.getPartition(), - changeResource); - if (LOG.isDebugEnabled()) { - LOG.debug("completedContainer " + this + ", cluster=" + clusterResource); - } + private void internalReleaseResource(Resource clusterResource, + FiCaSchedulerNode node, Resource releasedResource, + boolean changeResource) { + try { + writeLock.lock(); + super.releaseResource(clusterResource, releasedResource, + node.getPartition(), changeResource); - // Note that this is using an iterator on the childQueues so this can't - // be called if already within an iterator for the childQueues. Like - // from assignContainersToChildQueues. - if (sortQueues) { - // reinsert the updated queue - for (Iterator iter = childQueues.iterator(); iter.hasNext();) { - CSQueue csqueue = iter.next(); - if (csqueue.equals(completedChildQueue)) { - iter.remove(); - if (LOG.isDebugEnabled()) { - LOG.debug("Re-sorting completed queue: " + csqueue); + if (LOG.isDebugEnabled()) { + LOG.debug( + "completedContainer " + this + ", cluster=" + clusterResource); + } + + // Note that this is using an iterator on the childQueues so this can't + // be called if already within an iterator for the childQueues. Like + // from assignContainersToChildQueues. + // TODO, we don't sort queue in any cases, need to figure out + // if it causes performance regression + /* + if (sortQueues) { + // reinsert the updated queue + for (Iterator iter = childQueues.iterator(); + iter.hasNext(); ) { + CSQueue csqueue = iter.next(); + if (csqueue.equals(completedChildQueue)) { + iter.remove(); + if (LOG.isDebugEnabled()) { + LOG.debug("Re-sorting completed queue: " + csqueue); + } + childQueues.add(csqueue); + break; } - childQueues.add(csqueue); - break; } } - } - // If we skipped sort queue this time, we need to resort queues to make - // sure we allocate from least usage (or order defined by queue policy) - // queues. - needToResortQueuesAtNextAllocation = !sortQueues; + // If we skipped sort queue this time, we need to resort queues to make + // sure we allocate from least usage (or order defined by queue policy) + // queues. 
+ needToResortQueuesAtNextAllocation = !sortQueues; + */ + } + finally { + writeLock.unlock(); + } } @Override @@ -761,8 +831,8 @@ public void decreaseContainer(Resource clusterResource, Resources.negate(decreaseRequest.getDeltaCapacity()); internalReleaseResource(clusterResource, - csContext.getNode(decreaseRequest.getNodeId()), absDeltaCapacity, false, - null, false); + csContext.getNode(decreaseRequest.getNodeId()), absDeltaCapacity, + false); // Inform the parent if (parent != null) { @@ -775,7 +845,7 @@ public void unreserveIncreasedContainer(Resource clusterResource, FiCaSchedulerApp app, FiCaSchedulerNode node, RMContainer rmContainer) { if (app != null) { internalReleaseResource(clusterResource, node, - rmContainer.getReservedResource(), false, null, false); + rmContainer.getReservedResource(), false); // Inform the parent if (parent != null) { @@ -793,8 +863,7 @@ public void completedContainer(Resource clusterResource, boolean sortQueues) { if (application != null) { internalReleaseResource(clusterResource, node, - rmContainer.getContainer().getResource(), false, completedChildQueue, - sortQueues); + rmContainer.getContainer().getResource(), false); // Inform the parent if (parent != null) { @@ -806,24 +875,33 @@ public void completedContainer(Resource clusterResource, } @Override - public synchronized void updateClusterResource(Resource clusterResource, + public void updateClusterResource(Resource clusterResource, ResourceLimits resourceLimits) { - // Update all children - for (CSQueue childQueue : childQueues) { - // Get ResourceLimits of child queue before assign containers - ResourceLimits childLimits = getResourceLimitsOfChild(childQueue, - clusterResource, resourceLimits.getLimit(), - RMNodeLabelsManager.NO_LABEL); - childQueue.updateClusterResource(clusterResource, childLimits); + try { + writeLock.lock(); + // Update all children + for (CSQueue childQueue : childQueues) { + // Get ResourceLimits of child queue before assign containers + ResourceLimits childLimits = getResourceLimitsOfChild(childQueue, + clusterResource, resourceLimits.getLimit(), RMNodeLabelsManager.NO_LABEL); + childQueue.updateClusterResource(clusterResource, childLimits); + } + + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, null); + } finally { + writeLock.unlock(); } - - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, null); } @Override - public synchronized List getChildQueues() { - return new ArrayList(childQueues); + public List getChildQueues() { + try { + readLock.lock(); + return new ArrayList(childQueues); + } finally { + readLock.unlock(); + } } @Override @@ -833,11 +911,14 @@ public void recoverContainer(Resource clusterResource, return; } // Careful! Locking order is important! 
- synchronized (this) { + try { + writeLock.lock(); FiCaSchedulerNode node = scheduler.getNode(rmContainer.getContainer().getNodeId()); allocateResource(clusterResource, rmContainer.getContainer().getResource(), node.getPartition(), false); + } finally { + writeLock.unlock(); } if (parent != null) { parent.recoverContainer(clusterResource, attempt, rmContainer); @@ -851,10 +932,15 @@ public ActiveUsersManager getActiveUsersManager() { } @Override - public synchronized void collectSchedulerApplications( + public void collectSchedulerApplications( Collection apps) { - for (CSQueue queue : childQueues) { - queue.collectSchedulerApplications(apps); + try { + readLock.lock(); + for (CSQueue queue : childQueues) { + queue.collectSchedulerApplications(apps); + } + } finally { + readLock.unlock(); } } @@ -897,44 +983,49 @@ public void detachContainer(Resource clusterResource, } } - public synchronized int getNumApplications() { + public int getNumApplications() { return numApplications; } - synchronized void allocateResource(Resource clusterResource, + void allocateResource(Resource clusterResource, Resource resource, String nodePartition, boolean changeContainerResource) { - super.allocateResource(clusterResource, resource, nodePartition, - changeContainerResource); - - /** - * check if we need to kill (killable) containers if maximum resource violated. - * Doing this because we will deduct killable resource when going from root. - * For example: - *
-     *      Root
-     *      /   \
-     *     a     b
-     *   /  \
-     *  a1  a2
-     * 
- * - * a: max=10G, used=10G, killable=2G - * a1: used=8G, killable=2G - * a2: used=2G, pending=2G, killable=0G - * - * When we get queue-a to allocate resource, even if queue-a - * reaches its max resource, we deduct its used by killable, so we can allocate - * at most 2G resources. ResourceLimits passed down to a2 has headroom set to 2G. - * - * If scheduler finds a 2G available resource in existing cluster, and assigns it - * to a2, now a2's used= 2G + 2G = 4G, and a's used = 8G + 4G = 12G > 10G - * - * When this happens, we have to preempt killable container (on same or different - * nodes) of parent queue to avoid violating parent's max resource. - */ - if (getQueueCapacities().getAbsoluteMaximumCapacity(nodePartition) - < getQueueCapacities().getAbsoluteUsedCapacity(nodePartition)) { - killContainersToEnforceMaxQueueCapacity(nodePartition, clusterResource); + try { + writeLock.lock(); + super.allocateResource(clusterResource, resource, nodePartition, + changeContainerResource); + + /** + * check if we need to kill (killable) containers if maximum resource violated. + * Doing this because we will deduct killable resource when going from root. + * For example: + *
+       *      Root
+       *      /   \
+       *     a     b
+       *   /  \
+       *  a1  a2
+       * 
+ * + * a: max=10G, used=10G, killable=2G + * a1: used=8G, killable=2G + * a2: used=2G, pending=2G, killable=0G + * + * When we get queue-a to allocate resource, even if queue-a + * reaches its max resource, we deduct its used by killable, so we can allocate + * at most 2G resources. ResourceLimits passed down to a2 has headroom set to 2G. + * + * If scheduler finds a 2G available resource in existing cluster, and assigns it + * to a2, now a2's used= 2G + 2G = 4G, and a's used = 8G + 4G = 12G > 10G + * + * When this happens, we have to preempt killable container (on same or different + * nodes) of parent queue to avoid violating parent's max resource. + */ + if (getQueueCapacities().getAbsoluteMaximumCapacity(nodePartition) + < getQueueCapacities().getAbsoluteUsedCapacity(nodePartition)) { + killContainersToEnforceMaxQueueCapacity(nodePartition, clusterResource); + } + } finally { + writeLock.unlock(); } } @@ -975,4 +1066,43 @@ private void killContainersToEnforceMaxQueueCapacity(String partition, } } } + + public void applyResourceCommitRequest(Resource cluster, + ResourceCommitRequest request) { + if (request.anythingAllocatedOrReserved()) { + ContainerAllocationContext + allocation = request.getFirstAllocatedOrReservedContainer(); + SchedulerContainer + schedulerContainer = allocation.getAllocatedOrReservedContainer(); + + // Do not modify queue when allocation from reserved container + if (allocation.getAllocateFromReservedContainer() == null) { + try { + writeLock.lock(); + // Book-keeping + // Note: Update headroom to account for current allocation too... + allocateResource(cluster, + allocation.getAllocatedOrReservedResource(), + schedulerContainer.getNodePartition(), + allocation.isIncreasedAllocation()); + + /* + * TODO, update assignment information + */ + + LOG.info("assignedContainer" + " queue=" + getQueueName() + + " usedCapacity=" + getUsedCapacity() + " absoluteUsedCapacity=" + + getAbsoluteUsedCapacity() + " used=" + queueUsage.getUsed() + + " cluster=" + cluster); + } + finally { + writeLock.unlock(); + } + } + } + + if (parent != null) { + ((ParentQueue) parent).applyResourceCommitRequest(cluster, request); + } + } } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java index fa13df4..99eaf00 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java @@ -24,6 +24,7 @@ import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesLogger; import 
org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesManager; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityDiagnosticConstant; @@ -31,6 +32,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAssignment; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; @@ -65,7 +67,7 @@ public AbstractContainerAllocator(FiCaSchedulerApp application, protected CSAssignment getCSAssignmentFromAllocateResult( Resource clusterResource, ContainerAllocation result, - RMContainer rmContainer, FiCaSchedulerNode node) { + RMContainer rmContainer, SchedulerNode node) { // Handle skipped CSAssignment.SkippedType skipped = (result.getAllocationState() == AllocationState.APP_SKIPPED) ? @@ -81,7 +83,7 @@ protected CSAssignment getCSAssignmentFromAllocateResult( if (Resources.greaterThan(rc, clusterResource, result.getResourceToBeAllocated(), Resources.none())) { Resource allocatedResource = result.getResourceToBeAllocated(); - Container updatedContainer = result.getUpdatedContainer(); + RMContainer updatedContainer = result.getUpdatedContainer(); assignment.setResource(allocatedResource); assignment.setType(result.getContainerNodeType()); @@ -92,7 +94,7 @@ protected CSAssignment getCSAssignmentFromAllocateResult( + application.getApplicationId() + " resource=" + allocatedResource + " queue=" + this.toString() + " cluster=" + clusterResource); assignment.getAssignmentInformation().addReservationDetails( - updatedContainer.getId(), + updatedContainer, application.getCSLeafQueue().getQueuePath()); assignment.getAssignmentInformation().incrReservations(); Resources.addTo(assignment.getAssignmentInformation().getReserved(), @@ -100,7 +102,7 @@ protected CSAssignment getCSAssignmentFromAllocateResult( if (rmContainer != null) { ActivitiesLogger.APP.recordAppActivityWithAllocation( - activitiesManager, node, application, updatedContainer, + activitiesManager, node, application, rmContainer, ActivityState.RE_RESERVED); ActivitiesLogger.APP.finishSkippedAppAllocationRecording( activitiesManager, application.getApplicationId(), @@ -111,25 +113,30 @@ protected CSAssignment getCSAssignmentFromAllocateResult( ActivityState.RESERVED); ActivitiesLogger.APP.finishAllocatedAppAllocationRecording( activitiesManager, application.getApplicationId(), - updatedContainer.getId(), ActivityState.RESERVED, + updatedContainer.getContainerId(), ActivityState.RESERVED, ActivityDiagnosticConstant.EMPTY); } } else if (result.getAllocationState() == AllocationState.ALLOCATED){ // This is a new container // Inform the ordering policy - LOG.info("assignedContainer" + " application attempt=" - + application.getApplicationAttemptId() + " container=" - + updatedContainer.getId() + " queue=" + this + " clusterResource=" + LOG.info("assignedContainer" + " application attempt=" + application + .getApplicationAttemptId() + " container=" + updatedContainer + .getContainerId() + " queue=" + this + " clusterResource=" + clusterResource + " type=" + assignment.getType()); + /* + * TODO, fix this + */ + /* application 
.getCSLeafQueue() .getOrderingPolicy() .containerAllocated(application, - application.getRMContainer(updatedContainer.getId())); + application.getRMContainer(updatedContainer.getContainerId())); + */ assignment.getAssignmentInformation().addAllocationDetails( - updatedContainer.getId(), + updatedContainer, application.getCSLeafQueue().getQueuePath()); assignment.getAssignmentInformation().incrAllocations(); Resources.addTo(assignment.getAssignmentInformation().getAllocated(), @@ -137,13 +144,14 @@ protected CSAssignment getCSAssignmentFromAllocateResult( if (rmContainer != null) { assignment.setFulfilledReservation(true); + assignment.setFulfilledReservedContainer(rmContainer); } ActivitiesLogger.APP.recordAppActivityWithAllocation(activitiesManager, node, application, updatedContainer, ActivityState.ALLOCATED); ActivitiesLogger.APP.finishAllocatedAppAllocationRecording( activitiesManager, application.getApplicationId(), - updatedContainer.getId(), ActivityState.ACCEPTED, + updatedContainer.getContainerId(), ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY); } @@ -172,6 +180,6 @@ protected CSAssignment getCSAssignmentFromAllocateResult( * */ public abstract CSAssignment assignContainers(Resource clusterResource, - FiCaSchedulerNode node, SchedulingMode schedulingMode, + PlacementSet placementSet, SchedulingMode schedulingMode, ResourceLimits resourceLimits, RMContainer reservedContainer); } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java index 8f749f6..6ace801 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java @@ -23,6 +23,8 @@ import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.Resources; import java.util.List; @@ -58,8 +60,9 @@ AllocationState state; NodeType containerNodeType = NodeType.NODE_LOCAL; NodeType requestNodeType = NodeType.NODE_LOCAL; - Container updatedContainer; + RMContainer updatedContainer; private List toKillContainers; + FiCaSchedulerNode nodeToAllocate; public ContainerAllocation(RMContainer containerToBeUnreserved, Resource resourceToBeAllocated, AllocationState state) { @@ -87,7 +90,7 @@ public NodeType getContainerNodeType() { return containerNodeType; } - public Container getUpdatedContainer() { + public RMContainer getUpdatedContainer() { return updatedContainer; } diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocator.java index 4eaa24b..6231b90 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocator.java @@ -26,8 +26,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAssignment; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; @@ -52,17 +52,17 @@ public ContainerAllocator(FiCaSchedulerApp application, ResourceCalculator rc, @Override public CSAssignment assignContainers(Resource clusterResource, - FiCaSchedulerNode node, SchedulingMode schedulingMode, + PlacementSet placementSet, SchedulingMode schedulingMode, ResourceLimits resourceLimits, RMContainer reservedContainer) { if (reservedContainer != null) { if (reservedContainer.getState() == RMContainerState.RESERVED) { // It's a regular container return regularContainerAllocator.assignContainers(clusterResource, - node, schedulingMode, resourceLimits, reservedContainer); + placementSet, schedulingMode, resourceLimits, reservedContainer); } else { // It's a increase container return increaseContainerAllocator.assignContainers(clusterResource, - node, schedulingMode, resourceLimits, reservedContainer); + placementSet, schedulingMode, resourceLimits, reservedContainer); } } else { /* @@ -70,14 +70,16 @@ public CSAssignment assignContainers(Resource clusterResource, * anything, we will try to allocate regular container */ CSAssignment assign = - increaseContainerAllocator.assignContainers(clusterResource, node, + increaseContainerAllocator.assignContainers(clusterResource, + placementSet, schedulingMode, resourceLimits, null); if (Resources.greaterThan(rc, clusterResource, assign.getResource(), Resources.none())) { return assign; } - return regularContainerAllocator.assignContainers(clusterResource, node, + return regularContainerAllocator.assignContainers(clusterResource, + placementSet, schedulingMode, resourceLimits, null); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/IncreaseContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/IncreaseContainerAllocator.java index 509dfba..283b905 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/IncreaseContainerAllocator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/IncreaseContainerAllocator.java @@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAssignment; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; @@ -76,7 +77,7 @@ private CSAssignment createReservedIncreasedCSAssignment( request.getDeltaCapacity()); assignment.getAssignmentInformation().incrReservations(); assignment.getAssignmentInformation().addReservationDetails( - request.getContainerId(), application.getCSLeafQueue().getQueuePath()); + request.getRMContainer(), application.getCSLeafQueue().getQueuePath()); assignment.setIncreasedAllocation(true); LOG.info("Reserved increase container request:" + request.toString()); @@ -93,7 +94,7 @@ private CSAssignment createSuccessfullyIncreasedCSAssignment( request.getDeltaCapacity()); assignment.getAssignmentInformation().incrAllocations(); assignment.getAssignmentInformation().addAllocationDetails( - request.getContainerId(), application.getCSLeafQueue().getQueuePath()); + request.getRMContainer(), application.getCSLeafQueue().getQueuePath()); assignment.setIncreasedAllocation(true); // notify application @@ -175,17 +176,23 @@ private CSAssignment allocateIncreaseRequest(FiCaSchedulerNode node, @Override public CSAssignment assignContainers(Resource clusterResource, - FiCaSchedulerNode node, SchedulingMode schedulingMode, + PlacementSet placementSet, SchedulingMode schedulingMode, ResourceLimits resourceLimits, RMContainer reservedContainer) { AppSchedulingInfo sinfo = application.getAppSchedulingInfo(); - NodeId nodeId = node.getNodeID(); + if (null == placementSet.getNextAvailable()) { + // TODO, fix IncreaseContainerAllocator to be able to schedule for + // global scheduling + return CSAssignment.SKIP_ASSIGNMENT; + } + + NodeId nodeId = placementSet.getNextAvailable().getNodeID(); if (reservedContainer == null) { // Do we have increase request on this node? if (!sinfo.hasIncreaseRequest(nodeId)) { if (LOG.isDebugEnabled()) { LOG.debug("Skip allocating increase request since we don't have any" - + " increase request on this node=" + node.getNodeID()); + + " increase request on this node=" + nodeId); } return CSAssignment.SKIP_ASSIGNMENT; @@ -294,7 +301,8 @@ public CSAssignment assignContainers(Resource clusterResource, } if (!Resources.fitsIn(rc, clusterResource, - increaseRequest.getTargetCapacity(), node.getTotalResource())) { + increaseRequest.getTargetCapacity(), + placementSet.getNextAvailable().getTotalResource())) { // if the target capacity is more than what the node can offer, we // will simply remove and skip it. 
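// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the check above
// drops an increase request whose target capacity exceeds the total resource
// of the next available node, since such a request can never be satisfied
// there. A self-contained model of that guard; the two-field Res record is a
// hypothetical stand-in for the YARN Resource/fitsIn machinery:
class IncreaseFitSketch {
  record Res(long memoryMb, int vcores) {}

  // Target capacity fits only if every dimension is within the node's total.
  static boolean fitsIn(Res target, Res nodeTotal) {
    return target.memoryMb() <= nodeTotal.memoryMb()
        && target.vcores() <= nodeTotal.vcores();
  }

  public static void main(String[] args) {
    // An 8 GB target on a node with 6 GB total can never succeed, so the
    // request is removed instead of being retried on every heartbeat.
    System.out.println(fitsIn(new Res(8192, 2), new Res(6144, 8))); // false
  }
}
// ---------------------------------------------------------------------------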
// The reason of doing check here instead of adding increase request @@ -302,17 +310,19 @@ public CSAssignment assignContainers(Resource clusterResource, // request added. if (LOG.isDebugEnabled()) { LOG.debug(" Target capacity is more than what node can offer," - + " node.resource=" + node.getTotalResource()); + + " node.resource=" + placementSet.getNextAvailable() + .getTotalResource()); } toBeRemovedRequests.add(increaseRequest); continue; } // Try to allocate the increase request - assigned = - allocateIncreaseRequest(node, clusterResource, increaseRequest); - if (assigned.getSkippedType() - == CSAssignment.SkippedType.NONE) { + assigned = allocateIncreaseRequest( + (FiCaSchedulerNode) placementSet.getNextAvailable(), + clusterResource, + increaseRequest); + if (assigned.getSkippedType() == CSAssignment.SkippedType.NONE) { // When we don't skip this request, which means we either allocated // OR reserved this request. We will break break; @@ -364,9 +374,9 @@ public CSAssignment assignContainers(Resource clusterResource, // We don't need this container now, just return excessive reservation return new CSAssignment(application, reservedContainer); } - - return allocateIncreaseRequestFromReservedContainer(node, clusterResource, - request); + + return allocateIncreaseRequestFromReservedContainer( + placementSet.getNextAvailable(), clusterResource, request); } } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java index 8d4042c..f53d1f1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,8 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.allocator; -import java.util.ArrayList; -import java.util.List; +import org.apache.commons.collections.IteratorUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; @@ -34,9 +33,11 @@ import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppUtils; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerRequestKey; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; @@ -48,19 +49,27 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAMContainerLaunchDiagnosticsConstants; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAssignment; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.scorer.SchedulerNodesScorer; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.scorer.SchedulerNodesScorerCache; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + /** * Allocate normal (new) containers, considers locality/label, etc. Using * delayed scheduling mechanism to get better locality allocation. 
*/ public class RegularContainerAllocator extends AbstractContainerAllocator { - private static final Log LOG = LogFactory.getLog(RegularContainerAllocator.class); - + private static final Log LOG = LogFactory.getLog( + RegularContainerAllocator.class); + private ResourceRequest lastResourceRequest = null; public RegularContainerAllocator(FiCaSchedulerApp application, @@ -68,40 +77,32 @@ public RegularContainerAllocator(FiCaSchedulerApp application, ActivitiesManager activitiesManager) { super(application, rc, rmContext, activitiesManager); } - - private boolean checkHeadroom(Resource clusterResource, + + private boolean checkHeadroomForPartition(Resource clusterResource, ResourceLimits currentResourceLimits, Resource required, - FiCaSchedulerNode node) { + String partition) { // If headroom + currentReservation < required, we cannot allocate this // require Resource resourceCouldBeUnReserved = application.getCurrentReservation(); if (!application.getCSLeafQueue().getReservationContinueLooking() - || !node.getPartition().equals(RMNodeLabelsManager.NO_LABEL)) { + || !partition.equals(RMNodeLabelsManager.NO_LABEL)) { // If we don't allow reservation continuous looking, OR we're looking at // non-default node partition, we won't allow to unreserve before // allocation. resourceCouldBeUnReserved = Resources.none(); } - return Resources.greaterThanOrEqual(rc, clusterResource, Resources.add( - currentResourceLimits.getHeadroom(), resourceCouldBeUnReserved), + return Resources.greaterThanOrEqual(rc, clusterResource, Resources + .add(currentResourceLimits.getHeadroom(), resourceCouldBeUnReserved), required); } - private ContainerAllocation preCheckForNewContainer(Resource clusterResource, - FiCaSchedulerNode node, SchedulingMode schedulingMode, - ResourceLimits resourceLimits, SchedulerRequestKey schedulerKey) { + String partition, SchedulingMode schedulingMode, + ResourceLimits resourceLimits, SchedulerRequestKey schedulerKey, + PlacementSet placementSet) { + SchedulerNode node = placementSet.getNextAvailable(); Priority priority = schedulerKey.getPriority(); - if (SchedulerAppUtils.isPlaceBlacklisted(application, node, LOG)) { - application.updateAppSkipNodeDiagnostics( - CSAMContainerLaunchDiagnosticsConstants.SKIP_AM_ALLOCATION_IN_BLACK_LISTED_NODE); - ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation( - activitiesManager, node, application, priority, - ActivityDiagnosticConstant.SKIP_BLACK_LISTED_NODE); - return ContainerAllocation.APP_SKIPPED; - } - ResourceRequest anyRequest = application.getResourceRequest(schedulerKey, ResourceRequest.ANY); if (null == anyRequest) { @@ -127,8 +128,8 @@ private ContainerAllocation preCheckForNewContainer(Resource clusterResource, if (schedulingMode == SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY) { if (application.isWaitingForAMContainer()) { if (LOG.isDebugEnabled()) { - LOG.debug("Skip allocating AM container to app_attempt=" - + application.getApplicationAttemptId() + LOG.debug("Skip allocating AM container to app_attempt=" + application + .getApplicationAttemptId() + ", don't allow to allocate AM container in non-exclusive mode"); } application.updateAppSkipNodeDiagnostics( @@ -144,7 +145,7 @@ private ContainerAllocation preCheckForNewContainer(Resource clusterResource, // matches the node's label? // If not match, jump to next priority.
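// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch):
// preCheckForNewContainer now matches the request's node-label expression
// against the placement set's partition rather than a single node's. A
// self-contained model of the matching rule as described by the surrounding
// code (my reading, not the authoritative SchedulerUtils implementation):
class PartitionMatchSketch {
  static final String NO_LABEL = "";

  enum Mode { RESPECT_PARTITION_EXCLUSIVITY, IGNORE_PARTITION_EXCLUSIVITY }

  static boolean matches(String requestedLabel, String partition, Mode mode) {
    if (mode == Mode.RESPECT_PARTITION_EXCLUSIVITY) {
      // Normal scheduling: the request must target exactly this partition.
      return requestedLabel.equals(partition);
    }
    // Non-exclusive scheduling: only unlabeled (default-partition) requests
    // may borrow another partition's idle resources.
    return requestedLabel.equals(NO_LABEL);
  }
}
// ---------------------------------------------------------------------------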
if (!SchedulerUtils.checkResourceRequestMatchingNodePartition( - anyRequest.getNodeLabelExpression(), node.getPartition(), + anyRequest.getNodeLabelExpression(), placementSet.getPartition(), schedulingMode)) { ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation( activitiesManager, node, application, priority, @@ -165,7 +166,8 @@ private ContainerAllocation preCheckForNewContainer(Resource clusterResource, } } - if (!checkHeadroom(clusterResource, resourceLimits, required, node)) { + if (!checkHeadroomForPartition(clusterResource, resourceLimits, required, + partition)) { if (LOG.isDebugEnabled()) { LOG.debug("cannot allocate required resource=" + required + " because of headroom"); @@ -183,8 +185,8 @@ private ContainerAllocation preCheckForNewContainer(Resource clusterResource, // This is to make sure non-partitioned-resource-request will prefer // to be allocated to non-partitioned nodes int missedNonPartitionedRequestSchedulingOpportunity = 0; - if (anyRequest.getNodeLabelExpression() - .equals(RMNodeLabelsManager.NO_LABEL)) { + if (anyRequest.getNodeLabelExpression().equals( + RMNodeLabelsManager.NO_LABEL)) { missedNonPartitionedRequestSchedulingOpportunity = application.addMissedNonPartitionedRequestSchedulingOpportunity( schedulerKey); @@ -210,7 +212,7 @@ private ContainerAllocation preCheckForNewContainer(Resource clusterResource, return ContainerAllocation.APP_SKIPPED; } } - + return null; } @@ -218,16 +220,7 @@ ContainerAllocation preAllocation(Resource clusterResource, FiCaSchedulerNode node, SchedulingMode schedulingMode, ResourceLimits resourceLimits, SchedulerRequestKey schedulerKey, RMContainer reservedContainer) { - ContainerAllocation result; - if (null == reservedContainer) { - // pre-check when allocating new container - result = - preCheckForNewContainer(clusterResource, node, schedulingMode, - resourceLimits, schedulerKey); - if (null != result) { - return result; - } - } else { + if (null != reservedContainer) { // pre-check when allocating reserved container if (application.getTotalRequiredResources(schedulerKey) == 0) { // Release @@ -236,18 +229,20 @@ ContainerAllocation preAllocation(Resource clusterResource, } } + ContainerAllocation result; // Try to allocate containers on node result = assignContainersOnNode(clusterResource, node, schedulerKey, reservedContainer, schedulingMode, resourceLimits); - + if (null == reservedContainer) { if (result.state == AllocationState.PRIORITY_SKIPPED) { // Don't count 'skipped nodes' as a scheduling opportunity! application.subtractSchedulingOpportunity(schedulerKey); } } - + result.nodeToAllocate = node; + return result; } @@ -256,15 +251,15 @@ public synchronized float getLocalityWaitFactor( // Estimate: Required unique resources (i.e. hosts + racks) int requiredResources = Math.max(application.getResourceRequests(schedulerKey).size() - 1, 0); - + // waitFactor can't be more than '1' // i.e. 
no point skipping more than clustersize opportunities - return Math.min(((float)requiredResources / clusterNodes), 1.0f); + return Math.min(((float) requiredResources / clusterNodes), 1.0f); } - + private int getActualNodeLocalityDelay() { - return Math.min(rmContext.getScheduler().getNumClusterNodes(), application - .getCSLeafQueue().getNodeLocalityDelay()); + return Math.min(rmContext.getScheduler().getNumClusterNodes(), + application.getCSLeafQueue().getNodeLocalityDelay()); } private boolean canAssign(SchedulerRequestKey schedulerKey, @@ -382,6 +377,15 @@ private ContainerAllocation assignContainersOnNode(Resource clusterResource, ResourceLimits currentResoureLimits) { Priority priority = schedulerKey.getPriority(); + if (SchedulerAppUtils.isPlaceBlacklisted(application, node, LOG)) { + application.updateAppSkipNodeDiagnostics( + CSAMContainerLaunchDiagnosticsConstants.SKIP_AM_ALLOCATION_IN_BLACK_LISTED_NODE); + ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation( + activitiesManager, node, application, priority, + ActivityDiagnosticConstant.SKIP_BLACK_LISTED_NODE); + return ContainerAllocation.APP_SKIPPED; + } + ContainerAllocation allocation; NodeType requestType = null; @@ -447,7 +451,7 @@ private ContainerAllocation assignContainersOnNode(Resource clusterResource, node, schedulerKey, reservedContainer, schedulingMode, currentResoureLimits); allocation.requestNodeType = requestType; - + // When a returned allocation is LOCALITY_SKIPPED, since we're in // off-switch request now, we will skip this app w.r.t priorities if (allocation.state == AllocationState.LOCALITY_SKIPPED) { @@ -468,7 +472,7 @@ private ContainerAllocation assignContainer(Resource clusterResource, SchedulingMode schedulingMode, ResourceLimits currentResoureLimits) { Priority priority = schedulerKey.getPriority(); lastResourceRequest = request; - + if (LOG.isDebugEnabled()) { LOG.debug("assignContainers: node=" + node.getNodeName() + " application=" + application.getApplicationId() @@ -495,8 +499,8 @@ private ContainerAllocation assignContainer(Resource clusterResource, Resource available = node.getUnallocatedResource(); Resource totalResource = node.getTotalResource(); - if (!Resources.lessThanOrEqual(rc, clusterResource, - capability, totalResource)) { + if (!Resources.lessThanOrEqual(rc, clusterResource, capability, + totalResource)) { LOG.warn("Node : " + node.getNodeID() + " does not have sufficient resource for request : " + request + " node total capability : " + node.getTotalResource()); @@ -516,14 +520,12 @@ private ContainerAllocation assignContainer(Resource clusterResource, // How much need to unreserve equals to: // max(required - headroom, amountNeedUnreserve) - Resource resourceNeedToUnReserve = - Resources.max(rc, clusterResource, - Resources.subtract(capability, currentResoureLimits.getHeadroom()), - currentResoureLimits.getAmountNeededUnreserve()); + Resource resourceNeedToUnReserve = Resources.max(rc, clusterResource, + Resources.subtract(capability, currentResoureLimits.getHeadroom()), + currentResoureLimits.getAmountNeededUnreserve()); - boolean needToUnreserve = - Resources.greaterThan(rc, clusterResource, - resourceNeedToUnReserve, Resources.none()); + boolean needToUnreserve = Resources.greaterThan(rc, clusterResource, + resourceNeedToUnReserve, Resources.none()); RMContainer unreservedContainer = null; boolean reservationsContinueLooking = @@ -533,18 +535,16 @@ private ContainerAllocation assignContainer(Resource clusterResource, List toKillContainers = null; if (availableContainers 
== 0 && currentResoureLimits.isAllowPreemption()) { Resource availableAndKillable = Resources.clone(available); - for (RMContainer killableContainer : node - .getKillableContainers().values()) { + for (RMContainer killableContainer : node.getKillableContainers() + .values()) { if (null == toKillContainers) { toKillContainers = new ArrayList<>(); } toKillContainers.add(killableContainer); Resources.addTo(availableAndKillable, - killableContainer.getAllocatedResource()); - if (Resources.fitsIn(rc, - clusterResource, - capability, - availableAndKillable)) { + killableContainer.getAllocatedResource()); + if (Resources.fitsIn(rc, clusterResource, capability, + availableAndKillable)) { // Stop if we find enough spaces availableContainers = 1; break; @@ -556,8 +556,8 @@ private ContainerAllocation assignContainer(Resource clusterResource, // Allocate... // We will only do continuous reservation when this is not allocated from // reserved container - if (rmContainer == null && reservationsContinueLooking - && node.getLabels().isEmpty()) { + if (rmContainer == null && reservationsContinueLooking && node.getLabels() + .isEmpty()) { // when reservationsContinueLooking is set, we may need to unreserve // some containers to meet this queue, its parents', or the users' // resource limits. @@ -572,9 +572,11 @@ private ContainerAllocation assignContainer(Resource clusterResource, // under the limit. resourceNeedToUnReserve = capability; } + unreservedContainer = application.findNodeToUnreserve(clusterResource, node, schedulerKey, resourceNeedToUnReserve); + // When (minimum-unreserved-resource > 0 OR we cannot allocate // new/reserved // container (That means we *have to* unreserve some resource to @@ -590,9 +592,8 @@ private ContainerAllocation assignContainer(Resource clusterResource, } } - ContainerAllocation result = - new ContainerAllocation(unreservedContainer, request.getCapability(), - AllocationState.ALLOCATED); + ContainerAllocation result = new ContainerAllocation(unreservedContainer, + request.getCapability(), AllocationState.ALLOCATED); result.containerNodeType = type; result.setToKillContainers(toKillContainers); return result; @@ -612,13 +613,12 @@ private ContainerAllocation assignContainer(Resource clusterResource, ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation( activitiesManager, node, application, priority, ActivityDiagnosticConstant.LOCALITY_SKIPPED); - return ContainerAllocation.LOCALITY_SKIPPED; + return ContainerAllocation.LOCALITY_SKIPPED; } } - ContainerAllocation result = - new ContainerAllocation(null, request.getCapability(), - AllocationState.RESERVED); + ContainerAllocation result = new ContainerAllocation(null, + request.getCapability(), AllocationState.RESERVED); result.containerNodeType = type; result.setToKillContainers(null); return result; @@ -627,7 +627,7 @@ private ContainerAllocation assignContainer(Resource clusterResource, ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation( activitiesManager, node, application, priority, ActivityDiagnosticConstant.LOCALITY_SKIPPED); - return ContainerAllocation.LOCALITY_SKIPPED; + return ContainerAllocation.LOCALITY_SKIPPED; } } @@ -661,7 +661,7 @@ boolean shouldAllocOrReserveNewContainer( } return (((starvation + requiredContainers) - reservedContainers) > 0); } - + private Container getContainer(RMContainer rmContainer, FiCaSchedulerNode node, Resource capability, SchedulerRequestKey schedulerKey) { @@ -673,27 +673,31 @@ private Container createContainer(FiCaSchedulerNode node, Resource capability, 
SchedulerRequestKey schedulerKey) { NodeId nodeId = node.getRMNode().getNodeID(); - ContainerId containerId = - BuilderUtils.newContainerId(application.getApplicationAttemptId(), - application.getNewContainerId()); // Create the container - return BuilderUtils.newContainer(containerId, nodeId, + return BuilderUtils.newContainer(null, nodeId, node.getRMNode().getHttpAddress(), capability, schedulerKey.getPriority(), null, schedulerKey.getAllocationRequestId()); } - + + /* + * Add a new acceptResourceCommitRequest method to write used resource, reserve container, etc. + * + * And when acceptResourceCommitRequest fails, ResourceRequests will be recovered + */ private ContainerAllocation handleNewContainerAllocation( ContainerAllocation allocationResult, FiCaSchedulerNode node, SchedulerRequestKey schedulerKey, RMContainer reservedContainer, Container container) { + /* // Handling container allocation // Did we previously reserve containers at this 'priority'? if (reservedContainer != null) { application.unreserve(schedulerKey, node, reservedContainer); } - + */ + // Inform the application RMContainer allocatedContainer = application.allocate(allocationResult.containerNodeType, node, @@ -711,14 +715,18 @@ private ContainerAllocation handleNewContainerAllocation( return ret; } + allocationResult.updatedContainer = allocatedContainer; + + /* // Inform the node node.allocateContainer(allocatedContainer); - + // update locality statistics application.incNumAllocatedContainers(allocationResult.containerNodeType, allocationResult.requestNodeType); - - return allocationResult; + */ + + return allocationResult; } ContainerAllocation doAllocation(ContainerAllocation allocationResult, @@ -731,8 +739,8 @@ ContainerAllocation doAllocation(ContainerAllocation allocationResult, // something went wrong getting/creating the container if (container == null) { - application - .updateAppSkipNodeDiagnostics("Scheduling of container failed. "); + application.updateAppSkipNodeDiagnostics( + "Scheduling of container failed. "); LOG.warn("Couldn't get container for allocation!"); ActivitiesLogger.APP.recordAppActivityWithoutAllocation(activitiesManager, node, application, schedulerKey.getPriority(), @@ -747,10 +755,14 @@ ContainerAllocation doAllocation(ContainerAllocation allocationResult, handleNewContainerAllocation(allocationResult, node, schedulerKey, reservedContainer, container); } else { - // When reserving container - application.reserve(schedulerKey, node, reservedContainer, container); + RMContainer updatedContainer = reservedContainer; + if (updatedContainer == null) { + updatedContainer = new RMContainerImpl(container, + application.getApplicationAttemptId(), node.getNodeID(), + application.getAppSchedulingInfo().getUser(), rmContext); + } + allocationResult.updatedContainer = updatedContainer; } - allocationResult.updatedContainer = container; // Only reset opportunities when we FIRST allocate the container. 
(IAW, When // reservedContainer != null, it's not the first time) @@ -788,53 +800,85 @@ ContainerAllocation doAllocation(ContainerAllocation allocationResult, } private ContainerAllocation allocate(Resource clusterResource, - FiCaSchedulerNode node, SchedulingMode schedulingMode, + PlacementSet placementSet, SchedulingMode schedulingMode, ResourceLimits resourceLimits, SchedulerRequestKey schedulerKey, RMContainer reservedContainer) { - ContainerAllocation result = - preAllocation(clusterResource, node, schedulingMode, resourceLimits, - schedulerKey, reservedContainer); + // Check partition resource + if (null == reservedContainer) { + // pre-check when allocating new container + ContainerAllocation result = preCheckForNewContainer(clusterResource, + placementSet.getPartition(), schedulingMode, resourceLimits, + schedulerKey, placementSet); + if (null != result) { + return result; + } + } + + // When trying to allocate reserved container, only look at reserved node, + // otherwise look at nodes ordered by scorer + Iterator iter; + if (null == reservedContainer && application.getCapacitySchedulerContext() + .globalSchedulingEnabled()) { + SchedulerNodesScorer scorer = + SchedulerNodesScorerCache.getOrCreateScorer(application, schedulerKey); + iter = scorer.scorePlacementSet(placementSet); + } else { + iter = IteratorUtils.singletonIterator( + placementSet.getNextAvailable()); + } + + ContainerAllocation result = ContainerAllocation.PRIORITY_SKIPPED; + while (iter.hasNext()) { + FiCaSchedulerNode node = iter.next(); + + // FIXME: part of the preAllocation can be extracted to avoid duplicated + // check for resource-requests across nodes. + result = preAllocation(clusterResource, node, schedulingMode, + resourceLimits, schedulerKey, reservedContainer); - if (AllocationState.ALLOCATED == result.state - || AllocationState.RESERVED == result.state) { - result = doAllocation(result, node, schedulerKey, reservedContainer); + if (AllocationState.ALLOCATED == result.state + || AllocationState.RESERVED == result.state) { + result = doAllocation(result, node, schedulerKey, reservedContainer); + break; + } } return result; } - + @Override public CSAssignment assignContainers(Resource clusterResource, - FiCaSchedulerNode node, SchedulingMode schedulingMode, - ResourceLimits resourceLimits, - RMContainer reservedContainer) { + PlacementSet placementSet, SchedulingMode schedulingMode, + ResourceLimits resourceLimits, RMContainer reservedContainer) { + SchedulerNode node = placementSet.getNextAvailable(); + if (reservedContainer == null) { // Check if application needs more resource, skip if it doesn't need more. 
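// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): assignContainers
// walks the application's scheduler keys in priority order, continues past
// keys whose attempt comes back PRIORITY_SKIPPED, and stops at the first key
// that produces an allocation or reservation. A self-contained model of that
// control flow with hypothetical types:
import java.util.List;
import java.util.Optional;
import java.util.function.IntFunction;

class PriorityLoopSketch {
  enum State { ALLOCATED, RESERVED, PRIORITY_SKIPPED }

  static Optional<State> assign(List<Integer> schedulerKeysByPriority,
      IntFunction<State> tryAllocate) {
    for (int key : schedulerKeysByPriority) {
      State result = tryAllocate.apply(key);
      if (result == State.PRIORITY_SKIPPED) {
        continue; // nothing doable at this priority; try the next one
      }
      return Optional.of(result); // allocated or reserved: stop here
    }
    return Optional.empty(); // every priority skipped => skip the app
  }
}
// ---------------------------------------------------------------------------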
if (!application.hasPendingResourceRequest(rc, - node.getPartition(), clusterResource, schedulingMode)) { + placementSet.getPartition(), clusterResource, + schedulingMode)) { if (LOG.isDebugEnabled()) { LOG.debug("Skip app_attempt=" + application.getApplicationAttemptId() + ", because it doesn't need more resource, schedulingMode=" - + schedulingMode.name() + " node-label=" + node.getPartition()); + + schedulingMode.name() + " node-label=" + placementSet + .getPartition()); } ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation( activitiesManager, node, application, application.getPriority(), ActivityDiagnosticConstant.APPLICATION_DO_NOT_NEED_RESOURCE); return CSAssignment.SKIP_ASSIGNMENT; } - + // Schedule in priority order for (SchedulerRequestKey schedulerKey : application.getSchedulerKeys()) { - ContainerAllocation result = - allocate(clusterResource, node, schedulingMode, resourceLimits, - schedulerKey, null); + ContainerAllocation result = allocate(clusterResource, placementSet, + schedulingMode, resourceLimits, schedulerKey, null); AllocationState allocationState = result.getAllocationState(); if (allocationState == AllocationState.PRIORITY_SKIPPED) { continue; } - return getCSAssignmentFromAllocateResult(clusterResource, result, - null, node); + return getCSAssignmentFromAllocateResult(clusterResource, result, null, node); } // We will reach here if we skipped all priorities of the app, so we will @@ -844,9 +888,8 @@ public CSAssignment assignContainers(Resource clusterResource, ActivityDiagnosticConstant.SKIPPED_ALL_PRIORITIES); return CSAssignment.SKIP_ASSIGNMENT; } else { - ContainerAllocation result = - allocate(clusterResource, node, schedulingMode, resourceLimits, - reservedContainer.getReservedSchedulerKey(), reservedContainer); + ContainerAllocation result = allocate(clusterResource, placementSet, schedulingMode, resourceLimits, + reservedContainer.getReservedSchedulerKey(), reservedContainer); return getCSAssignmentFromAllocateResult(clusterResource, result, reservedContainer, node); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ContainerAllocationContext.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ContainerAllocationContext.java new file mode 100644 index 0000000..e144f5b --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ContainerAllocationContext.java @@ -0,0 +1,113 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer; + +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; + +import java.util.Collections; +import java.util.List; + +/** + * Contexts to allocate a new container + */ +public class ContainerAllocationContext { + // Container we allocated or reserved + private SchedulerContainer allocatedOrReservedContainer; + + // 
Containers we need to release before allocating or reserving the + // new container + private List> toRelease; + + // When trying to allocate from a reserved container, set this, and this will + // not be included by toRelease list + private SchedulerContainer allocateFromReservedContainer; + + private boolean isIncreasedAllocation; + + private NodeType nodeType; + + private SchedulingMode schedulingMode; + + private Resource allocatedResource; // newly allocated resource + + public ContainerAllocationContext( + SchedulerContainer allocatedOrReservedContainer, + List> toRelease, + SchedulerContainer allocateFromReservedContainer, + boolean isIncreasedAllocation, NodeType nodeType, + SchedulingMode schedulingMode, Resource allocatedResource) { + this.allocatedOrReservedContainer = allocatedOrReservedContainer; + if (null != toRelease) { + this.toRelease = toRelease; + } else { + this.toRelease = Collections.emptyList(); + } + this.allocateFromReservedContainer = allocateFromReservedContainer; + this.isIncreasedAllocation = isIncreasedAllocation; + this.nodeType = nodeType; + this.schedulingMode = schedulingMode; + this.allocatedResource = allocatedResource; + } + + public SchedulingMode getSchedulingMode() { + return schedulingMode; + } + + public void setSchedulingMode(SchedulingMode schedulingMode) { + this.schedulingMode = schedulingMode; + } + + public Resource getAllocatedOrReservedResource() { + return allocatedResource; + } + + + public void setAllocatedResource(Resource allocatedResource) { + this.allocatedResource = allocatedResource; + } + + + public NodeType getNodeType() { + return nodeType; + } + + public void setNodeType(NodeType nodeType) { + this.nodeType = nodeType; + } + + public boolean isIncreasedAllocation() { + return isIncreasedAllocation; + } + + public void setIncreasedAllocation(boolean increasedAllocation) { + isIncreasedAllocation = increasedAllocation; + } + + public SchedulerContainer getAllocateFromReservedContainer() { + return allocateFromReservedContainer; + } + + public void setAllocateFromReservedContainer( + SchedulerContainer allocateFromReservedContainer) { + this.allocateFromReservedContainer = allocateFromReservedContainer; + } + + public SchedulerContainer getAllocatedOrReservedContainer() { + return allocatedOrReservedContainer; + } + + public void setAllocatedOrReservedContainer( + SchedulerContainer allocatedOrReservedContainer) { + this.allocatedOrReservedContainer = allocatedOrReservedContainer; + } + + public List> getToRelease() { + return toRelease; + } + + public void setToRelease(List> toRelease) { + this.toRelease = toRelease; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitRequest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitRequest.java new file mode 100644 index 0000000..b6939b0 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitRequest.java @@ -0,0 +1,144 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer; + +import com.google.common.collect.Lists; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitRequest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitRequest.java new file mode 100644 index 0000000..b6939b0 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitRequest.java @@ -0,0 +1,144 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer; + +import com.google.common.collect.Lists; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; +import org.apache.hadoop.yarn.util.resource.Resources; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ResourceCommitRequest { + // New containers to be allocated + private List> containersToAllocate = + Collections.emptyList(); + + // New containers to be reserved + private List> containersToReserve = + Collections.emptyList(); + + // We don't need these containers anymore + private List> toReleaseContainers = + Collections.emptyList(); + + private Resource totalAllocatedResource; + private Resource totalReservedResource; + private Resource totalReleasedResource; + + public ResourceCommitRequest( + List> containersToAllocate, + List> containersToReserve, + List> toReleaseContainers) { + if (null != containersToAllocate) { + this.containersToAllocate = containersToAllocate; + } + if (null != containersToReserve) { + this.containersToReserve = containersToReserve; + } + if (null != toReleaseContainers) { + this.toReleaseContainers = toReleaseContainers; + } + + totalAllocatedResource = Resources.createResource(0); + totalReservedResource = Resources.createResource(0); + totalReleasedResource = Resources.createResource(0); + + for (ContainerAllocationContext c : this.containersToAllocate) { + Resources.addTo(totalAllocatedResource, + c.getAllocatedOrReservedResource()); + for (SchedulerContainer r : c.getToRelease()) { + Resources.addTo(totalReleasedResource, + r.getRmContainer().getAllocatedOrReservedResource()); + } + } + + for (ContainerAllocationContext c : this.containersToReserve) { + Resources.addTo(totalReservedResource, + c.getAllocatedOrReservedResource()); + for (SchedulerContainer r : c.getToRelease()) { + Resources.addTo(totalReleasedResource, + r.getRmContainer().getAllocatedOrReservedResource()); + } + } + + for (SchedulerContainer r : this.toReleaseContainers) { + Resources.addTo(totalReleasedResource, + r.getRmContainer().getAllocatedOrReservedResource()); + } + } + + public List> getContainersToAllocate() { + return containersToAllocate; + } + + public void setContainersToAllocate( + List> containersToAllocate) { + this.containersToAllocate = containersToAllocate; + } + + public List> getContainersToReserve() { + return containersToReserve; + } + + public void setContainersToReserve( + List> containersToReserve) { + this.containersToReserve = containersToReserve; + } + + public List> getContainersToRelease() { + return toReleaseContainers; + } + + public void setContainersToRelease( + List> toReleaseContainers) { + this.toReleaseContainers = toReleaseContainers; + } + + public Resource getTotalAllocatedResource() { + return totalAllocatedResource; + } + + public void setTotalAllocatedResource(Resource totalAllocatedResource) { + this.totalAllocatedResource = totalAllocatedResource; + } + + public Resource getTotalReservedResource() { + return totalReservedResource; + } + + public void setTotalReservedResource(Resource totalReservedResource) { + this.totalReservedResource = totalReservedResource; + } + + public Resource getTotalReleasedResource() { + return totalReleasedResource; + } + + public void setTotalReleasedResource(Resource totalReleasedResource) { + this.totalReleasedResource = totalReleasedResource; + } + + /* + * Util functions to make your life easier + */ + public boolean anythingAllocatedOrReserved() { + return (!containersToAllocate.isEmpty()) || 
(!containersToReserve + .isEmpty()); + } + + public ContainerAllocationContext getFirstAllocatedOrReservedContainer() { + ContainerAllocationContext c = null; + if (!containersToAllocate.isEmpty()) { + c = containersToAllocate.get(0); + } + if (c == null && !containersToReserve.isEmpty()) { + c = containersToReserve.get(0); + } + + return c; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitResponse.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitResponse.java new file mode 100644 index 0000000..125cf47 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitResponse.java @@ -0,0 +1,32 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer; + +import java.util.List; + +public class ResourceCommitResponse { + private List acceptedAllocations; + private List rejectedAllocations; + + public ResourceCommitResponse(List acceptedAllocations, + List rejectedAllocations) { + this.acceptedAllocations = acceptedAllocations; + this.rejectedAllocations = rejectedAllocations; + } + + public List getAcceptedAllocations() { + return acceptedAllocations; + } + + public void setAcceptedAllocations( + List acceptedAllocations) { + this.acceptedAllocations = acceptedAllocations; + } + + public List getRejectedAllocations() { + return rejectedAllocations; + } + + public void setRejectedAllocations( + List rejectedAllocations) { + this.rejectedAllocations = rejectedAllocations; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitterHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitterHandler.java new file mode 100644 index 0000000..0735bcf --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/committer/ResourceCommitterHandler.java @@ -0,0 +1,72 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.event.EventHandler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; + +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; + +public class ResourceCommitterHandler + extends AbstractService { + private CapacityScheduler cs; + private Thread queueThread; + private final BlockingQueue> blockingQueue = + new LinkedBlockingQueue<>(); + private boolean asynchronizedHandler = false; + + private class HandlerRunnable implements Runnable { + + @Override + public void run() { + while (true) { + ResourceCommitRequest request = null; + try { + request = 
blockingQueue.take(); + cs.processResourceCommitRequest(request); + } catch (InterruptedException e) { + // Restore the interrupt status and exit the loop so that + // serviceStop() can actually shut this thread down + Thread.currentThread().interrupt(); + break; + } + } + } + } + + // TODO: should not hard code using CS here + public ResourceCommitterHandler(CapacityScheduler cs) { + super(ResourceCommitterHandler.class.getName()); + this.cs = cs; + } + + @Override + protected void serviceInit(Configuration conf) throws Exception { + super.serviceInit(conf); + } + + @Override + protected void serviceStart() throws Exception { + if (asynchronizedHandler) { + queueThread = new Thread(new HandlerRunnable()); + queueThread.start(); + } + super.serviceStart(); + } + + @Override + protected void serviceStop() throws Exception { + if (asynchronizedHandler) { + queueThread.interrupt(); + } + super.serviceStop(); + } + + public void handle(ResourceCommitRequest request) { + if (asynchronizedHandler) { + blockingQueue.offer(request); + } else { + cs.processResourceCommitRequest(request); + } + } +}
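The handler follows the standard AbstractService lifecycle; a sketch of the expected call sequence from the scheduler side (hypothetical caller code; cs, conf and request are assumed to be in scope). With asynchronizedHandler left at its default of false, handle() commits on the caller's thread; when enabled, requests are queued and drained by the handler thread:

ResourceCommitterHandler handler = new ResourceCommitterHandler(cs);
handler.init(conf);
handler.start();
handler.handle(request); // synchronous by default
// later, during shutdown:
handler.stop();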
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/AssignmentInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/AssignmentInformation.java index aad3bc7..063a9ae 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/AssignmentInformation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/AssignmentInformation.java @@ -38,11 +38,13 @@ } public static class AssignmentDetails { + public RMContainer rmContainer; public ContainerId containerId; public String queue; - public AssignmentDetails(ContainerId containerId, String queue) { - this.containerId = containerId; + public AssignmentDetails(RMContainer rmContainer, String queue) { + this.containerId = rmContainer.getContainerId(); + this.rmContainer = rmContainer; this.queue = queue; } } @@ -98,17 +100,17 @@ public Resource getReserved() { return operationResources.get(Operation.RESERVATION); } - private void addAssignmentDetails(Operation op, ContainerId containerId, + private void addAssignmentDetails(Operation op, RMContainer rmContainer, String queue) { - operationDetails.get(op).add(new AssignmentDetails(containerId, queue)); + operationDetails.get(op).add(new AssignmentDetails(rmContainer, queue)); } - public void addAllocationDetails(ContainerId containerId, String queue) { - addAssignmentDetails(Operation.ALLOCATION, containerId, queue); + public void addAllocationDetails(RMContainer rmContainer, String queue) { + addAssignmentDetails(Operation.ALLOCATION, rmContainer, queue); } - public void addReservationDetails(ContainerId containerId, String queue) { - addAssignmentDetails(Operation.RESERVATION, containerId, queue); + public void addReservationDetails(RMContainer rmContainer, String queue) { + addAssignmentDetails(Operation.RESERVATION, rmContainer, queue); } public List getAllocationDetails() { @@ -119,23 +121,31 @@ public void addReservationDetails(ContainerId containerId, String queue) { return operationDetails.get(Operation.RESERVATION); } - private ContainerId getFirstContainerIdFromOperation(Operation op) { + private RMContainer getFirstRMContainerFromOperation(Operation op) { if (null != operationDetails.get(op)) { List assignDetails = operationDetails.get(op); if (!assignDetails.isEmpty()) { - return assignDetails.get(0).containerId; + return assignDetails.get(0).rmContainer; } } return null; } + public RMContainer getFirstAllocatedOrReservedRMContainer() { + RMContainer rmContainer; + rmContainer = getFirstRMContainerFromOperation(Operation.ALLOCATION); + if (null != rmContainer) { + return rmContainer; + } + return getFirstRMContainerFromOperation(Operation.RESERVATION); + } + public ContainerId getFirstAllocatedOrReservedContainerId() { - ContainerId containerId; - containerId = getFirstContainerIdFromOperation(Operation.ALLOCATION); - if (null != containerId) { - return containerId; + RMContainer rmContainer = getFirstAllocatedOrReservedRMContainer(); + if (null != rmContainer) { + return rmContainer.getContainerId(); } - return getFirstContainerIdFromOperation(Operation.RESERVATION); + return null; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/PlacementSet.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/PlacementSet.java new file mode 100644 index 0000000..3ad7c4d --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/PlacementSet.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common; + +import org.apache.hadoop.yarn.api.records.NodeId; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +public class PlacementSet { + private N nextAvailable; + private Map allSchedulableNodes; + private String partition; + + public PlacementSet(N nextAvailable, Map allSchedulable, + String partition) { + this.nextAvailable = nextAvailable; + this.allSchedulableNodes = allSchedulable; + this.partition = partition; + } + + /* + * "I don't care, just give me next node to allocate" + */ + public N getNextAvailable() { + return nextAvailable; + } + + /* + * "I'm picky, give me all you have and I will decide" + */ + public Map getAllSchedulableNodes() { + return allSchedulableNodes; + } + + public String getPartition() { + return partition; + } +}
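A sketch of the two access patterns (hypothetical caller code; node is a FiCaSchedulerNode assumed to be in scope; the current heartbeat-driven path would build a single-node set):

PlacementSet<FiCaSchedulerNode> ps = new PlacementSet<>(node,
    Collections.singletonMap(node.getNodeID(), node), node.getPartition());
FiCaSchedulerNode next = ps.getNextAvailable();                   // "don't care" path
Map<NodeId, FiCaSchedulerNode> all = ps.getAllSchedulableNodes(); // "picky" path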
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/SchedulerContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/SchedulerContainer.java new file mode 100644 index 0000000..420a6b7 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/SchedulerContainer.java @@ -0,0 +1,84 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common; + +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerRequestKey; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode; + +/** + * Context for a container inside the scheduler + */ +public class SchedulerContainer { + private RMContainer rmContainer; + private String nodePartition; + private A schedulerApplicationAttempt; + private N schedulerNode; + private boolean allocated; // Allocated (True) or reserved (False) + + public SchedulerContainer(A app, N node, RMContainer rmContainer, + String nodePartition, boolean allocated) { + this.schedulerApplicationAttempt = app; + this.schedulerNode = node; + this.rmContainer = rmContainer; + this.nodePartition = nodePartition; + + RMContainerState nowState = rmContainer.getState(); + if (nowState == RMContainerState.NEW) { + this.allocated = allocated; + } else { + this.allocated = nowState != RMContainerState.RESERVED; + } + } + + public String getNodePartition() { + return nodePartition; + } + + public void setNodePartition(String nodePartition) { + this.nodePartition = nodePartition; + } + + public RMContainer getRmContainer() { + return rmContainer; + } + + public void setRmContainer(RMContainer rmContainer) { + this.rmContainer = rmContainer; + } + + public A getSchedulerApplicationAttempt() { + return schedulerApplicationAttempt; + } + + public void setSchedulerApplicationAttempt(A schedulerApplicationAttempt) { + this.schedulerApplicationAttempt = schedulerApplicationAttempt; + } + + public N getSchedulerNode() { + return schedulerNode; + } + + public void setSchedulerNode(N schedulerNode) { + this.schedulerNode = schedulerNode; + } + + public boolean isAllocated() { + return allocated; + } + + public void setAllocated(boolean allocated) { + this.allocated = allocated; + } + + public SchedulerRequestKey getSchedulerRequestKey() { + if (rmContainer.getState() == RMContainerState.RESERVED) { + return rmContainer.getReservedSchedulerKey(); + } + return rmContainer.getAllocatedSchedulerKey(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java index 33dee80..8bf958d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java @@ -18,12 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - +import com.google.common.annotations.VisibleForTesting; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; @@ -55,6 +50,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerRequestKey; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesManager; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAMContainerLaunchDiagnosticsConstants; @@ -67,11 +63,20 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.allocator.AbstractContainerAllocator; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.allocator.ContainerAllocator; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ContainerAllocationContext; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.committer.ResourceCommitRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; +import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; -import com.google.common.annotations.VisibleForTesting; +import java.util.Collections; 
+import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; /** * Represents an application attempt from the viewpoint of the FIFO or Capacity @@ -85,7 +90,7 @@ private final Set containersToPreempt = new HashSet(); - private CapacityHeadroomProvider headroomProvider; + private volatile CapacityHeadroomProvider headroomProvider; private ResourceCalculator rc = new DefaultResourceCalculator(); @@ -97,7 +102,7 @@ * to hold the message if its app doesn't not get container from a node */ private String appSkipNodeDiagnostics; - private CapacitySchedulerContext capacitySchedulerContext; + private volatile CapacitySchedulerContext capacitySchedulerContext; public FiCaSchedulerApp(ApplicationAttemptId applicationAttemptId, String user, Queue queue, ActiveUsersManager activeUsersManager, @@ -157,112 +162,349 @@ public FiCaSchedulerApp(ApplicationAttemptId applicationAttemptId, } } - public synchronized boolean containerCompleted(RMContainer rmContainer, + public boolean containerCompleted(RMContainer rmContainer, ContainerStatus containerStatus, RMContainerEventType event, String partition) { - ContainerId containerId = rmContainer.getContainerId(); + try { + writeLock.lock(); + ContainerId containerId = rmContainer.getContainerId(); - // Remove from the list of containers - if (null == liveContainers.remove(containerId)) { - return false; - } + // Remove from the list of containers + if (null == liveContainers.remove(containerId)) { + return false; + } - // Remove from the list of newly allocated containers if found - newlyAllocatedContainers.remove(rmContainer); + // Remove from the list of newly allocated containers if found + newlyAllocatedContainers.remove(rmContainer); - // Inform the container - rmContainer.handle( - new RMContainerFinishedEvent(containerId, containerStatus, event)); + // Inform the container + rmContainer.handle( + new RMContainerFinishedEvent(containerId, containerStatus, event)); - containersToPreempt.remove(containerId); + containersToPreempt.remove(containerId); - Resource containerResource = rmContainer.getContainer().getResource(); - RMAuditLogger.logSuccess(getUser(), - AuditConstants.RELEASE_CONTAINER, "SchedulerApp", - getApplicationId(), containerId, containerResource); - - // Update usage metrics - queue.getMetrics().releaseResources(getUser(), 1, containerResource); - attemptResourceUsage.decUsed(partition, containerResource); + Resource containerResource = rmContainer.getContainer().getResource(); + RMAuditLogger.logSuccess(getUser(), AuditConstants.RELEASE_CONTAINER, + "SchedulerApp", getApplicationId(), containerId, containerResource); + + // Update usage metrics + queue.getMetrics().releaseResources(getUser(), 1, containerResource); + attemptResourceUsage.decUsed(partition, containerResource); - // Clear resource utilization metrics cache. - lastMemoryAggregateAllocationUpdateTime = -1; + // Clear resource utilization metrics cache. 
+ lastResAllocationUpdateTime = -1; - return true; + return true; + } finally { + writeLock.unlock(); + } } - public synchronized RMContainer allocate(NodeType type, FiCaSchedulerNode node, + public RMContainer allocate(NodeType type, FiCaSchedulerNode node, SchedulerRequestKey schedulerKey, ResourceRequest request, Container container) { + try { + readLock.lock(); + if (isStopped) { + return null; + } - if (isStopped) { - return null; + // Required sanity check - AM can call 'allocate' to update resource + // request without locking the scheduler, hence we need to check + if (getTotalRequiredResources(schedulerKey) <= 0) { + return null; + } + + // Create RMContainer + RMContainer rmContainer = new RMContainerImpl(container, this.getApplicationAttemptId(), + node.getNodeID(), appSchedulingInfo.getUser(), this.rmContext, request.getNodeLabelExpression()); + ((RMContainerImpl) rmContainer).setQueueName(this.getQueueName()); + + updateAMContainerDiagnostics(AMState.ASSIGNED, null); + + // Add it to allContainers list. + /* + newlyAllocatedContainers.add(rmContainer); + + ContainerId containerId = container.getId(); + liveContainers.put(containerId, rmContainer); + */ + + // Update consumption and track allocations + /* + List resourceRequestList = appSchedulingInfo.allocate( + type, node, schedulerKey, request, container); + */ + + /* + attemptResourceUsage.incUsed(node.getPartition(), container.getResource()); + */ + + // Update resource requests related to "request" and store in RMContainer + + // Inform the container + /* + rmContainer.handle(new RMContainerEvent(containerId, RMContainerEventType.START)); + + + if (LOG.isDebugEnabled()) { + LOG.debug("allocate: applicationAttemptId=" + containerId.getApplicationAttemptId() + + " container=" + containerId + " host=" + container.getNodeId().getHost() + " type=" + type); + } + RMAuditLogger.logSuccess(getUser(), AuditConstants.ALLOC_CONTAINER, + "SchedulerApp", getApplicationId(), containerId, container.getResource()); + */ + + return rmContainer; + } finally { + readLock.unlock(); } - - // Required sanity check - AM can call 'allocate' to update resource - // request without locking the scheduler, hence we need to check - if (getTotalRequiredResources(schedulerKey) <= 0) { - return null; + } + + private boolean rmContainerInFinalState(RMContainer rmContainer) { + if (null == rmContainer) { + return false; } - // Create RMContainer - RMContainer rmContainer = - new RMContainerImpl(container, this.getApplicationAttemptId(), - node.getNodeID(), appSchedulingInfo.getUser(), this.rmContext, - request.getNodeLabelExpression()); - ((RMContainerImpl)rmContainer).setQueueName(this.getQueueName()); + return rmContainer.getFinishedStatus() != null; + } - updateAMContainerDiagnostics(AMState.ASSIGNED, null); + private boolean anyContainerInFinalState( + ResourceCommitRequest request) { + for (SchedulerContainer c : request + .getContainersToRelease()) { + if (rmContainerInFinalState(c.getRmContainer())) { + return true; + } + } - // Add it to allContainers list. 
- newlyAllocatedContainers.add(rmContainer); + for (ContainerAllocationContext c : request + .getContainersToAllocate()) { + for (SchedulerContainer r : c + .getToRelease()) { + if (rmContainerInFinalState(r.getRmContainer())) { + return true; + } + } - ContainerId containerId = container.getId(); - liveContainers.put(containerId, rmContainer); + if (null != c.getAllocateFromReservedContainer()) { + if (rmContainerInFinalState( + c.getAllocateFromReservedContainer().getRmContainer())) { + return true; + } + } + } - // Update consumption and track allocations - List resourceRequestList = appSchedulingInfo.allocate( - type, node, schedulerKey, request, container); + for (ContainerAllocationContext c : request + .getContainersToReserve()) { + for (SchedulerContainer r : c + .getToRelease()) { + if (rmContainerInFinalState(r.getRmContainer())) { + return true; + } + } + } - attemptResourceUsage.incUsed(node.getPartition(), container.getResource()); + return false; + } - // Update resource requests related to "request" and store in RMContainer - ((RMContainerImpl)rmContainer).setResourceRequests(resourceRequestList); + public boolean acceptResourceCommitRequest(Resource cluster, + ResourceCommitRequest request) { + boolean accepted = true; + List resourceRequests = null; - // Inform the container - rmContainer.handle( - new RMContainerEvent(containerId, RMContainerEventType.START)); + try { + readLock.lock(); - if (LOG.isDebugEnabled()) { - LOG.debug("allocate: applicationAttemptId=" - + containerId.getApplicationAttemptId() - + " container=" + containerId + " host=" - + container.getNodeId().getHost() + " type=" + type); + // First make sure no container in the release list is in a final state + if (anyContainerInFinalState(request)) { + return false; + } + + // TODO, make sure all scheduler nodes still exist + // TODO, make sure all node labels are not changed + + if (request.anythingAllocatedOrReserved()) { + /* + * 1) If this is a newly allocated container, check if the node is reserved + * / not-reserved by any other application + * 2) If this is a newly reserved container, check if the node is reserved or not + */ + // Assume we have only one container allocated or reserved + ContainerAllocationContext + allocation = request.getFirstAllocatedOrReservedContainer(); + SchedulerContainer + schedulerContainer = allocation.getAllocatedOrReservedContainer(); + + if (schedulerContainer.isAllocated()) { + // When allocating a new container + resourceRequests = + schedulerContainer.getRmContainer().getResourceRequests(); + + // Check pending resource request + if (!appSchedulingInfo.checkAllocation(allocation.getNodeType(), + schedulerContainer.getSchedulerNode(), + schedulerContainer.getSchedulerRequestKey())) { + return false; + } + + // Make sure node is not reserved by anyone else + RMContainer reservedContainerOnNode = + schedulerContainer.getSchedulerNode().getReservedContainer(); + if (reservedContainerOnNode != null) { + RMContainer fromReservedContainer = null; + // Check the same accessor that is dereferenced below, + // otherwise this can NPE + if (allocation.getAllocateFromReservedContainer() != null) { + fromReservedContainer = + allocation.getAllocateFromReservedContainer() + .getRmContainer(); + } + + if (fromReservedContainer != reservedContainerOnNode) { + accepted = false; + } + } + + // Do we have enough space on this node? 
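+ // Available space is the node's unallocated resource plus, when + // allocating from a reserved container, the resource that the + // reservation already holds on this node.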
+ Resource availableResource = Resources.clone( + schedulerContainer.getSchedulerNode().getUnallocatedResource()); + if (allocation.getAllocateFromReservedContainer() != null) { + Resources.addTo(availableResource, + allocation.getAllocateFromReservedContainer() + .getRmContainer().getReservedResource()); + } + if (!Resources.fitsIn(rc, cluster, + allocation.getAllocatedOrReservedResource(), + availableResource)) { + accepted = false; + } + } else { + // When reserving a new container, + // just check that the node is not already reserved by someone else + if (schedulerContainer.getSchedulerNode().getReservedContainer() + != null) { + accepted = false; + } + } + } + } finally { + readLock.unlock(); } - RMAuditLogger.logSuccess(getUser(), - AuditConstants.ALLOC_CONTAINER, "SchedulerApp", - getApplicationId(), containerId, container.getResource()); - - return rmContainer; + + if (accepted) { + // Check parent + accepted = getCSLeafQueue().acceptCSAssignment(cluster, request); + } + + // When rejected, recover resource requests for this app + if (!accepted && resourceRequests != null) { + recoverResourceRequestsForContainer(resourceRequests); + } + + return accepted; + } + + public void applyResourceCommitRequest(Resource cluster, + ResourceCommitRequest request) { + try { + writeLock.lock(); + + // If we allocated something + if (request.anythingAllocatedOrReserved()) { + ContainerAllocationContext + allocation = request.getFirstAllocatedOrReservedContainer(); + SchedulerContainer + schedulerContainer = allocation.getAllocatedOrReservedContainer(); + + // This allocation is from a reserved container + if (allocation.getAllocateFromReservedContainer() != null) { + RMContainer reservedContainer = + allocation.getAllocateFromReservedContainer().getRmContainer(); + // Handling container allocation + // Did we previously reserve containers at this 'priority'? 
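+ // Yes - unreserve it first so the node's and queue's reserved-resource + // accounting is released before the real allocation is recorded below.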
unreserve(schedulerContainer.getSchedulerRequestKey(), + schedulerContainer.getSchedulerNode(), reservedContainer); + } + + RMContainer rmContainer = schedulerContainer.getRmContainer(); + rmContainer.getContainer().setId(BuilderUtils + .newContainerId(getApplicationAttemptId(), getNewContainerId())); + ContainerId containerId = + rmContainer.getContainerId(); + + if (schedulerContainer.isAllocated()) { + // Allocate a new container + newlyAllocatedContainers.add(rmContainer); + liveContainers.put(containerId, rmContainer); + + // Deduct pending resource requests + List requests = appSchedulingInfo.allocate( + allocation.getNodeType(), + schedulerContainer.getSchedulerNode(), + schedulerContainer.getSchedulerRequestKey(), + schedulerContainer.getRmContainer().getContainer()); + ((RMContainerImpl)rmContainer).setResourceRequests(requests); + + attemptResourceUsage.incUsed( + schedulerContainer.getNodePartition(), + allocation.getAllocatedOrReservedResource()); + + rmContainer.handle( + new RMContainerEvent(containerId, RMContainerEventType.START)); + + // Inform the node + schedulerContainer.getSchedulerNode().allocateContainer(rmContainer); + + // update locality statistics, TODO fix this to update request locality map + /* + incNumAllocatedContainers(schedulerContainer.getNodeType(), + allocationResult.requestNodeType); + */ + + if (LOG.isDebugEnabled()) { + LOG.debug("allocate: applicationAttemptId=" + containerId + .getApplicationAttemptId() + " container=" + containerId + + " host=" + rmContainer.getAllocatedNode().getHost() + " type=" + + allocation.getNodeType()); + } + RMAuditLogger.logSuccess(getUser(), AuditConstants.ALLOC_CONTAINER, + "SchedulerApp", getApplicationId(), containerId, + allocation.getAllocatedOrReservedResource()); + } else { + reserve(schedulerContainer.getSchedulerRequestKey(), + schedulerContainer.getSchedulerNode(), + schedulerContainer.getRmContainer(), + schedulerContainer.getRmContainer().getContainer()); + } + } + } finally { + writeLock.unlock(); + } + + getCSLeafQueue().applyResourceCommitRequest(cluster, request); } public synchronized boolean unreserve(SchedulerRequestKey schedulerKey, FiCaSchedulerNode node, RMContainer rmContainer) { - // Cancel increase request (if it has reserved increase request - rmContainer.cancelIncreaseReservation(); - - // Done with the reservation? - if (internalUnreserve(node, schedulerKey)) { - node.unreserveResource(this); - - // Update reserved metrics - queue.getMetrics().unreserveResource(getUser(), - rmContainer.getReservedResource()); - queue.decReservedResource(node.getPartition(), - rmContainer.getReservedResource()); - return true; + try { + writeLock.lock(); + // Cancel the increase request (if it has a reserved increase request) + rmContainer.cancelIncreaseReservation(); + + // Done with the reservation? + if (internalUnreserve(node, schedulerKey)) { + node.unreserveResource(this); + + // Update reserved metrics + queue.getMetrics().unreserveResource(getUser(), + rmContainer.getReservedResource()); + queue.decReservedResource(node.getPartition(), + rmContainer.getReservedResource()); + return true; + } + return false; + } finally { + writeLock.unlock(); } - return false; } private boolean internalUnreserve(FiCaSchedulerNode node, @@ -301,33 +543,15 @@ private boolean internalUnreserve(FiCaSchedulerNode node, return false; } - public synchronized float getLocalityWaitFactor( - SchedulerRequestKey schedulerKey, int clusterNodes) { - // Estimate: Required unique resources (i.e. 
hosts + racks) - int requiredResources = - Math.max(this.getResourceRequests(schedulerKey).size() - 1, 0); - - // waitFactor can't be more than '1' - // i.e. no point skipping more than clustersize opportunities - return Math.min(((float)requiredResources / clusterNodes), 1.0f); - } - - public synchronized Resource getTotalPendingRequests() { - Resource ret = Resource.newInstance(0, 0); - for (ResourceRequest rr : appSchedulingInfo.getAllResourceRequests()) { - // to avoid double counting we count only "ANY" resource requests - if (ResourceRequest.isAnyLocation(rr.getResourceName())){ - Resources.addTo(ret, - Resources.multiply(rr.getCapability(), rr.getNumContainers())); + public void markContainerForPreemption(ContainerId cont) { + try { + writeLock.lock(); + // ignore already completed containers + if (liveContainers.containsKey(cont)) { + containersToPreempt.add(cont); } - } - return ret; - } - - public synchronized void markContainerForPreemption(ContainerId cont) { - // ignore already completed containers - if (liveContainers.containsKey(cont)) { - containersToPreempt.add(cont); + } finally { + writeLock.unlock(); } } @@ -341,128 +565,153 @@ public synchronized void markContainerForPreemption(ContainerId cont) { * @param minimumAllocation * @return an allocation */ - public synchronized Allocation getAllocation(ResourceCalculator rc, + public Allocation getAllocation(ResourceCalculator rc, Resource clusterResource, Resource minimumAllocation) { - - Set currentContPreemption = Collections.unmodifiableSet( - new HashSet(containersToPreempt)); - containersToPreempt.clear(); - Resource tot = Resource.newInstance(0, 0); - for(ContainerId c : currentContPreemption){ - Resources.addTo(tot, - liveContainers.get(c).getContainer().getResource()); + try { + writeLock.lock(); + + Set currentContPreemption = Collections.unmodifiableSet(new HashSet(containersToPreempt)); + containersToPreempt.clear(); + Resource tot = Resource.newInstance(0, 0); + for (ContainerId c : currentContPreemption) { + Resources.addTo(tot, liveContainers.get(c).getContainer().getResource()); + } + int numCont = (int) Math.ceil( + Resources.divide(rc, clusterResource, tot, minimumAllocation)); + ResourceRequest rr = ResourceRequest.newInstance(Priority.UNDEFINED, + ResourceRequest.ANY, minimumAllocation, numCont); + List newlyAllocatedContainers = pullNewlyAllocatedContainers(); + List newlyIncreasedContainers = pullNewlyIncreasedContainers(); + List newlyDecreasedContainers = pullNewlyDecreasedContainers(); + List updatedNMTokens = pullUpdatedNMTokens(); + Resource headroom = getHeadroom(); + setApplicationHeadroomForMetrics(headroom); + return new Allocation(newlyAllocatedContainers, headroom, null, + currentContPreemption, Collections.singletonList(rr), updatedNMTokens, + newlyIncreasedContainers, newlyDecreasedContainers); + } finally { + writeLock.unlock(); } - int numCont = (int) Math.ceil( - Resources.divide(rc, clusterResource, tot, minimumAllocation)); - ResourceRequest rr = ResourceRequest.newInstance( - Priority.UNDEFINED, ResourceRequest.ANY, - minimumAllocation, numCont); - List newlyAllocatedContainers = pullNewlyAllocatedContainers(); - List newlyIncreasedContainers = pullNewlyIncreasedContainers(); - List newlyDecreasedContainers = pullNewlyDecreasedContainers(); - List updatedNMTokens = pullUpdatedNMTokens(); - Resource headroom = getHeadroom(); - setApplicationHeadroomForMetrics(headroom); - return new Allocation(newlyAllocatedContainers, headroom, null, - currentContPreemption, 
Collections.singletonList(rr), updatedNMTokens, - newlyIncreasedContainers, newlyDecreasedContainers); } - synchronized public NodeId getNodeIdToUnreserve( + public NodeId getNodeIdToUnreserve( SchedulerRequestKey schedulerKey, Resource resourceNeedUnreserve, ResourceCalculator rc, Resource clusterResource) { + try { + readLock.lock(); + // first go around make this algorithm simple and just grab first + // reservation that has enough resources + Map reservedContainers = this.reservedContainers.get( + schedulerKey); + + if ((reservedContainers != null) && (!reservedContainers.isEmpty())) { + for (Map.Entry entry : reservedContainers.entrySet()) { + NodeId nodeId = entry.getKey(); + RMContainer reservedContainer = entry.getValue(); + if (reservedContainer.hasIncreaseReservation()) { + // Currently, only regular container allocation supports continuous + // reservation looking, we don't support canceling increase request + // reservation when allocating regular container. + continue; + } - // first go around make this algorithm simple and just grab first - // reservation that has enough resources - Map reservedContainers = this.reservedContainers - .get(schedulerKey); - - if ((reservedContainers != null) && (!reservedContainers.isEmpty())) { - for (Map.Entry entry : reservedContainers.entrySet()) { - NodeId nodeId = entry.getKey(); - RMContainer reservedContainer = entry.getValue(); - if (reservedContainer.hasIncreaseReservation()) { - // Currently, only regular container allocation supports continuous - // reservation looking, we don't support canceling increase request - // reservation when allocating regular container. - continue; - } - - Resource reservedResource = reservedContainer.getReservedResource(); - - // make sure we unreserve one with at least the same amount of - // resources, otherwise could affect capacity limits - if (Resources.fitsIn(rc, clusterResource, resourceNeedUnreserve, - reservedResource)) { - if (LOG.isDebugEnabled()) { - LOG.debug("unreserving node with reservation size: " - + reservedResource - + " in order to allocate container with size: " + resourceNeedUnreserve); + Resource reservedResource = reservedContainer.getReservedResource(); + + // make sure we unreserve one with at least the same amount of + // resources, otherwise could affect capacity limits + if (Resources.fitsIn(rc, clusterResource, resourceNeedUnreserve, + reservedResource)) { + if (LOG.isDebugEnabled()) { + LOG.debug("unreserving node with reservation size: " + reservedResource + + " in order to allocate container with size: " + resourceNeedUnreserve); + } + return nodeId; } - return nodeId; } } + return null; + } finally { + readLock.unlock(); } - return null; } - public synchronized void setHeadroomProvider( + public void setHeadroomProvider( CapacityHeadroomProvider headroomProvider) { this.headroomProvider = headroomProvider; } - public synchronized CapacityHeadroomProvider getHeadroomProvider() { + public CapacityHeadroomProvider getHeadroomProvider() { return headroomProvider; } @Override - public synchronized Resource getHeadroom() { - if (headroomProvider != null) { - return headroomProvider.getHeadroom(); + public Resource getHeadroom() { + try { + readLock.lock(); + if (headroomProvider != null) { + return headroomProvider.getHeadroom(); + } + return super.getHeadroom(); + } finally { + readLock.unlock(); } - return super.getHeadroom(); } @Override - public synchronized void transferStateFromPreviousAttempt( + public void transferStateFromPreviousAttempt( SchedulerApplicationAttempt 
appAttempt) { - super.transferStateFromPreviousAttempt(appAttempt); - this.headroomProvider = - ((FiCaSchedulerApp) appAttempt).getHeadroomProvider(); + try { + writeLock.lock(); + super.transferStateFromPreviousAttempt(appAttempt); + this.headroomProvider = + ((FiCaSchedulerApp) appAttempt).getHeadroomProvider(); + } + finally { + writeLock.unlock(); + } } public boolean reserveIncreasedContainer(SchedulerRequestKey schedulerKey, FiCaSchedulerNode node, RMContainer rmContainer, Resource reservedResource) { - // Inform the application - if (super.reserveIncreasedContainer(node, schedulerKey, rmContainer, - reservedResource)) { + try { + writeLock.lock(); + // Inform the application + if (super.reserveIncreasedContainer(node, schedulerKey, rmContainer, + reservedResource)) { - queue.getMetrics().reserveResource(getUser(), reservedResource); + queue.getMetrics().reserveResource(getUser(), reservedResource); - // Update the node - node.reserveResource(this, schedulerKey, rmContainer); - - // Succeeded - return true; + // Update the node + node.reserveResource(this, schedulerKey, rmContainer); + + // Succeeded + return true; + } + + return false; + } finally { + writeLock.unlock(); } - - return false; } public void reserve(SchedulerRequestKey schedulerKey, FiCaSchedulerNode node, RMContainer rmContainer, Container container) { - // Update reserved metrics if this is the first reservation - if (rmContainer == null) { - queue.getMetrics().reserveResource( - getUser(), container.getResource()); - } + try { + writeLock.lock(); + // Update reserved metrics if this is the first reservation + if (rmContainer == null) { + queue.getMetrics().reserveResource(getUser(), container.getResource()); + } - // Inform the application - rmContainer = super.reserve(node, schedulerKey, rmContainer, container); + // Inform the application + rmContainer = super.reserve(node, schedulerKey, rmContainer, container); - // Update the node - node.reserveResource(this, schedulerKey, rmContainer); + // Update the node + node.reserveResource(this, schedulerKey, rmContainer); + } finally { + writeLock.unlock(); + } } @VisibleForTesting @@ -505,7 +754,7 @@ public LeafQueue getCSLeafQueue() { } public CSAssignment assignContainers(Resource clusterResource, - FiCaSchedulerNode node, ResourceLimits currentResourceLimits, + PlacementSet placementSet, ResourceLimits currentResourceLimits, SchedulingMode schedulingMode, RMContainer reservedContainer) { if (LOG.isDebugEnabled()) { LOG.debug("pre-assignContainers for application " @@ -513,27 +762,35 @@ public CSAssignment assignContainers(Resource clusterResource, showRequests(); } - synchronized (this) { - return containerAllocator.assignContainers(clusterResource, node, + try { + readLock.lock(); + return containerAllocator.assignContainers(clusterResource, placementSet, schedulingMode, currentResourceLimits, reservedContainer); + } finally { + readLock.unlock(); } } public void nodePartitionUpdated(RMContainer rmContainer, String oldPartition, String newPartition) { - Resource containerResource = rmContainer.getAllocatedResource(); - this.attemptResourceUsage.decUsed(oldPartition, containerResource); - this.attemptResourceUsage.incUsed(newPartition, containerResource); - getCSLeafQueue().decUsedResource(oldPartition, containerResource, this); - getCSLeafQueue().incUsedResource(newPartition, containerResource, this); - - // Update new partition name if container is AM and also update AM resource - if (rmContainer.isAMContainer()) { - setAppAMNodePartitionName(newPartition); - 
this.attemptResourceUsage.decAMUsed(oldPartition, containerResource); - this.attemptResourceUsage.incAMUsed(newPartition, containerResource); - getCSLeafQueue().decAMUsedResource(oldPartition, containerResource, this); - getCSLeafQueue().incAMUsedResource(newPartition, containerResource, this); + try { + writeLock.lock(); + Resource containerResource = rmContainer.getAllocatedResource(); + this.attemptResourceUsage.decUsed(oldPartition, containerResource); + this.attemptResourceUsage.incUsed(newPartition, containerResource); + getCSLeafQueue().decUsedResource(oldPartition, containerResource, this); + getCSLeafQueue().incUsedResource(newPartition, containerResource, this); + + // Update new partition name if container is AM and also update AM resource + if (rmContainer.isAMContainer()) { + setAppAMNodePartitionName(newPartition); + this.attemptResourceUsage.decAMUsed(oldPartition, containerResource); + this.attemptResourceUsage.incAMUsed(newPartition, containerResource); + getCSLeafQueue().decAMUsedResource(oldPartition, containerResource, this); + getCSLeafQueue().incAMUsedResource(newPartition, containerResource, this); + } + } finally { + writeLock.unlock(); } } @@ -597,7 +854,8 @@ public void updateAppSkipNodeDiagnostics(String message) { this.appSkipNodeDiagnostics = message; } - public void updateNodeInfoForAMDiagnostics(FiCaSchedulerNode node) { + public void updateNodeInfoForAMDiagnostics( + PlacementSet candidates) { if (isWaitingForAMContainer()) { StringBuilder diagnosticMessageBldr = new StringBuilder(); if (appSkipNodeDiagnostics != null) { @@ -606,15 +864,26 @@ public void updateNodeInfoForAMDiagnostics(FiCaSchedulerNode node) { } diagnosticMessageBldr.append( CSAMContainerLaunchDiagnosticsConstants.LAST_NODE_PROCESSED_MSG); - diagnosticMessageBldr.append(node.getNodeID()); - diagnosticMessageBldr.append(" ( Partition : "); - diagnosticMessageBldr.append(node.getLabels()); - diagnosticMessageBldr.append(", Total resource : "); - diagnosticMessageBldr.append(node.getTotalResource()); - diagnosticMessageBldr.append(", Available resource : "); - diagnosticMessageBldr.append(node.getUnallocatedResource()); - diagnosticMessageBldr.append(" )."); + + SchedulerNode node = candidates.getNextAvailable(); + + // TODO, fix this when global scheduling enabled. 
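+ // With a multi-node PlacementSet there may be no single next-available + // node, so per-node details are only appended when one exists.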
+ if (null != node) { + diagnosticMessageBldr.append(node.getNodeID()); + diagnosticMessageBldr.append(" ( Partition : "); + diagnosticMessageBldr.append(node.getLabels()); + diagnosticMessageBldr.append(", Total resource : "); + diagnosticMessageBldr.append(node.getTotalResource()); + diagnosticMessageBldr.append(", Available resource : "); + diagnosticMessageBldr.append(node.getUnallocatedResource()); + diagnosticMessageBldr.append(" )."); + } + updateAMContainerDiagnostics(AMState.ACTIVATED, diagnosticMessageBldr.toString()); } } + + public CapacitySchedulerContext getCapacitySchedulerContext() { + return capacitySchedulerContext; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/AbstractSchedulerNodesScorer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/AbstractSchedulerNodesScorer.java new file mode 100644 index 0000000..f8af71f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/AbstractSchedulerNodesScorer.java @@ -0,0 +1,24 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.scorer; + +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerRequestKey; + +import java.util.concurrent.locks.ReentrantReadWriteLock; + +public abstract class AbstractSchedulerNodesScorer + implements SchedulerNodesScorer { + SchedulerApplicationAttempt attempt; + SchedulerRequestKey schedulerKey; + ReentrantReadWriteLock.ReadLock readLock; + ReentrantReadWriteLock.WriteLock writeLock; + + AbstractSchedulerNodesScorer(SchedulerApplicationAttempt attempt, + SchedulerRequestKey schedulerKey) { + this.attempt = attempt; + this.schedulerKey = schedulerKey; + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + readLock = lock.readLock(); + writeLock = lock.writeLock(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/DoNotCareNodesScorer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/DoNotCareNodesScorer.java new file mode 100644 index 0000000..8074814 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/DoNotCareNodesScorer.java @@ -0,0 +1,15 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.scorer; + +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; + +import java.util.Iterator; + +public class DoNotCareNodesScorer + implements SchedulerNodesScorer { + @Override + public Iterator scorePlacementSet( + PlacementSet candidates) { + return candidates.getAllSchedulableNodes().values().iterator(); + } +} diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/LocalityNodesScorer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/LocalityNodesScorer.java new file mode 100644 index 0000000..61777a9 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/LocalityNodesScorer.java @@ -0,0 +1,120 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.scorer; + +import org.apache.commons.collections.IteratorUtils; +import org.apache.hadoop.yarn.api.records.NodeId; +import org.apache.hadoop.yarn.api.records.ResourceRequest; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerRequestKey; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; + +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.concurrent.ConcurrentLinkedQueue; + +public class LocalityNodesScorer + extends AbstractSchedulerNodesScorer { + private long lastInitializedTime = 0; + + private ConcurrentLinkedQueue nodeLocalHosts; + private ConcurrentLinkedQueue rackLocalHosts; + private ConcurrentLinkedQueue offswitchHosts; + + public LocalityNodesScorer(SchedulerApplicationAttempt attempt, + SchedulerRequestKey schedulerKey) { + super(attempt, schedulerKey); + } + + private void reinitializeIfNeeded(PlacementSet candidates) { + // Do not reinitialize in 5000 ms. + // FIXME: this should be configurable and will be forced to update when + // Requirement changes, etc. 
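+ // Staleness is tolerated here: the commit phase re-checks pending + // resource requests before any allocation is actually applied.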
+ if (System.currentTimeMillis() - 5000L < lastInitializedTime) { + return; + } + + lastInitializedTime = System.currentTimeMillis(); + + try { + writeLock.lock(); + if (null == nodeLocalHosts) { + nodeLocalHosts = new ConcurrentLinkedQueue<>(); + rackLocalHosts = new ConcurrentLinkedQueue<>(); + offswitchHosts = new ConcurrentLinkedQueue<>(); + } else { + nodeLocalHosts.clear(); + rackLocalHosts.clear(); + offswitchHosts.clear(); + } + + // We don't need any resource + boolean needResource = attempt.getResourceRequest(schedulerKey, + ResourceRequest.ANY).getNumContainers() > 0; + if (!needResource) { + return; + } + + for (Map.Entry entry : candidates.getAllSchedulableNodes() + .entrySet()) { + NodeId nodeId = entry.getKey(); + N node = entry.getValue(); + String rack = node.getRackName(); + + ResourceRequest rr = attempt.getAppSchedulingInfo().getResourceRequest( + schedulerKey, nodeId.getHost()); + if (rr != null && rr.getNumContainers() > 0) { + nodeLocalHosts.add(node); + } else { + rr = attempt.getAppSchedulingInfo().getResourceRequest(schedulerKey, + rack); + boolean hasRackLocalRequest = rr != null && rr.getNumContainers() > 0; + if (hasRackLocalRequest) { + rackLocalHosts.add(node); + } else { + offswitchHosts.add(node); + } + } + } + } finally { + writeLock.unlock(); + } + } + + private void moveFirstToLast(ConcurrentLinkedQueue queue) { + N n = null; + try { + n = queue.poll(); + } catch (NoSuchElementException e) { + // do nothing; + } + + if (n != null) { + queue.offer(n); + } + } + + @Override + public Iterator scorePlacementSet( + PlacementSet candidates) { + reinitializeIfNeeded(candidates); + + try { + writeLock.lock(); + moveFirstToLast(nodeLocalHosts); + moveFirstToLast(rackLocalHosts); + moveFirstToLast(offswitchHosts); + } finally { + writeLock.unlock(); + } + + try { + readLock.lock(); + return IteratorUtils.chainedIterator( + new Iterator[] { nodeLocalHosts.iterator(), rackLocalHosts.iterator(), + offswitchHosts.iterator() }); + } finally { + readLock.unlock(); + } + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/SchedulerNodesScorer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/SchedulerNodesScorer.java new file mode 100644 index 0000000..8a89e40 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/SchedulerNodesScorer.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.scorer; + +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.PlacementSet; + +import java.util.Iterator; + +public interface SchedulerNodesScorer { + /** + * Score nodes according to the given placement set. + * @param placementSet candidate nodes to order + * @return sorted nodes based on goodness + */ + Iterator scorePlacementSet(PlacementSet placementSet); +}
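A sketch of how a caller is expected to drive the scorers (hypothetical code; attempt, schedulerKey and placementSet are assumed to be in scope):

SchedulerNodesScorer<FiCaSchedulerNode> scorer =
    SchedulerNodesScorerCache.getOrCreateScorer(attempt, schedulerKey);
Iterator<FiCaSchedulerNode> it = scorer.scorePlacementSet(placementSet);
while (it.hasNext()) {
  FiCaSchedulerNode node = it.next();
  // try the allocation on this node; stop at the first one that fits
}

LocalityNodesScorer yields candidates node-local first, then rack-local, then off-switch; DoNotCareNodesScorer simply iterates every schedulable node.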
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/SchedulerNodesScorerType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/SchedulerNodesScorerType.java new file mode 100644 index 0000000..db9c6fc --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/scorer/SchedulerNodesScorerType.java @@ -0,0 +1,6 @@ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.scorer; + +public enum SchedulerNodesScorerType { + ANY, // Any node is fine + LOCALITY, // Locality-based +}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java index 9e5a807..dba254b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java @@ -123,65 +123,70 @@ public QueueMetrics getMetrics() { return queue.getMetrics(); } - synchronized public void containerCompleted(RMContainer rmContainer, + public void containerCompleted(RMContainer rmContainer, ContainerStatus containerStatus, RMContainerEventType event) { - - Container container = rmContainer.getContainer(); - ContainerId containerId = container.getId(); - - // Remove from the list of newly allocated containers if found - newlyAllocatedContainers.remove(rmContainer); - - // Inform the container - rmContainer.handle( - new RMContainerFinishedEvent( - containerId, - containerStatus, - event) - ); - if (LOG.isDebugEnabled()) { - LOG.debug("Completed container: " + rmContainer.getContainerId() + - " in state: " + rmContainer.getState() + " event:" + event); - } + try { + writeLock.lock(); + + Container container = rmContainer.getContainer(); + ContainerId containerId = container.getId(); + + // Remove from the list of newly allocated containers if found + newlyAllocatedContainers.remove(rmContainer); + + // Inform the container + rmContainer.handle( + new RMContainerFinishedEvent(containerId, containerStatus, event)); + if (LOG.isDebugEnabled()) { + LOG.debug("Completed container: " + rmContainer.getContainerId() + + " in state: " + rmContainer.getState() + " event:" + event); + } - // Remove from the list of containers - liveContainers.remove(rmContainer.getContainerId()); + // Remove from the list of containers + liveContainers.remove(rmContainer.getContainerId()); - Resource containerResource = rmContainer.getContainer().getResource(); -
RMAuditLogger.logSuccess(getUser(), - AuditConstants.RELEASE_CONTAINER, "SchedulerApp", - getApplicationId(), containerId, containerResource); - - // Update usage metrics - queue.getMetrics().releaseResources(getUser(), 1, containerResource); - this.attemptResourceUsage.decUsed(containerResource); + Resource containerResource = rmContainer.getContainer().getResource(); + RMAuditLogger.logSuccess(getUser(), AuditConstants.RELEASE_CONTAINER, + "SchedulerApp", getApplicationId(), containerId, containerResource); - // remove from preemption map if it is completed - preemptionMap.remove(rmContainer); + // Update usage metrics + queue.getMetrics().releaseResources(getUser(), 1, containerResource); + this.attemptResourceUsage.decUsed(containerResource); - // Clear resource utilization metrics cache. - lastMemoryAggregateAllocationUpdateTime = -1; + // remove from preemption map if it is completed + preemptionMap.remove(rmContainer); + + // Clear resource utilization metrics cache. + lastResAllocationUpdateTime = -1; + } finally { + writeLock.unlock(); + } } - private synchronized void unreserveInternal( + private void unreserveInternal( SchedulerRequestKey schedulerKey, FSSchedulerNode node) { - Map reservedContainers = - this.reservedContainers.get(schedulerKey); - RMContainer reservedContainer = reservedContainers.remove(node.getNodeID()); - if (reservedContainers.isEmpty()) { - this.reservedContainers.remove(schedulerKey); - } - - // Reset the re-reservation count - resetReReservations(schedulerKey); + try { + writeLock.lock(); + Map reservedContainers = this.reservedContainers.get( + schedulerKey); + RMContainer reservedContainer = reservedContainers.remove(node.getNodeID()); + if (reservedContainers.isEmpty()) { + this.reservedContainers.remove(schedulerKey); + } + + // Reset the re-reservation count + resetReReservations(schedulerKey); - Resource resource = reservedContainer.getContainer().getResource(); - this.attemptResourceUsage.decReserved(resource); + Resource resource = reservedContainer.getContainer().getResource(); + this.attemptResourceUsage.decReserved(resource); - LOG.info("Application " + getApplicationId() + " unreserved " + " on node " - + node + ", currently has " + reservedContainers.size() - + " at priority " + schedulerKey.getPriority() + "; currentReservation " - + this.attemptResourceUsage.getReserved()); + LOG.info( + "Application " + getApplicationId() + " unreserved " + " on node " + node + ", currently has " + reservedContainers.size() + + " at priority " + schedulerKey.getPriority() + "; currentReservation " + + this.attemptResourceUsage.getReserved()); + } finally { + writeLock.unlock(); + } } private void subtractResourcesOnBlacklistedNodes( @@ -239,17 +244,6 @@ public Resource getHeadroom() { return headroom; } - public synchronized float getLocalityWaitFactor( - SchedulerRequestKey schedulerKey, int clusterNodes) { - // Estimate: Required unique resources (i.e. hosts + racks) - int requiredResources = - Math.max(this.getResourceRequests(schedulerKey).size() - 1, 0); - - // waitFactor can't be more than '1' - // i.e. 
no point skipping more than clustersize opportunities - return Math.min(((float)requiredResources / clusterNodes), 1.0f); - } - /** * Return the level at which we are allowed to schedule containers, given the * current size of the cluster and thresholds indicating how many nodes to @@ -261,44 +255,54 @@ public synchronized float getLocalityWaitFactor( * @param rackLocalityThreshold rackLocalityThreshold * @return NodeType */ - public synchronized NodeType getAllowedLocalityLevel( + public NodeType getAllowedLocalityLevel( SchedulerRequestKey schedulerKey, int numNodes, double nodeLocalityThreshold, double rackLocalityThreshold) { - // upper limit on threshold - if (nodeLocalityThreshold > 1.0) { nodeLocalityThreshold = 1.0; } - if (rackLocalityThreshold > 1.0) { rackLocalityThreshold = 1.0; } + try { + writeLock.lock(); + // upper limit on threshold + if (nodeLocalityThreshold > 1.0) { + nodeLocalityThreshold = 1.0; + } + if (rackLocalityThreshold > 1.0) { + rackLocalityThreshold = 1.0; + } - // If delay scheduling is not being used, can schedule anywhere - if (nodeLocalityThreshold < 0.0 || rackLocalityThreshold < 0.0) { - return NodeType.OFF_SWITCH; - } + // If delay scheduling is not being used, can schedule anywhere + if (nodeLocalityThreshold < 0.0 || rackLocalityThreshold < 0.0) { + return NodeType.OFF_SWITCH; + } - // Default level is NODE_LOCAL - if (!allowedLocalityLevel.containsKey(schedulerKey)) { - allowedLocalityLevel.put(schedulerKey, NodeType.NODE_LOCAL); - return NodeType.NODE_LOCAL; - } + // Default level is NODE_LOCAL + if (!allowedLocalityLevel.containsKey(schedulerKey)) { + allowedLocalityLevel.put(schedulerKey, NodeType.NODE_LOCAL); + return NodeType.NODE_LOCAL; + } - NodeType allowed = allowedLocalityLevel.get(schedulerKey); + NodeType allowed = allowedLocalityLevel.get(schedulerKey); - // If level is already most liberal, we're done - if (allowed.equals(NodeType.OFF_SWITCH)) return NodeType.OFF_SWITCH; + // If level is already most liberal, we're done + if (allowed.equals(NodeType.OFF_SWITCH)) + return NodeType.OFF_SWITCH; - double threshold = allowed.equals(NodeType.NODE_LOCAL) ? nodeLocalityThreshold : - rackLocalityThreshold; + double threshold = allowed.equals(NodeType.NODE_LOCAL) ? + nodeLocalityThreshold : + rackLocalityThreshold; - // Relax locality constraints once we've surpassed threshold. - if (getSchedulingOpportunities(schedulerKey) > (numNodes * threshold)) { - if (allowed.equals(NodeType.NODE_LOCAL)) { - allowedLocalityLevel.put(schedulerKey, NodeType.RACK_LOCAL); - resetSchedulingOpportunities(schedulerKey); - } - else if (allowed.equals(NodeType.RACK_LOCAL)) { - allowedLocalityLevel.put(schedulerKey, NodeType.OFF_SWITCH); - resetSchedulingOpportunities(schedulerKey); + // Relax locality constraints once we've surpassed threshold. 
+ if (getSchedulingOpportunities(schedulerKey) > (numNodes * threshold)) { + if (allowed.equals(NodeType.NODE_LOCAL)) { + allowedLocalityLevel.put(schedulerKey, NodeType.RACK_LOCAL); + resetSchedulingOpportunities(schedulerKey); + } else if (allowed.equals(NodeType.RACK_LOCAL)) { + allowedLocalityLevel.put(schedulerKey, NodeType.OFF_SWITCH); + resetSchedulingOpportunities(schedulerKey); + } } + return allowedLocalityLevel.get(schedulerKey); + } finally { + writeLock.unlock(); } - return allowedLocalityLevel.get(schedulerKey); } /** @@ -311,120 +315,122 @@ else if (allowed.equals(NodeType.RACK_LOCAL)) { * @param currentTimeMs currentTimeMs * @return NodeType */ - public synchronized NodeType getAllowedLocalityLevelByTime( + public NodeType getAllowedLocalityLevelByTime( SchedulerRequestKey schedulerKey, long nodeLocalityDelayMs, long rackLocalityDelayMs, long currentTimeMs) { + try { + writeLock.lock(); - // if not being used, can schedule anywhere - if (nodeLocalityDelayMs < 0 || rackLocalityDelayMs < 0) { - return NodeType.OFF_SWITCH; - } - - // default level is NODE_LOCAL - if (!allowedLocalityLevel.containsKey(schedulerKey)) { - // add the initial time of priority to prevent comparing with FsApp - // startTime and allowedLocalityLevel degrade - lastScheduledContainer.put(schedulerKey, currentTimeMs); - if (LOG.isDebugEnabled()) { - LOG.debug("Init the lastScheduledContainer time, priority: " - + schedulerKey.getPriority() + ", time: " + currentTimeMs); + // if not being used, can schedule anywhere + if (nodeLocalityDelayMs < 0 || rackLocalityDelayMs < 0) { + return NodeType.OFF_SWITCH; } - allowedLocalityLevel.put(schedulerKey, NodeType.NODE_LOCAL); - return NodeType.NODE_LOCAL; - } - NodeType allowed = allowedLocalityLevel.get(schedulerKey); + // default level is NODE_LOCAL + if (!allowedLocalityLevel.containsKey(schedulerKey)) { + // add the initial time of priority to prevent comparing with FsApp + // startTime and allowedLocalityLevel degrade + lastScheduledContainer.put(schedulerKey, currentTimeMs); + if (LOG.isDebugEnabled()) { + LOG.debug( + "Init the lastScheduledContainer time, priority: " + schedulerKey.getPriority() + ", time: " + currentTimeMs); + } + allowedLocalityLevel.put(schedulerKey, NodeType.NODE_LOCAL); + return NodeType.NODE_LOCAL; + } - // if level is already most liberal, we're done - if (allowed.equals(NodeType.OFF_SWITCH)) { - return NodeType.OFF_SWITCH; - } + NodeType allowed = allowedLocalityLevel.get(schedulerKey); - // check waiting time - long waitTime = currentTimeMs; - if (lastScheduledContainer.containsKey(schedulerKey)) { - waitTime -= lastScheduledContainer.get(schedulerKey); - } else { - waitTime -= getStartTime(); - } + // if level is already most liberal, we're done + if (allowed.equals(NodeType.OFF_SWITCH)) { + return NodeType.OFF_SWITCH; + } - long thresholdTime = allowed.equals(NodeType.NODE_LOCAL) ? 
- nodeLocalityDelayMs : rackLocalityDelayMs; + // check waiting time + long waitTime = currentTimeMs; + if (lastScheduledContainer.containsKey(schedulerKey)) { + waitTime -= lastScheduledContainer.get(schedulerKey); + } else { + waitTime -= getStartTime(); + } - if (waitTime > thresholdTime) { - if (allowed.equals(NodeType.NODE_LOCAL)) { - allowedLocalityLevel.put(schedulerKey, NodeType.RACK_LOCAL); - resetSchedulingOpportunities(schedulerKey, currentTimeMs); - } else if (allowed.equals(NodeType.RACK_LOCAL)) { - allowedLocalityLevel.put(schedulerKey, NodeType.OFF_SWITCH); - resetSchedulingOpportunities(schedulerKey, currentTimeMs); + long thresholdTime = allowed.equals(NodeType.NODE_LOCAL) ? + nodeLocalityDelayMs : + rackLocalityDelayMs; + + if (waitTime > thresholdTime) { + if (allowed.equals(NodeType.NODE_LOCAL)) { + allowedLocalityLevel.put(schedulerKey, NodeType.RACK_LOCAL); + resetSchedulingOpportunities(schedulerKey, currentTimeMs); + } else if (allowed.equals(NodeType.RACK_LOCAL)) { + allowedLocalityLevel.put(schedulerKey, NodeType.OFF_SWITCH); + resetSchedulingOpportunities(schedulerKey, currentTimeMs); + } } + return allowedLocalityLevel.get(schedulerKey); + } finally { + writeLock.unlock(); } - return allowedLocalityLevel.get(schedulerKey); } - synchronized public RMContainer allocate(NodeType type, FSSchedulerNode node, + public RMContainer allocate(NodeType type, FSSchedulerNode node, SchedulerRequestKey schedulerKey, ResourceRequest request, Container reservedContainer) { - // Update allowed locality level - NodeType allowed = allowedLocalityLevel.get(schedulerKey); - if (allowed != null) { - if (allowed.equals(NodeType.OFF_SWITCH) && - (type.equals(NodeType.NODE_LOCAL) || - type.equals(NodeType.RACK_LOCAL))) { - this.resetAllowedLocalityLevel(schedulerKey, type); - } - else if (allowed.equals(NodeType.RACK_LOCAL) && - type.equals(NodeType.NODE_LOCAL)) { - this.resetAllowedLocalityLevel(schedulerKey, type); + try { + writeLock.lock(); + // Update allowed locality level + NodeType allowed = allowedLocalityLevel.get(schedulerKey); + if (allowed != null) { + if (allowed.equals(NodeType.OFF_SWITCH) && (type.equals(NodeType.NODE_LOCAL) || type.equals(NodeType.RACK_LOCAL))) { + this.resetAllowedLocalityLevel(schedulerKey, type); + } else if (allowed.equals(NodeType.RACK_LOCAL) && type.equals(NodeType.NODE_LOCAL)) { + this.resetAllowedLocalityLevel(schedulerKey, type); + } } - } - // Required sanity check - AM can call 'allocate' to update resource - // request without locking the scheduler, hence we need to check - if (getTotalRequiredResources(schedulerKey) <= 0) { - return null; - } + // Required sanity check - AM can call 'allocate' to update resource + // request without locking the scheduler, hence we need to check + if (getTotalRequiredResources(schedulerKey) <= 0) { + return null; + } - Container container = reservedContainer; - if (container == null) { - container = - createContainer(node, request.getCapability(), schedulerKey); - } - - // Create RMContainer - RMContainer rmContainer = new RMContainerImpl(container, - getApplicationAttemptId(), node.getNodeID(), - appSchedulingInfo.getUser(), rmContext); - ((RMContainerImpl)rmContainer).setQueueName(this.getQueueName()); + Container container = reservedContainer; + if (container == null) { + container = createContainer(node, request.getCapability(), schedulerKey); + } - // Add it to allContainers list. 
- newlyAllocatedContainers.add(rmContainer); - liveContainers.put(container.getId(), rmContainer); + // Create RMContainer + RMContainer rmContainer = new RMContainerImpl(container, + getApplicationAttemptId(), node.getNodeID(), appSchedulingInfo.getUser(), rmContext); + ((RMContainerImpl) rmContainer).setQueueName(this.getQueueName()); - // Update consumption and track allocations - List resourceRequestList = appSchedulingInfo.allocate( - type, node, schedulerKey, request, container); - this.attemptResourceUsage.incUsed(container.getResource()); + // Add it to allContainers list. + newlyAllocatedContainers.add(rmContainer); + liveContainers.put(container.getId(), rmContainer); - // Update resource requests related to "request" and store in RMContainer - ((RMContainerImpl) rmContainer).setResourceRequests(resourceRequestList); + // Update consumption and track allocations + List resourceRequestList = appSchedulingInfo.allocate( + type, node, schedulerKey, request, container); + this.attemptResourceUsage.incUsed(container.getResource()); - // Inform the container - rmContainer.handle( - new RMContainerEvent(container.getId(), RMContainerEventType.START)); + // Update resource requests related to "request" and store in RMContainer + ((RMContainerImpl) rmContainer).setResourceRequests(resourceRequestList); - if (LOG.isDebugEnabled()) { - LOG.debug("allocate: applicationAttemptId=" - + container.getId().getApplicationAttemptId() - + " container=" + container.getId() + " host=" - + container.getNodeId().getHost() + " type=" + type); + // Inform the container + rmContainer.handle( + new RMContainerEvent(container.getId(), RMContainerEventType.START)); + + if (LOG.isDebugEnabled()) { + LOG.debug("allocate: applicationAttemptId=" + container.getId().getApplicationAttemptId() + + " container=" + container.getId() + " host=" + container.getNodeId().getHost() + " type=" + type); + } + RMAuditLogger.logSuccess(getUser(), AuditConstants.ALLOC_CONTAINER, + "SchedulerApp", getApplicationId(), container.getId(), container.getResource()); + + return rmContainer; + } finally { + writeLock.unlock(); } - RMAuditLogger.logSuccess(getUser(), - AuditConstants.ALLOC_CONTAINER, "SchedulerApp", - getApplicationId(), container.getId(), container.getResource()); - - return rmContainer; } /** @@ -434,12 +440,17 @@ else if (allowed.equals(NodeType.RACK_LOCAL) && * @param schedulerKey Scheduler Key * @param level NodeType */ - public synchronized void resetAllowedLocalityLevel( + public void resetAllowedLocalityLevel( SchedulerRequestKey schedulerKey, NodeType level) { - NodeType old = allowedLocalityLevel.get(schedulerKey); - LOG.info("Raising locality level from " + old + " to " + level + " at " + - " priority " + schedulerKey.getPriority()); - allowedLocalityLevel.put(schedulerKey, level); + try { + writeLock.lock(); + NodeType old = allowedLocalityLevel.get(schedulerKey); + LOG.info("Raising locality level from " + old + " to " + level + " at " + + " priority " + schedulerKey.getPriority()); + allowedLocalityLevel.put(schedulerKey, level); + } finally { + writeLock.unlock(); + } } // related methods @@ -584,21 +595,31 @@ public void unreserve(SchedulerRequestKey schedulerKey, getUser(), rmContainer.getContainer().getResource()); } - private synchronized void setReservation(SchedulerNode node) { - String rackName = node.getRackName() == null ? 
"NULL" : node.getRackName(); - Set rackReservations = reservations.get(rackName); - if (rackReservations == null) { - rackReservations = new HashSet<>(); - reservations.put(rackName, rackReservations); + private void setReservation(SchedulerNode node) { + try { + writeLock.lock(); + String rackName = node.getRackName() == null ? "NULL" : node.getRackName(); + Set rackReservations = reservations.get(rackName); + if (rackReservations == null) { + rackReservations = new HashSet<>(); + reservations.put(rackName, rackReservations); + } + rackReservations.add(node.getNodeName()); + } finally { + writeLock.unlock(); } - rackReservations.add(node.getNodeName()); } - private synchronized void clearReservation(SchedulerNode node) { - String rackName = node.getRackName() == null ? "NULL" : node.getRackName(); - Set rackReservations = reservations.get(rackName); - if (rackReservations != null) { - rackReservations.remove(node.getNodeName()); + private void clearReservation(SchedulerNode node) { + try { + writeLock.lock(); + String rackName = node.getRackName() == null ? "NULL" : node.getRackName(); + Set rackReservations = reservations.get(rackName); + if (rackReservations != null) { + rackReservations.remove(node.getNodeName()); + } + } finally { + writeLock.unlock(); } } @@ -737,7 +758,8 @@ private Resource assignContainer(FSSchedulerNode node, boolean reserved) { // For each priority, see if we can schedule a node local, rack local // or off-switch request. Rack of off-switch requests may be delayed // (not scheduled) in order to promote better locality. - synchronized (this) { + try { + writeLock.lock(); for (SchedulerRequestKey schedulerKey : keysToTry) { // Skip it for reserved container, since // we already check it in isValidReservation. @@ -803,6 +825,8 @@ private Resource assignContainer(FSSchedulerNode node, boolean reserved) { } } } + } finally { + writeLock.unlock(); } return Resources.none(); } @@ -963,7 +987,8 @@ public void updateDemand() { Resources.addTo(demand, getCurrentConsumption()); // Add up outstanding resource requests - synchronized (this) { + try { + writeLock.lock(); for (SchedulerRequestKey k : getSchedulerKeys()) { ResourceRequest r = getResourceRequest(k, ResourceRequest.ANY); if (r != null) { @@ -971,6 +996,8 @@ public void updateDemand() { r.getCapability(), r.getNumContainers()); } } + } finally { + writeLock.unlock(); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index ac384a1..ca3101e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -824,7 +824,7 @@ private synchronized void removeApplicationAttempt( RMContainerEventType.KILL); } // Clean up pending requests, metrics etc. 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index ac384a1..ca3101e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -824,7 +824,7 @@ private synchronized void removeApplicationAttempt( RMContainerEventType.KILL); } // Clean up pending requests, metrics etc. - attempt.stop(rmAppAttemptFinalState); + attempt.stop(); // Inform the queue FSLeafQueue queue = queueMgr.getLeafQueue(attempt.getQueue()
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java index 2863a97..35aa157 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java @@ -486,7 +486,7 @@ private synchronized void doneApplicationAttempt( } // Clean up pending requests, metrics etc. - attempt.stop(rmAppAttemptFinalState); + attempt.stop(); } /**
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java index a9f1f63..b389b6b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java @@ -449,7 +449,7 @@ private void checkCSLeafQueue(MockRM rm, assertEquals(absoluteUsedCapacity, leafQueue.getAbsoluteUsedCapacity(), 1e-8); // assert user consumed resources.
- assertEquals(usedResource, leafQueue.getUser(app.getUser()) + assertEquals(usedResource, leafQueue.getOrDefault(app.getUser()) .getUsed()); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java index 3d3f1ea..84cee0f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java @@ -248,7 +248,7 @@ public Integer answer(InvocationOnMock invocation) throws Throwable { .isEmpty()) { LeafQueue queue = (LeafQueue) nameToCSQueues.get(queueName); Map> ignoreExclusivityContainers = - queue.getIgnoreExclusivityRMContainers(); + queue.getCopyOfIgnoreExclusivityRMContainers(); if (!ignoreExclusivityContainers.containsKey(partition)) { ignoreExclusivityContainers.put(partition, new TreeSet()); @@ -450,7 +450,7 @@ public Object answer(InvocationOnMock invocation) { Map> ignorePartitionContainers = new HashMap<>(); - when(leafQueue.getIgnoreExclusivityRMContainers()).thenReturn( + when(leafQueue.getCopyOfIgnoreExclusivityRMContainers()).thenReturn( ignorePartitionContainers); queue = leafQueue; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyMockFramework.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyMockFramework.java index 07d1eef..517c397 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyMockFramework.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyMockFramework.java @@ -110,7 +110,7 @@ public void testBuilder() throws Exception { // Check ignored partitioned containers in queue Assert.assertEquals(100, ((LeafQueue) cs.getQueue("a1")) - .getIgnoreExclusivityRMContainers().get("blue").size()); + .getCopyOfIgnoreExclusivityRMContainers().get("blue").size()); // Check applications Assert.assertEquals(2, ((LeafQueue)cs.getQueue("a1")).getApplications().size()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSGlobalScheduling.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSGlobalScheduling.java new file mode 
100644 index 0000000..803788f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSGlobalScheduling.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; + +import org.apache.hadoop.yarn.api.records.ResourceRequest; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.resourcemanager.MockAM; +import org.apache.hadoop.yarn.server.resourcemanager.MockNM; +import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager; +import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; +import org.apache.hadoop.yarn.server.resourcemanager.resource.Priority; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; +import org.apache.hadoop.yarn.util.resource.Resources; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.Arrays; + +// TODO: write more tests here.
+public class TestCSGlobalScheduling { + private final int GB = 1024; + + private YarnConfiguration conf; + + RMNodeLabelsManager mgr; + + @Before + public void setUp() throws Exception { + conf = new YarnConfiguration(); + conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, + ResourceScheduler.class); + mgr = new NullRMNodeLabelsManager(); + mgr.init(conf); + } + + @Test + public void testSimpleGlobalScheduling() throws Exception { + // enable global scheduling for the capacity scheduler + conf.setBoolean(CapacitySchedulerConfiguration.SCHEDULE_GLOBALLY_ENABLE, true); + + MockRM rm1 = new MockRM(conf) { + @Override + public RMNodeLabelsManager createNodeLabelManager() { + return mgr; + } + }; + + rm1.getRMContext().setNodeLabelManager(mgr); + rm1.start(); + CapacityScheduler cs = + (CapacityScheduler) rm1.getRMContext().getScheduler(); + + // no node labels are configured on these nodes in this test + MockNM nm1 = rm1.registerNode("h1:1234", 8000); + MockNM nm2 = rm1.registerNode("h2:1234", 8000); + MockNM nm3 = rm1.registerNode("h3:1234", 8000); + + // launch an app on nm3, then post a mix of node-local (h1, h2), + // rack-local and off-switch requests and check they all get allocated + RMApp app1 = rm1.submitApp(200, "app", "user", null); + MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm3); + + am1.allocate(Arrays.asList(ResourceRequest + .newInstance(Priority.create(0), "*", Resources.createResource(1024), + 3), ResourceRequest + .newInstance(Priority.create(1), "h1", Resources.createResource(1024), + 3), ResourceRequest + .newInstance(Priority.create(1), "h2", Resources.createResource(1024), + 1), ResourceRequest.newInstance(Priority.create(1), "/default-rack", + Resources.createResource(1024), 4), ResourceRequest + .newInstance(Priority.create(1), "*", Resources.createResource(1024), + 4)), null); + + + // request a container.
+ // am1.allocate("*", 1024, 1, new ArrayList()); + + Thread.sleep(10000); + Assert.assertEquals(8192, + cs.getClusterResourceUsage().getUsed().getMemorySize()); + + + rm1.close(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java index 09c16d0..20130f8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java @@ -842,7 +842,7 @@ public void testAllocateReorder() throws Exception { null, null, null, null); //And this will result in container assignment for app1 - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); //Verify that app1 is still first in assignment order //This happens because app2 has no demand/a magnitude of NaN, which @@ -1046,7 +1046,7 @@ public void testAsyncScheduling() throws Exception { // Now directly exercise the scheduling loop for (int i=0; i < NODES; ++i) { - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); } } @@ -2951,7 +2951,7 @@ public void testAMUsedResource() throws Exception { LeafQueue queueA = (LeafQueue) ((CapacityScheduler) scheduler).getQueue(queueName); assertEquals("Minimum Resource for AM is incorrect", minAllocResource, - queueA.getUser("user_0").getResourceUsage().getAMUsed()); + queueA.getOrDefault("user_0").getResourceUsage().getAMUsed()); rm.stop(); } @@ -3305,9 +3305,9 @@ private void verifyAMLimitForLeafQueue(CapacitySchedulerConfiguration config) queueA.getNumActiveApplications()); Assert.assertEquals("User PendingApplications should be 1", 1, queueA - .getUser(userName).getPendingApplications()); + .getOrDefault(userName).getPendingApplications()); Assert.assertEquals("User Active applications should be 1", 1, queueA - .getUser(userName).getActiveApplications()); + .getOrDefault(userName).getActiveApplications()); rm.stop(); } @@ -3511,7 +3511,7 @@ public void testCSReservationWithRootUnblocked() throws Exception { Collections.singletonList(y1Req), Collections.emptyList(), null, null, null, null); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); } assertEquals("Y1 Used Resource should be 4 GB", 4 * GB, cs.getQueue("y1").getUsedResources().getMemorySize()); @@ -3525,7 +3525,7 @@ public void testCSReservationWithRootUnblocked() throws Exception { Collections.singletonList(x1Req), Collections.emptyList(), null, null, null, null); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); } assertEquals("X1 Used Resource should be 7 GB", 7 * GB, cs.getQueue("x1").getUsedResources().getMemorySize()); @@ -3538,7 +3538,7 @@ public void testCSReservationWithRootUnblocked() throws Exception { Collections.singletonList(x2Req), Collections.emptyList(), null, null, null, null); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); assertEquals("X2 Used Resource should be 
0", 0, cs.getQueue("x2").getUsedResources().getMemorySize()); assertEquals("P1 Used Resource should be 7 GB", 7 * GB, @@ -3550,7 +3550,7 @@ public void testCSReservationWithRootUnblocked() throws Exception { Collections.singletonList(x1Req), Collections.emptyList(), null, null, null, null); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); assertEquals("X1 Used Resource should be 7 GB", 7 * GB, cs.getQueue("x1").getUsedResources().getMemorySize()); assertEquals("P1 Used Resource should be 7 GB", 7 * GB, @@ -3564,7 +3564,7 @@ public void testCSReservationWithRootUnblocked() throws Exception { Collections.singletonList(y1Req), Collections.emptyList(), null, null, null, null); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); } assertEquals("P2 Used Resource should be 8 GB", 8 * GB, cs.getQueue("p2").getUsedResources().getMemorySize()); @@ -3574,7 +3574,7 @@ public void testCSReservationWithRootUnblocked() throws Exception { cs.handle(new ContainerExpiredSchedulerEvent(containerId)); //Schedule pending request - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); assertEquals("X2 Used Resource should be 2 GB", 2 * GB, cs.getQueue("x2").getUsedResources().getMemorySize()); assertEquals("P1 Used Resource should be 8 GB", 8 * GB, @@ -3623,7 +3623,7 @@ public void testCSQueueBlocked() throws Exception { cs.allocate(appAttemptId1, Collections.singletonList(r1), Collections.emptyList(), null, null, null, null).getContainers().size(); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); ResourceRequest r2 = null; for (int i =0; i < 13; i++) { r2 = TestUtils.createResourceRequest( @@ -3632,7 +3632,7 @@ public void testCSQueueBlocked() throws Exception { Collections.singletonList(r2), Collections.emptyList(), null, null, null, null); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); } assertEquals("A Used Resource should be 2 GB", 2 * GB, cs.getQueue("a").getUsedResources().getMemorySize()); @@ -3645,11 +3645,11 @@ public void testCSQueueBlocked() throws Exception { cs.allocate(appAttemptId1, Collections.singletonList(r1), Collections.emptyList(), null, null, null, null).getContainers().size(); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); cs.allocate(appAttemptId2, Collections.singletonList(r2), Collections.emptyList(), null, null, null, null); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); //Check blocked Resource assertEquals("A Used Resource should be 2 GB", 2 * GB, cs.getQueue("a").getUsedResources().getMemorySize()); @@ -3661,10 +3661,10 @@ public void testCSQueueBlocked() throws Exception { cs.handle(new ContainerExpiredSchedulerEvent(containerId1)); rm.drainEvents(); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); cs.handle(new ContainerExpiredSchedulerEvent(containerId2)); - CapacityScheduler.schedule(cs); + CapacityScheduler.AsyncScheduleThread.asyncSchedule(cs); rm.drainEvents(); assertEquals("A Used Resource should be 4 GB", 4 * GB, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNodeLabelUpdate.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNodeLabelUpdate.java index 9aef77c..eaddad7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNodeLabelUpdate.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNodeLabelUpdate.java @@ -135,7 +135,7 @@ private void checkUserUsedResource(MockRM rm, String queueName, String userName, String partition, int memory) { CapacityScheduler scheduler = (CapacityScheduler) rm.getResourceScheduler(); LeafQueue queue = (LeafQueue) scheduler.getQueue(queueName); - LeafQueue.User user = queue.getUser(userName); + LeafQueue.User user = queue.getOrDefault(userName); Assert.assertEquals(memory, user.getResourceUsage().getUsed(partition).getMemorySize()); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java index 499e041..1d83827 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java @@ -294,7 +294,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 9 * GB, null); Assert.assertEquals(9 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(3 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize()); Assert.assertEquals(6 * GB, @@ -319,7 +319,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 7 * GB, null); Assert.assertEquals(7 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(7 * GB, @@ -394,7 +394,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will *NOT* be updated checkUsedResource(rm1, "default", 3 * GB, null); Assert.assertEquals(3 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(3 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize()); Assert.assertEquals(0 * GB, @@ -475,7 +475,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 9 * GB, null); Assert.assertEquals(9 * GB, ((LeafQueue) 
cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(3 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize()); Assert.assertEquals(6 * GB, @@ -505,7 +505,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 1 * GB, null); Assert.assertEquals(1 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(1 * GB, @@ -587,7 +587,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 10 * GB, null); Assert.assertEquals(10 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(4 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize()); Assert.assertEquals(6 * GB, @@ -615,7 +615,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 1 * GB, null); Assert.assertEquals(1 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(1 * GB, @@ -696,7 +696,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 9 * GB, null); Assert.assertEquals(9 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(3 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize()); Assert.assertEquals(6 * GB, @@ -717,7 +717,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 1 * GB, null); Assert.assertEquals(1 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(1 * GB, @@ -793,7 +793,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 9 * GB, null); Assert.assertEquals(9 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(3 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize()); Assert.assertEquals(6 * GB, @@ -815,7 +815,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 0 * GB, null); Assert.assertEquals(0 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(0 * GB, @@ -920,7 +920,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be 
updated checkUsedResource(rm1, "default", 10 * GB, null); Assert.assertEquals(10 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(10 * GB, @@ -1001,7 +1001,7 @@ public RMNodeLabelsManager createNodeLabelManager() { // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 10 * GB, null); Assert.assertEquals(10 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + .getOrDefault("user").getUsed().getMemorySize()); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(10 * GB, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java index 274c063..295b511 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java @@ -412,7 +412,7 @@ public void testAppAttemptMetrics() throws Exception { assertEquals(1, a.getMetrics().getAppsSubmitted()); assertEquals(1, a.getMetrics().getAppsPending()); - assertEquals(1, a.getUser(user_0).getActiveApplications()); + assertEquals(1, a.getOrDefault(user_0).getActiveApplications()); assertEquals(app_1.getAMResource().getMemorySize(), a.getMetrics() .getUsedAMResourceMB()); assertEquals(app_1.getAMResource().getVirtualCores(), a.getMetrics() diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestNodeLabelContainerAllocation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestNodeLabelContainerAllocation.java index 9070577..8b1140c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestNodeLabelContainerAllocation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestNodeLabelContainerAllocation.java @@ -466,7 +466,7 @@ public RMNodeLabelsManager createNodeLabelManager() { rm1.close(); } - @Test (timeout = 120000) + @Test (timeout = 240000) public void testContainerReservationWithLabels() throws Exception { // This test is pretty much similar to testContainerAllocateWithLabel. 
// Difference is, this test doesn't specify label expression in @@ -731,13 +731,22 @@ public RMNodeLabelsManager createNodeLabelManager() { private void checkNumOfContainersInAnAppOnGivenNode(int expectedNum, NodeId nodeId, FiCaSchedulerApp app) { - int num = 0; - for (RMContainer container : app.getLiveContainers()) { - if (container.getAllocatedNode().equals(nodeId)) { - num++; + long start = System.currentTimeMillis(); + while (true) { + int num = 0; + for (RMContainer container : app.getLiveContainers()) { + if (container.getAllocatedNode().equals(nodeId)) { + num++; + } + } + if (expectedNum == num) { + return; + } else { + if (System.currentTimeMillis() - start > 2000) { + Assert.assertEquals(expectedNum, num); + } } } - Assert.assertEquals(expectedNum, num); } @Test @@ -1218,16 +1227,16 @@ public RMNodeLabelsManager createNodeLabelManager() { // check non-exclusive containers of LeafQueue is correctly updated LeafQueue leafQueue = (LeafQueue) cs.getQueue("a"); - Assert.assertFalse(leafQueue.getIgnoreExclusivityRMContainers().containsKey( + Assert.assertFalse(leafQueue.getCopyOfIgnoreExclusivityRMContainers().containsKey( "y")); Assert.assertEquals(10, - leafQueue.getIgnoreExclusivityRMContainers().get("x").size()); + leafQueue.getCopyOfIgnoreExclusivityRMContainers().get("x").size()); // completes all containers of app1, ignoreExclusivityRMContainers should be // updated as well. cs.handle(new AppAttemptRemovedSchedulerEvent( am1.getApplicationAttemptId(), RMAppAttemptState.FINISHED, false)); - Assert.assertFalse(leafQueue.getIgnoreExclusivityRMContainers().containsKey( + Assert.assertFalse(leafQueue.getCopyOfIgnoreExclusivityRMContainers().containsKey( "x")); rm1.close();
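Because allocation now happens on an asynchronous scheduling thread, checkNumOfContainersInAnAppOnGivenNode above retries until a two-second deadline instead of asserting once; note that as written the retry loop spins without sleeping between attempts. A generic polling helper of the kind such tests typically use, with a short backoff added (hypothetical names; Hadoop's own GenericTestUtils.waitFor offers a similar service):

import java.util.function.BooleanSupplier;

// Poll a condition until it holds or the deadline passes; a short sleep
// between attempts keeps the busy-wait from pegging a CPU core.
public final class WaitFor {
  private WaitFor() {
  }

  public static boolean waitFor(BooleanSupplier condition, long timeoutMs)
      throws InterruptedException {
    long deadline = System.currentTimeMillis() + timeoutMs;
    while (System.currentTimeMillis() < deadline) {
      if (condition.getAsBoolean()) {
        return true;
      }
      Thread.sleep(10); // back off briefly instead of spinning
    }
    return condition.getAsBoolean(); // one final check at the deadline
  }

  public static void main(String[] args) throws InterruptedException {
    long start = System.currentTimeMillis();
    boolean ok = waitFor(() -> System.currentTimeMillis() - start > 50, 2000);
    System.out.println(ok); // true once ~50ms have elapsed
  }
}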